1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #if KMP_USE_HIER_SCHED
28 #include "kmp_dispatch_hier.h"
29 #endif
30 
31 #if OMPT_SUPPORT
32 #include "ompt-specific.h"
33 #endif
34 
35 #if OMP_PROFILING_SUPPORT
36 #include "llvm/Support/TimeProfiler.h"
37 static char *ProfileTraceFile = nullptr;
38 #endif
39 
40 /* these are temporary issues to be dealt with */
41 #define KMP_USE_PRCTL 0
42 
43 #if KMP_OS_WINDOWS
44 #include <process.h>
45 #endif
46 
47 #include "tsan_annotations.h"
48 
49 #if KMP_OS_WINDOWS
// Windows does not need these include files as it does not use shared memory.
51 #else
52 #include <sys/mman.h>
53 #include <sys/stat.h>
54 #include <fcntl.h>
55 #define SHM_SIZE 1024
56 #endif
57 
58 #if defined(KMP_GOMP_COMPAT)
59 char const __kmp_version_alt_comp[] =
60     KMP_VERSION_PREFIX "alternative compiler support: yes";
61 #endif /* defined(KMP_GOMP_COMPAT) */
62 
63 char const __kmp_version_omp_api[] =
64     KMP_VERSION_PREFIX "API version: 5.0 (201611)";
65 
66 #ifdef KMP_DEBUG
67 char const __kmp_version_lock[] =
68     KMP_VERSION_PREFIX "lock type: run time selectable";
69 #endif /* KMP_DEBUG */
70 
71 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
72 
73 /* ------------------------------------------------------------------------ */
74 
75 #if KMP_USE_MONITOR
76 kmp_info_t __kmp_monitor;
77 #endif
78 
79 /* Forward declarations */
80 
81 void __kmp_cleanup(void);
82 
83 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
84                                   int gtid);
85 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
86                                   kmp_internal_control_t *new_icvs,
87                                   ident_t *loc);
88 #if KMP_AFFINITY_SUPPORTED
89 static void __kmp_partition_places(kmp_team_t *team,
90                                    int update_master_only = 0);
91 #endif
92 static void __kmp_do_serial_initialize(void);
93 void __kmp_fork_barrier(int gtid, int tid);
94 void __kmp_join_barrier(int gtid);
95 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
96                           kmp_internal_control_t *new_icvs, ident_t *loc);
97 
98 #ifdef USE_LOAD_BALANCE
99 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
100 #endif
101 
102 static int __kmp_expand_threads(int nNeed);
103 #if KMP_OS_WINDOWS
104 static int __kmp_unregister_root_other_thread(int gtid);
105 #endif
106 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
107 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
108 
/* Calculate the identifier of the current thread.
   This is a fast (and somewhat portable) way to get a unique identifier for
   the executing thread. Returns KMP_GTID_DNE if we haven't been assigned a
   gtid. */
112 int __kmp_get_global_thread_id() {
113   int i;
114   kmp_info_t **other_threads;
115   size_t stack_data;
116   char *stack_addr;
117   size_t stack_size;
118   char *stack_base;
119 
120   KA_TRACE(
121       1000,
122       ("*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
123        __kmp_nth, __kmp_all_nth));
124 
  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior
     to a parallel region, we made this return KMP_GTID_DNE to force
     serial_initialize by the caller. This requires handling KMP_GTID_DNE at
     all call sites, or else guaranteeing __kmp_init_gtid for this to work. */
129 
130   if (!TCR_4(__kmp_init_gtid))
131     return KMP_GTID_DNE;
132 
133 #ifdef KMP_TDATA_GTID
134   if (TCR_4(__kmp_gtid_mode) >= 3) {
135     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
136     return __kmp_gtid;
137   }
138 #endif
139   if (TCR_4(__kmp_gtid_mode) >= 2) {
140     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
141     return __kmp_gtid_get_specific();
142   }
143   KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
144 
145   stack_addr = (char *)&stack_data;
146   other_threads = __kmp_threads;
147 
148   /* ATT: The code below is a source of potential bugs due to unsynchronized
149      access to __kmp_threads array. For example:
150      1. Current thread loads other_threads[i] to thr and checks it, it is
151         non-NULL.
152      2. Current thread is suspended by OS.
153      3. Another thread unregisters and finishes (debug versions of free()
154         may fill memory with something like 0xEF).
155      4. Current thread is resumed.
156      5. Current thread reads junk from *thr.
157      TODO: Fix it.  --ln  */
158 
159   for (i = 0; i < __kmp_threads_capacity; i++) {
160 
161     kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
162     if (!thr)
163       continue;
164 
165     stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
166     stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
167 
168     /* stack grows down -- search through all of the active threads */
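    /* Illustrative sketch (hypothetical addresses): with
       stack_base == 0x7fff0000 and stack_size == 0x10000, any local address
       in [0x7ffe0000, 0x7fff0000] satisfies the check below and identifies
       the executing thread as thread i. */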
169 
170     if (stack_addr <= stack_base) {
171       size_t stack_diff = stack_base - stack_addr;
172 
173       if (stack_diff <= stack_size) {
174         /* The only way we can be closer than the allocated */
175         /* stack size is if we are running on this thread. */
176         KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
177         return i;
178       }
179     }
180   }
181 
  /* fall back to keyed TLS (get_specific) to try to determine our gtid */
183   KA_TRACE(1000,
184            ("*** __kmp_get_global_thread_id: internal alg. failed to find "
185             "thread, using TLS\n"));
186   i = __kmp_gtid_get_specific();
187 
188   /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */
189 
  /* if we haven't been assigned a gtid, return the (negative) error code */
191   if (i < 0)
192     return i;
193 
194   /* dynamically updated stack window for uber threads to avoid get_specific
195      call */
196   if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
197     KMP_FATAL(StackOverflow, i);
198   }
199 
200   stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
201   if (stack_addr > stack_base) {
202     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
203     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
204             other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
205                 stack_base);
206   } else {
207     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
208             stack_base - stack_addr);
209   }
210 
211   /* Reprint stack bounds for ubermaster since they have been refined */
212   if (__kmp_storage_map) {
213     char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
214     char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
215     __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
216                                  other_threads[i]->th.th_info.ds.ds_stacksize,
217                                  "th_%d stack (refinement)", i);
218   }
219   return i;
220 }
221 
222 int __kmp_get_global_thread_id_reg() {
223   int gtid;
224 
225   if (!__kmp_init_serial) {
226     gtid = KMP_GTID_DNE;
227   } else
228 #ifdef KMP_TDATA_GTID
229       if (TCR_4(__kmp_gtid_mode) >= 3) {
230     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
231     gtid = __kmp_gtid;
232   } else
233 #endif
234       if (TCR_4(__kmp_gtid_mode) >= 2) {
235     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
236     gtid = __kmp_gtid_get_specific();
237   } else {
238     KA_TRACE(1000,
239              ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
240     gtid = __kmp_get_global_thread_id();
241   }
242 
243   /* we must be a new uber master sibling thread */
244   if (gtid == KMP_GTID_DNE) {
245     KA_TRACE(10,
246              ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
247               "Registering a new gtid.\n"));
248     __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
249     if (!__kmp_init_serial) {
250       __kmp_do_serial_initialize();
251       gtid = __kmp_gtid_get_specific();
252     } else {
253       gtid = __kmp_register_root(FALSE);
254     }
255     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
256     /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
257   }
258 
259   KMP_DEBUG_ASSERT(gtid >= 0);
260 
261   return gtid;
262 }
263 
264 /* caller must hold forkjoin_lock */
265 void __kmp_check_stack_overlap(kmp_info_t *th) {
266   int f;
267   char *stack_beg = NULL;
268   char *stack_end = NULL;
269   int gtid;
270 
271   KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
272   if (__kmp_storage_map) {
273     stack_end = (char *)th->th.th_info.ds.ds_stackbase;
274     stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
275 
276     gtid = __kmp_gtid_from_thread(th);
277 
278     if (gtid == KMP_GTID_MONITOR) {
279       __kmp_print_storage_map_gtid(
280           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
281           "th_%s stack (%s)", "mon",
282           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
283     } else {
284       __kmp_print_storage_map_gtid(
285           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
286           "th_%d stack (%s)", gtid,
287           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
288     }
289   }
290 
291   /* No point in checking ubermaster threads since they use refinement and
292    * cannot overlap */
293   gtid = __kmp_gtid_from_thread(th);
294   if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
295     KA_TRACE(10,
296              ("__kmp_check_stack_overlap: performing extensive checking\n"));
297     if (stack_beg == NULL) {
298       stack_end = (char *)th->th.th_info.ds.ds_stackbase;
299       stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
300     }
301 
302     for (f = 0; f < __kmp_threads_capacity; f++) {
303       kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
304 
305       if (f_th && f_th != th) {
306         char *other_stack_end =
307             (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
308         char *other_stack_beg =
309             other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
310         if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
311             (stack_end > other_stack_beg && stack_end < other_stack_end)) {
312 
313           /* Print the other stack values before the abort */
314           if (__kmp_storage_map)
315             __kmp_print_storage_map_gtid(
316                 -1, other_stack_beg, other_stack_end,
317                 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
318                 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
319 
320           __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
321                       __kmp_msg_null);
322         }
323       }
324     }
325   }
326   KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
327 }
328 
329 /* ------------------------------------------------------------------------ */
330 
331 void __kmp_infinite_loop(void) {
332   static int done = FALSE;
333 
334   while (!done) {
335     KMP_YIELD(TRUE);
336   }
337 }
338 
339 #define MAX_MESSAGE 512
340 
341 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
342                                   char const *format, ...) {
343   char buffer[MAX_MESSAGE];
344   va_list ap;
345 
346   va_start(ap, format);
347   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
348                p2, (unsigned long)size, format);
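  /* Note: at this point 'buffer' is itself a printf-style format. For
     example, a hypothetical call
         __kmp_print_storage_map_gtid(5, p1, p2, 64, "th_%d stack", 5);
     yields buffer == "OMP storage map: <p1> <p2>      64 th_%d stack\n",
     and the caller's varargs (the trailing 5) are consumed by the
     __kmp_vprintf() call below. */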
349   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
350   __kmp_vprintf(kmp_err, buffer, ap);
351 #if KMP_PRINT_DATA_PLACEMENT
352   int node;
353   if (gtid >= 0) {
354     if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
355       if (__kmp_storage_map_verbose) {
356         node = __kmp_get_host_node(p1);
357         if (node < 0) /* doesn't work, so don't try this next time */
358           __kmp_storage_map_verbose = FALSE;
359         else {
360           char *last;
361           int lastNode;
362           int localProc = __kmp_get_cpu_from_gtid(gtid);
363 
364           const int page_size = KMP_GET_PAGE_SIZE();
365 
366           p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
367           p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
368           if (localProc >= 0)
369             __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
370                                  localProc >> 1);
371           else
372             __kmp_printf_no_lock("  GTID %d\n", gtid);
373 #if KMP_USE_PRCTL
374           /* The more elaborate format is disabled for now because of the prctl
375            * hanging bug. */
376           do {
377             last = p1;
378             lastNode = node;
379             /* This loop collates adjacent pages with the same host node. */
380             do {
381               (char *)p1 += page_size;
382             } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
383             __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
384                                  lastNode);
385           } while (p1 <= p2);
386 #else
387           __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
388                                (char *)p1 + (page_size - 1),
389                                __kmp_get_host_node(p1));
390           if (p1 < p2) {
391             __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
392                                  (char *)p2 + (page_size - 1),
393                                  __kmp_get_host_node(p2));
394           }
395 #endif
396         }
397       }
398     } else
399       __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
400   }
401 #endif /* KMP_PRINT_DATA_PLACEMENT */
  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
  va_end(ap);
}
404 
405 void __kmp_warn(char const *format, ...) {
406   char buffer[MAX_MESSAGE];
407   va_list ap;
408 
409   if (__kmp_generate_warnings == kmp_warnings_off) {
410     return;
411   }
412 
413   va_start(ap, format);
414 
415   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
416   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
417   __kmp_vprintf(kmp_err, buffer, ap);
418   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
419 
420   va_end(ap);
421 }
422 
423 void __kmp_abort_process() {
424   // Later threads may stall here, but that's ok because abort() will kill them.
425   __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
426 
427   if (__kmp_debug_buf) {
428     __kmp_dump_debug_buffer();
429   }
430 
431   if (KMP_OS_WINDOWS) {
432     // Let other threads know of abnormal termination and prevent deadlock
433     // if abort happened during library initialization or shutdown
434     __kmp_global.g.g_abort = SIGABRT;
435 
    /* On Windows* OS, by default abort() causes a pop-up error box, which
       stalls nightly testing. Unfortunately, we cannot reliably suppress the
       pop-up error boxes. _set_abort_behavior() works well, but this function
       is not available in VS7 (this is not a problem for the DLL, but it is a
       problem for the static OpenMP RTL). SetErrorMode (and so, the timelimit
       utility) does not help, at least in some versions of MS C RTL.

       It seems the following sequence is the only way to simulate abort() and
       avoid the pop-up error box. */
445     raise(SIGABRT);
446     _exit(3); // Just in case, if signal ignored, exit anyway.
447   } else {
448     __kmp_unregister_library();
449     abort();
450   }
451 
452   __kmp_infinite_loop();
453   __kmp_release_bootstrap_lock(&__kmp_exit_lock);
454 
455 } // __kmp_abort_process
456 
457 void __kmp_abort_thread(void) {
458   // TODO: Eliminate g_abort global variable and this function.
459   // In case of abort just call abort(), it will kill all the threads.
460   __kmp_infinite_loop();
461 } // __kmp_abort_thread
462 
463 /* Print out the storage map for the major kmp_info_t thread data structures
464    that are allocated together. */
465 
466 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
467   __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
468                                gtid);
469 
470   __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
471                                sizeof(kmp_desc_t), "th_%d.th_info", gtid);
472 
473   __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
474                                sizeof(kmp_local_t), "th_%d.th_local", gtid);
475 
476   __kmp_print_storage_map_gtid(
477       gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
478       sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
479 
480   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
481                                &thr->th.th_bar[bs_plain_barrier + 1],
482                                sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
483                                gtid);
484 
485   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
486                                &thr->th.th_bar[bs_forkjoin_barrier + 1],
487                                sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
488                                gtid);
489 
490 #if KMP_FAST_REDUCTION_BARRIER
491   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
492                                &thr->th.th_bar[bs_reduction_barrier + 1],
493                                sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
494                                gtid);
495 #endif // KMP_FAST_REDUCTION_BARRIER
496 }
497 
498 /* Print out the storage map for the major kmp_team_t team data structures
499    that are allocated together. */
500 
501 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
502                                          int team_id, int num_thr) {
503   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
504   __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
505                                header, team_id);
506 
507   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
508                                &team->t.t_bar[bs_last_barrier],
509                                sizeof(kmp_balign_team_t) * bs_last_barrier,
510                                "%s_%d.t_bar", header, team_id);
511 
512   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
513                                &team->t.t_bar[bs_plain_barrier + 1],
514                                sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
515                                header, team_id);
516 
517   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
518                                &team->t.t_bar[bs_forkjoin_barrier + 1],
519                                sizeof(kmp_balign_team_t),
520                                "%s_%d.t_bar[forkjoin]", header, team_id);
521 
522 #if KMP_FAST_REDUCTION_BARRIER
523   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
524                                &team->t.t_bar[bs_reduction_barrier + 1],
525                                sizeof(kmp_balign_team_t),
526                                "%s_%d.t_bar[reduction]", header, team_id);
527 #endif // KMP_FAST_REDUCTION_BARRIER
528 
529   __kmp_print_storage_map_gtid(
530       -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
531       sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
532 
533   __kmp_print_storage_map_gtid(
534       -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
535       sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
536 
537   __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
538                                &team->t.t_disp_buffer[num_disp_buff],
539                                sizeof(dispatch_shared_info_t) * num_disp_buff,
540                                "%s_%d.t_disp_buffer", header, team_id);
541 }
542 
543 static void __kmp_init_allocator() { __kmp_init_memkind(); }
544 static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
545 
546 /* ------------------------------------------------------------------------ */
547 
548 #if KMP_DYNAMIC_LIB
549 #if KMP_OS_WINDOWS
550 
551 static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
552   // TODO: Change to __kmp_break_bootstrap_lock().
553   __kmp_init_bootstrap_lock(lck); // make the lock released
554 }
555 
556 static void __kmp_reset_locks_on_process_detach(int gtid_req) {
557   int i;
558   int thread_count;
559 
  // PROCESS_DETACH is expected to be called by a thread that executes
  // ProcessExit() or FreeLibrary(). The OS terminates the other threads
  // (except the one calling ProcessExit or FreeLibrary), so it might seem
  // safe to access __kmp_threads[] without taking the forkjoin_lock.
  // In fact, however, some threads can still be alive here, although they are
  // about to be terminated. The entries in the array with ds_thread==0 are the
  // most suspicious ones. So it may actually be unsafe to access
  // __kmp_threads[].
567 
568   // TODO: does it make sense to check __kmp_roots[] ?
569 
  // Check that there are no other live threads registered with the OpenMP
  // library.
572   while (1) {
573     thread_count = 0;
574     for (i = 0; i < __kmp_threads_capacity; ++i) {
575       if (!__kmp_threads)
576         continue;
577       kmp_info_t *th = __kmp_threads[i];
578       if (th == NULL)
579         continue;
580       int gtid = th->th.th_info.ds.ds_gtid;
581       if (gtid == gtid_req)
582         continue;
583       if (gtid < 0)
584         continue;
585       DWORD exit_val;
586       int alive = __kmp_is_thread_alive(th, &exit_val);
587       if (alive) {
588         ++thread_count;
589       }
590     }
591     if (thread_count == 0)
592       break; // success
593   }
594 
595   // Assume that I'm alone. Now it might be safe to check and reset locks.
596   // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
597   __kmp_reset_lock(&__kmp_forkjoin_lock);
598 #ifdef KMP_DEBUG
599   __kmp_reset_lock(&__kmp_stdio_lock);
600 #endif // KMP_DEBUG
601 }
602 
603 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
604   //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
605 
606   switch (fdwReason) {
607 
608   case DLL_PROCESS_ATTACH:
609     KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
610 
611     return TRUE;
612 
613   case DLL_PROCESS_DETACH:
614     KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
615 
616     if (lpReserved != NULL) {
      // lpReserved tells the two cases apart:
      //   lpReserved == NULL when FreeLibrary() was called,
      //   lpReserved != NULL when the process terminates.
      // When FreeLibrary() is called, worker threads remain alive, so they
      // will release the forkjoin lock by themselves. When the process
      // terminates, worker threads disappear, triggering the problem of an
      // unreleased forkjoin lock as described below.
624 
      // A worker thread can take the forkjoin lock. The problem arises if
      // that worker thread dies before it releases the forkjoin lock. The
      // forkjoin lock then remains taken, while the thread executing
      // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below tries
      // to take the forkjoin lock and never succeeds, so the application
      // never finishes [normally]. This scenario is possible if __kmpc_end()
      // has not been executed. Far from being a corner case, it is common:
      // - the main function was compiled by an alternative compiler;
      // - the main function was compiled by icl but without /Qopenmp
      //   (application with plugins);
      // - the application terminates by calling C exit(), Fortran CALL EXIT()
      //   or Fortran STOP;
      // - a live foreign thread prevented __kmpc_end from doing cleanup.
639       //
640       // This is a hack to work around the problem.
641       // TODO: !!! figure out something better.
642       __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
643     }
644 
645     __kmp_internal_end_library(__kmp_gtid_get_specific());
646 
647     return TRUE;
648 
649   case DLL_THREAD_ATTACH:
650     KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
651 
    /* If we wanted to register new sibling threads on every thread attach, we
     * would call __kmp_get_gtid() here. */
654     return TRUE;
655 
656   case DLL_THREAD_DETACH:
657     KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
658 
659     __kmp_internal_end_thread(__kmp_gtid_get_specific());
660     return TRUE;
661   }
662 
663   return TRUE;
664 }
665 
666 #endif /* KMP_OS_WINDOWS */
667 #endif /* KMP_DYNAMIC_LIB */
668 
669 /* __kmp_parallel_deo -- Wait until it's our turn. */
670 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
671   int gtid = *gtid_ref;
672 #ifdef BUILD_PARALLEL_ORDERED
673   kmp_team_t *team = __kmp_team_from_gtid(gtid);
674 #endif /* BUILD_PARALLEL_ORDERED */
675 
676   if (__kmp_env_consistency_check) {
677     if (__kmp_threads[gtid]->th.th_root->r.r_active)
678 #if KMP_USE_DYNAMIC_LOCK
679       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
680 #else
681       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
682 #endif
683   }
684 #ifdef BUILD_PARALLEL_ORDERED
685   if (!team->t.t_serialized) {
686     KMP_MB();
687     KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
688              NULL);
689     KMP_MB();
690   }
691 #endif /* BUILD_PARALLEL_ORDERED */
692 }
693 
694 /* __kmp_parallel_dxo -- Signal the next task. */
695 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
696   int gtid = *gtid_ref;
697 #ifdef BUILD_PARALLEL_ORDERED
698   int tid = __kmp_tid_from_gtid(gtid);
699   kmp_team_t *team = __kmp_team_from_gtid(gtid);
700 #endif /* BUILD_PARALLEL_ORDERED */
701 
702   if (__kmp_env_consistency_check) {
703     if (__kmp_threads[gtid]->th.th_root->r.r_active)
704       __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
705   }
706 #ifdef BUILD_PARALLEL_ORDERED
707   if (!team->t.t_serialized) {
708     KMP_MB(); /* Flush all pending memory write invalidates.  */
709 
710     /* use the tid of the next thread in this team */
711     /* TODO replace with general release procedure */
712     team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
713 
714     KMP_MB(); /* Flush all pending memory write invalidates.  */
715   }
716 #endif /* BUILD_PARALLEL_ORDERED */
717 }
718 
719 /* ------------------------------------------------------------------------ */
720 /* The BARRIER for a SINGLE process section is always explicit   */
721 
722 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
723   int status;
724   kmp_info_t *th;
725   kmp_team_t *team;
726 
727   if (!TCR_4(__kmp_init_parallel))
728     __kmp_parallel_initialize();
729   __kmp_resume_if_soft_paused();
730 
731   th = __kmp_threads[gtid];
732   team = th->th.th_team;
733   status = 0;
734 
735   th->th.th_ident = id_ref;
736 
737   if (team->t.t_serialized) {
738     status = 1;
739   } else {
740     kmp_int32 old_this = th->th.th_local.this_construct;
741 
742     ++th->th.th_local.this_construct;
743     /* try to set team count to thread count--success means thread got the
744        single block */
745     /* TODO: Should this be acquire or release? */
746     if (team->t.t_construct == old_this) {
747       status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
748                                               th->th.th_local.this_construct);
749     }
750 #if USE_ITT_BUILD
751     if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
752         KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
753         team->t.t_active_level ==
754             1) { // Only report metadata by master of active team at level 1
755       __kmp_itt_metadata_single(id_ref);
756     }
757 #endif /* USE_ITT_BUILD */
758   }
759 
760   if (__kmp_env_consistency_check) {
761     if (status && push_ws) {
762       __kmp_push_workshare(gtid, ct_psingle, id_ref);
763     } else {
764       __kmp_check_workshare(gtid, ct_psingle, id_ref);
765     }
766   }
767 #if USE_ITT_BUILD
768   if (status) {
769     __kmp_itt_single_start(gtid);
770   }
771 #endif /* USE_ITT_BUILD */
772   return status;
773 }
774 
775 void __kmp_exit_single(int gtid) {
776 #if USE_ITT_BUILD
777   __kmp_itt_single_end(gtid);
778 #endif /* USE_ITT_BUILD */
779   if (__kmp_env_consistency_check)
780     __kmp_pop_workshare(gtid, ct_psingle, NULL);
781 }
782 
/* Determine if we can go parallel or must use a serialized parallel region,
 * and how many threads we can use.
 * set_nthreads is the number of threads requested for the team.
 * Returns 1 if we should serialize or only use one thread,
 * otherwise the number of threads to use.
 * The forkjoin lock is held by the caller. */
789 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
790                                  int master_tid, int set_nthreads,
791                                  int enter_teams) {
792   int capacity;
793   int new_nthreads;
794   KMP_DEBUG_ASSERT(__kmp_init_serial);
795   KMP_DEBUG_ASSERT(root && parent_team);
796   kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
797 
798   // If dyn-var is set, dynamically adjust the number of desired threads,
799   // according to the method specified by dynamic_mode.
800   new_nthreads = set_nthreads;
  if (!get__dynamic_2(parent_team, master_tid)) {
    ; // dyn-var is off: keep the requested number of threads
  }
804 #ifdef USE_LOAD_BALANCE
805   else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
806     new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
807     if (new_nthreads == 1) {
808       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
809                     "reservation to 1 thread\n",
810                     master_tid));
811       return 1;
812     }
813     if (new_nthreads < set_nthreads) {
814       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
815                     "reservation to %d threads\n",
816                     master_tid, new_nthreads));
817     }
818   }
819 #endif /* USE_LOAD_BALANCE */
820   else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
821     new_nthreads = __kmp_avail_proc - __kmp_nth +
822                    (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
823     if (new_nthreads <= 1) {
824       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
825                     "reservation to 1 thread\n",
826                     master_tid));
827       return 1;
828     }
829     if (new_nthreads < set_nthreads) {
830       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
831                     "reservation to %d threads\n",
832                     master_tid, new_nthreads));
833     } else {
834       new_nthreads = set_nthreads;
835     }
836   } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
837     if (set_nthreads > 2) {
838       new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
839       new_nthreads = (new_nthreads % set_nthreads) + 1;
840       if (new_nthreads == 1) {
841         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
842                       "reservation to 1 thread\n",
843                       master_tid));
844         return 1;
845       }
846       if (new_nthreads < set_nthreads) {
847         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
848                       "reservation to %d threads\n",
849                       master_tid, new_nthreads));
850       }
851     }
852   } else {
853     KMP_ASSERT(0);
854   }
855 
856   // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
857   if (__kmp_nth + new_nthreads -
858           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
859       __kmp_max_nth) {
860     int tl_nthreads = __kmp_max_nth - __kmp_nth +
861                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
862     if (tl_nthreads <= 0) {
863       tl_nthreads = 1;
864     }
865 
866     // If dyn-var is false, emit a 1-time warning.
867     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
868       __kmp_reserve_warn = 1;
869       __kmp_msg(kmp_ms_warning,
870                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
871                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
872     }
873     if (tl_nthreads == 1) {
874       KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
875                     "reduced reservation to 1 thread\n",
876                     master_tid));
877       return 1;
878     }
879     KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
880                   "reservation to %d threads\n",
881                   master_tid, tl_nthreads));
882     new_nthreads = tl_nthreads;
883   }
884 
885   // Respect OMP_THREAD_LIMIT
886   int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
887   int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
888   if (cg_nthreads + new_nthreads -
889           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
890       max_cg_threads) {
891     int tl_nthreads = max_cg_threads - cg_nthreads +
892                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
893     if (tl_nthreads <= 0) {
894       tl_nthreads = 1;
895     }
896 
897     // If dyn-var is false, emit a 1-time warning.
898     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
899       __kmp_reserve_warn = 1;
900       __kmp_msg(kmp_ms_warning,
901                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
902                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
903     }
904     if (tl_nthreads == 1) {
905       KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
906                     "reduced reservation to 1 thread\n",
907                     master_tid));
908       return 1;
909     }
910     KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
911                   "reservation to %d threads\n",
912                   master_tid, tl_nthreads));
913     new_nthreads = tl_nthreads;
914   }
915 
916   // Check if the threads array is large enough, or needs expanding.
917   // See comment in __kmp_register_root() about the adjustment if
918   // __kmp_threads[0] == NULL.
919   capacity = __kmp_threads_capacity;
920   if (TCR_PTR(__kmp_threads[0]) == NULL) {
921     --capacity;
922   }
923   if (__kmp_nth + new_nthreads -
924           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
925       capacity) {
926     // Expand the threads array.
927     int slotsRequired = __kmp_nth + new_nthreads -
928                         (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
929                         capacity;
930     int slotsAdded = __kmp_expand_threads(slotsRequired);
931     if (slotsAdded < slotsRequired) {
932       // The threads array was not expanded enough.
933       new_nthreads -= (slotsRequired - slotsAdded);
934       KMP_ASSERT(new_nthreads >= 1);
935 
936       // If dyn-var is false, emit a 1-time warning.
937       if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
938         __kmp_reserve_warn = 1;
939         if (__kmp_tp_cached) {
940           __kmp_msg(kmp_ms_warning,
941                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
942                     KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
943                     KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
944         } else {
945           __kmp_msg(kmp_ms_warning,
946                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
947                     KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
948         }
949       }
950     }
951   }
952 
953 #ifdef KMP_DEBUG
954   if (new_nthreads == 1) {
955     KC_TRACE(10,
956              ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
957               "dead roots and rechecking; requested %d threads\n",
958               __kmp_get_gtid(), set_nthreads));
959   } else {
960     KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
961                   " %d threads\n",
962                   __kmp_get_gtid(), new_nthreads, set_nthreads));
963   }
964 #endif // KMP_DEBUG
965   return new_nthreads;
966 }
967 
/* Allocate threads from the thread pool and assign them to the new team. We
   are assured that there are enough threads available, because we checked on
   that earlier within the forkjoin critical section. */
971 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
972                                     kmp_info_t *master_th, int master_gtid) {
973   int i;
974   int use_hot_team;
975 
976   KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
977   KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
978   KMP_MB();
979 
980   /* first, let's setup the master thread */
981   master_th->th.th_info.ds.ds_tid = 0;
982   master_th->th.th_team = team;
983   master_th->th.th_team_nproc = team->t.t_nproc;
984   master_th->th.th_team_master = master_th;
985   master_th->th.th_team_serialized = FALSE;
986   master_th->th.th_dispatch = &team->t.t_dispatch[0];
987 
988 /* make sure we are not the optimized hot team */
989 #if KMP_NESTED_HOT_TEAMS
990   use_hot_team = 0;
991   kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
992   if (hot_teams) { // hot teams array is not allocated if
993     // KMP_HOT_TEAMS_MAX_LEVEL=0
994     int level = team->t.t_active_level - 1; // index in array of hot teams
995     if (master_th->th.th_teams_microtask) { // are we inside the teams?
996       if (master_th->th.th_teams_size.nteams > 1) {
997         ++level; // level was not increased in teams construct for
998         // team_of_masters
999       }
1000       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1001           master_th->th.th_teams_level == team->t.t_level) {
1002         ++level; // level was not increased in teams construct for
1003         // team_of_workers before the parallel
1004       } // team->t.t_level will be increased inside parallel
1005     }
1006     if (level < __kmp_hot_teams_max_level) {
1007       if (hot_teams[level].hot_team) {
1008         // hot team has already been allocated for given level
1009         KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
1010         use_hot_team = 1; // the team is ready to use
1011       } else {
1012         use_hot_team = 0; // AC: threads are not allocated yet
1013         hot_teams[level].hot_team = team; // remember new hot team
1014         hot_teams[level].hot_team_nth = team->t.t_nproc;
1015       }
1016     } else {
1017       use_hot_team = 0;
1018     }
1019   }
1020 #else
1021   use_hot_team = team == root->r.r_hot_team;
1022 #endif
1023   if (!use_hot_team) {
1024 
1025     /* install the master thread */
1026     team->t.t_threads[0] = master_th;
1027     __kmp_initialize_info(master_th, team, 0, master_gtid);
1028 
1029     /* now, install the worker threads */
1030     for (i = 1; i < team->t.t_nproc; i++) {
1031 
1032       /* fork or reallocate a new thread and install it in team */
1033       kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1034       team->t.t_threads[i] = thr;
1035       KMP_DEBUG_ASSERT(thr);
1036       KMP_DEBUG_ASSERT(thr->th.th_team == team);
1037       /* align team and thread arrived states */
1038       KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1039                     "T#%d(%d:%d) join =%llu, plain=%llu\n",
1040                     __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1041                     __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1042                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1043                     team->t.t_bar[bs_plain_barrier].b_arrived));
1044       thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1045       thr->th.th_teams_level = master_th->th.th_teams_level;
1046       thr->th.th_teams_size = master_th->th.th_teams_size;
1047       { // Initialize threads' barrier data.
1048         int b;
1049         kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1050         for (b = 0; b < bs_last_barrier; ++b) {
1051           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1052           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1053 #if USE_DEBUGGER
1054           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1055 #endif
1056         }
1057       }
1058     }
1059 
1060 #if KMP_AFFINITY_SUPPORTED
1061     __kmp_partition_places(team);
1062 #endif
1063   }
1064 
1065   if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1066     for (i = 0; i < team->t.t_nproc; i++) {
1067       kmp_info_t *thr = team->t.t_threads[i];
1068       if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1069           thr->th.th_prev_level != team->t.t_level) {
1070         team->t.t_display_affinity = 1;
1071         break;
1072       }
1073     }
1074   }
1075 
1076   KMP_MB();
1077 }
1078 
1079 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
// Propagate any changes to the floating point control registers out to the
// team. We try to avoid unnecessary writes to the relevant cache line in the
// team structure, so we don't make changes unless they are needed.
1083 inline static void propagateFPControl(kmp_team_t *team) {
1084   if (__kmp_inherit_fp_control) {
1085     kmp_int16 x87_fpu_control_word;
1086     kmp_uint32 mxcsr;
1087 
1088     // Get master values of FPU control flags (both X87 and vector)
1089     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1090     __kmp_store_mxcsr(&mxcsr);
1091     mxcsr &= KMP_X86_MXCSR_MASK;
1092 
1093     // There is no point looking at t_fp_control_saved here.
1094     // If it is TRUE, we still have to update the values if they are different
1095     // from those we now have. If it is FALSE we didn't save anything yet, but
1096     // our objective is the same. We have to ensure that the values in the team
1097     // are the same as those we have.
1098     // So, this code achieves what we need whether or not t_fp_control_saved is
1099     // true. By checking whether the value needs updating we avoid unnecessary
1100     // writes that would put the cache-line into a written state, causing all
1101     // threads in the team to have to read it again.
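    // Rough sketch of the macro used below (see kmp.h for the exact
    // definition): KMP_CHECK_UPDATE(a, b) behaves like "if (a != b) a = b;",
    // so the store (and the resulting cache-line invalidation) only happens
    // when the value actually changes.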
1102     KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1103     KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1104     // Although we don't use this value, other code in the runtime wants to know
1105     // whether it should restore them. So we must ensure it is correct.
1106     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1107   } else {
1108     // Similarly here. Don't write to this cache-line in the team structure
1109     // unless we have to.
1110     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1111   }
1112 }
1113 
1114 // Do the opposite, setting the hardware registers to the updated values from
1115 // the team.
1116 inline static void updateHWFPControl(kmp_team_t *team) {
1117   if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
    // Only reset the fp control regs if they have been changed in the team by
    // the parallel region that we are exiting.
1120     kmp_int16 x87_fpu_control_word;
1121     kmp_uint32 mxcsr;
1122     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1123     __kmp_store_mxcsr(&mxcsr);
1124     mxcsr &= KMP_X86_MXCSR_MASK;
1125 
1126     if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1127       __kmp_clear_x87_fpu_status_word();
1128       __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1129     }
1130 
1131     if (team->t.t_mxcsr != mxcsr) {
1132       __kmp_load_mxcsr(&team->t.t_mxcsr);
1133     }
1134   }
1135 }
1136 #else
1137 #define propagateFPControl(x) ((void)0)
1138 #define updateHWFPControl(x) ((void)0)
1139 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1140 
1141 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1142                                      int realloc); // forward declaration
1143 
/* Run a parallel region that has been serialized, so it runs only in a team
   of the single master thread. */
1146 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1147   kmp_info_t *this_thr;
1148   kmp_team_t *serial_team;
1149 
1150   KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1151 
1152   /* Skip all this code for autopar serialized loops since it results in
1153      unacceptable overhead */
1154   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1155     return;
1156 
1157   if (!TCR_4(__kmp_init_parallel))
1158     __kmp_parallel_initialize();
1159   __kmp_resume_if_soft_paused();
1160 
1161   this_thr = __kmp_threads[global_tid];
1162   serial_team = this_thr->th.th_serial_team;
1163 
1164   /* utilize the serialized team held by this thread */
1165   KMP_DEBUG_ASSERT(serial_team);
1166   KMP_MB();
1167 
1168   if (__kmp_tasking_mode != tskm_immediate_exec) {
1169     KMP_DEBUG_ASSERT(
1170         this_thr->th.th_task_team ==
1171         this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1172     KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1173                      NULL);
1174     KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1175                   "team %p, new task_team = NULL\n",
1176                   global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1177     this_thr->th.th_task_team = NULL;
1178   }
1179 
1180   kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1181   if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1182     proc_bind = proc_bind_false;
1183   } else if (proc_bind == proc_bind_default) {
1184     // No proc_bind clause was specified, so use the current value
1185     // of proc-bind-var for this parallel region.
1186     proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1187   }
1188   // Reset for next parallel region
1189   this_thr->th.th_set_proc_bind = proc_bind_default;
1190 
1191 #if OMPT_SUPPORT
1192   ompt_data_t ompt_parallel_data = ompt_data_none;
1193   ompt_data_t *implicit_task_data;
1194   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1195   if (ompt_enabled.enabled &&
1196       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1197 
1198     ompt_task_info_t *parent_task_info;
1199     parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1200 
1201     parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1202     if (ompt_enabled.ompt_callback_parallel_begin) {
1203       int team_size = 1;
1204 
1205       ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1206           &(parent_task_info->task_data), &(parent_task_info->frame),
1207           &ompt_parallel_data, team_size,
1208           ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1209     }
1210   }
1211 #endif // OMPT_SUPPORT
1212 
1213   if (this_thr->th.th_team != serial_team) {
1214     // Nested level will be an index in the nested nthreads array
1215     int level = this_thr->th.th_team->t.t_level;
1216 
1217     if (serial_team->t.t_serialized) {
1218       /* this serial team was already used
         TODO increase performance by making these locks more specific */
1220       kmp_team_t *new_team;
1221 
1222       __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1223 
1224       new_team =
1225           __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1226 #if OMPT_SUPPORT
1227                               ompt_parallel_data,
1228 #endif
1229                               proc_bind, &this_thr->th.th_current_task->td_icvs,
1230                               0 USE_NESTED_HOT_ARG(NULL));
1231       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1232       KMP_ASSERT(new_team);
1233 
1234       /* setup new serialized team and install it */
1235       new_team->t.t_threads[0] = this_thr;
1236       new_team->t.t_parent = this_thr->th.th_team;
1237       serial_team = new_team;
1238       this_thr->th.th_serial_team = serial_team;
1239 
1240       KF_TRACE(
1241           10,
1242           ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1243            global_tid, serial_team));
1244 
1245       /* TODO the above breaks the requirement that if we run out of resources,
1246          then we can still guarantee that serialized teams are ok, since we may
1247          need to allocate a new one */
1248     } else {
1249       KF_TRACE(
1250           10,
1251           ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1252            global_tid, serial_team));
1253     }
1254 
1255     /* we have to initialize this serial team */
1256     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1257     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1258     KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1259     serial_team->t.t_ident = loc;
1260     serial_team->t.t_serialized = 1;
1261     serial_team->t.t_nproc = 1;
1262     serial_team->t.t_parent = this_thr->th.th_team;
1263     serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1264     this_thr->th.th_team = serial_team;
1265     serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1266 
1267     KF_TRACE(10, ("__kmpc_serialized_parallel: T#d curtask=%p\n", global_tid,
1268                   this_thr->th.th_current_task));
1269     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1270     this_thr->th.th_current_task->td_flags.executing = 0;
1271 
1272     __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1273 
1274     /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1275        implicit task for each serialized task represented by
1276        team->t.t_serialized? */
1277     copy_icvs(&this_thr->th.th_current_task->td_icvs,
1278               &this_thr->th.th_current_task->td_parent->td_icvs);
1279 
1280     // Thread value exists in the nested nthreads array for the next nested
1281     // level
1282     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1283       this_thr->th.th_current_task->td_icvs.nproc =
1284           __kmp_nested_nth.nth[level + 1];
1285     }
1286 
1287     if (__kmp_nested_proc_bind.used &&
1288         (level + 1 < __kmp_nested_proc_bind.used)) {
1289       this_thr->th.th_current_task->td_icvs.proc_bind =
1290           __kmp_nested_proc_bind.bind_types[level + 1];
1291     }
1292 
1293 #if USE_DEBUGGER
1294     serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1295 #endif
1296     this_thr->th.th_info.ds.ds_tid = 0;
1297 
1298     /* set thread cache values */
1299     this_thr->th.th_team_nproc = 1;
1300     this_thr->th.th_team_master = this_thr;
1301     this_thr->th.th_team_serialized = 1;
1302 
1303     serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1304     serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1305     serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1306 
1307     propagateFPControl(serial_team);
1308 
1309     /* check if we need to allocate dispatch buffers stack */
1310     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1311     if (!serial_team->t.t_dispatch->th_disp_buffer) {
1312       serial_team->t.t_dispatch->th_disp_buffer =
1313           (dispatch_private_info_t *)__kmp_allocate(
1314               sizeof(dispatch_private_info_t));
1315     }
1316     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1317 
1318     KMP_MB();
1319 
1320   } else {
1321     /* this serialized team is already being used,
1322      * that's fine, just add another nested level */
1323     KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1324     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1325     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1326     ++serial_team->t.t_serialized;
1327     this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1328 
1329     // Nested level will be an index in the nested nthreads array
1330     int level = this_thr->th.th_team->t.t_level;
1331     // Thread value exists in the nested nthreads array for the next nested
1332     // level
1333     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1334       this_thr->th.th_current_task->td_icvs.nproc =
1335           __kmp_nested_nth.nth[level + 1];
1336     }
1337     serial_team->t.t_level++;
1338     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1339                   "of serial team %p to %d\n",
1340                   global_tid, serial_team, serial_team->t.t_level));
1341 
1342     /* allocate/push dispatch buffers stack */
1343     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1344     {
1345       dispatch_private_info_t *disp_buffer =
1346           (dispatch_private_info_t *)__kmp_allocate(
1347               sizeof(dispatch_private_info_t));
1348       disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1349       serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1350     }
1351     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1352 
1353     KMP_MB();
1354   }
1355   KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1356 
1357   // Perform the display affinity functionality for
1358   // serialized parallel regions
1359   if (__kmp_display_affinity) {
1360     if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1361         this_thr->th.th_prev_num_threads != 1) {
1362       // NULL means use the affinity-format-var ICV
1363       __kmp_aux_display_affinity(global_tid, NULL);
1364       this_thr->th.th_prev_level = serial_team->t.t_level;
1365       this_thr->th.th_prev_num_threads = 1;
1366     }
1367   }
1368 
1369   if (__kmp_env_consistency_check)
1370     __kmp_push_parallel(global_tid, NULL);
1371 #if OMPT_SUPPORT
1372   serial_team->t.ompt_team_info.master_return_address = codeptr;
1373   if (ompt_enabled.enabled &&
1374       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
        OMPT_GET_FRAME_ADDRESS(0);
1376 
1377     ompt_lw_taskteam_t lw_taskteam;
1378     __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1379                             &ompt_parallel_data, codeptr);
1380 
1381     __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
    // don't use lw_taskteam after linking. Its content was swapped.
1383 
1384     /* OMPT implicit task begin */
1385     implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
1386     if (ompt_enabled.ompt_callback_implicit_task) {
1387       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1388           ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
          OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
          ompt_task_implicit); // TODO: Can this be ompt_task_initial?
      OMPT_CUR_TASK_INFO(this_thr)->thread_num =
          __kmp_tid_from_gtid(global_tid);
1392     }
1393 
1394     /* OMPT state */
1395     this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
        OMPT_GET_FRAME_ADDRESS(0);
1397   }
1398 #endif
1399 }
1400 
1401 /* most of the work for a fork */
1402 /* return true if we really went parallel, false if serialized */
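/* Illustrative call path (added comment; sketch only): the compiler outlines
   the body of a parallel region into a microtask, and an entry point such as
   __kmpc_fork_call (Intel/LLVM interface) forwards it here, roughly:
     void outlined(kmp_int32 *gtid, kmp_int32 *btid, int *a) { ... }
     __kmpc_fork_call(&loc, 1, (kmpc_micro)outlined, &a);
     // -> __kmp_fork_call(loc, gtid, fork_context_intel, ...)
*/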
1403 int __kmp_fork_call(ident_t *loc, int gtid,
1404                     enum fork_context_e call_context, // Intel, GNU, ...
1405                     kmp_int32 argc, microtask_t microtask, launch_t invoker,
1406                     kmp_va_list ap) {
1407   void **argv;
1408   int i;
1409   int master_tid;
1410   int master_this_cons;
1411   kmp_team_t *team;
1412   kmp_team_t *parent_team;
1413   kmp_info_t *master_th;
1414   kmp_root_t *root;
1415   int nthreads;
1416   int master_active;
1417   int master_set_numthreads;
1418   int level;
1419   int active_level;
1420   int teams_level;
1421 #if KMP_NESTED_HOT_TEAMS
1422   kmp_hot_team_ptr_t **p_hot_teams;
1423 #endif
1424   { // KMP_TIME_BLOCK
1425     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1426     KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1427 
1428     KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1429     if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
      /* Some systems prefer the stack for the root thread(s) to start with
         some gap from the parent stack to prevent false sharing. */
      void *dummy = KMP_ALLOCA(__kmp_stkpadding);
      /* The two lines below keep 'dummy' used so the allocation above is not
         optimized out */
1434       if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1435         __kmp_stkpadding += (short)((kmp_int64)dummy);
1436     }
1437 
1438     /* initialize if needed */
1439     KMP_DEBUG_ASSERT(
1440         __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1441     if (!TCR_4(__kmp_init_parallel))
1442       __kmp_parallel_initialize();
1443     __kmp_resume_if_soft_paused();
1444 
1445     /* setup current data */
1446     master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1447     // shutdown
1448     parent_team = master_th->th.th_team;
1449     master_tid = master_th->th.th_info.ds.ds_tid;
1450     master_this_cons = master_th->th.th_local.this_construct;
1451     root = master_th->th.th_root;
1452     master_active = root->r.r_active;
1453     master_set_numthreads = master_th->th.th_set_nproc;
1454 
1455 #if OMPT_SUPPORT
1456     ompt_data_t ompt_parallel_data = ompt_data_none;
1457     ompt_data_t *parent_task_data;
1458     ompt_frame_t *ompt_frame;
1459     ompt_data_t *implicit_task_data;
1460     void *return_address = NULL;
1461 
1462     if (ompt_enabled.enabled) {
1463       __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1464                                     NULL, NULL);
1465       return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1466     }
1467 #endif
1468 
    // The nesting level is used as an index into the nested nthreads array
    level = parent_team->t.t_level;
    // used to launch non-serial teams even if nesting is not allowed
    active_level = parent_team->t.t_active_level;
    // needed to check for nesting inside a teams construct
    teams_level = master_th->th.th_teams_level;
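    // Example (added comment): for
    //   #pragma omp teams
    //   #pragma omp parallel   // here level == teams_level
    // a parallel nested directly inside teams is detected below by comparing
    // level against teams_level.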
1475 #if KMP_NESTED_HOT_TEAMS
1476     p_hot_teams = &master_th->th.th_hot_teams;
1477     if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1478       *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1479           sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1480       (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1481       // it is either actual or not needed (when active_level > 0)
1482       (*p_hot_teams)[0].hot_team_nth = 1;
1483     }
1484 #endif
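    // Note (added): hot teams are kept alive between parallel regions so that
    // threads need not be re-forked; the array allocated above is indexed by
    // nesting level, up to __kmp_hot_teams_max_level (KMP_HOT_TEAMS_MAX_LEVEL).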
1485 
1486 #if OMPT_SUPPORT
1487     if (ompt_enabled.enabled) {
1488       if (ompt_enabled.ompt_callback_parallel_begin) {
1489         int team_size = master_set_numthreads
1490                             ? master_set_numthreads
1491                             : get__nproc_2(parent_team, master_tid);
1492         int flags = OMPT_INVOKER(call_context) |
1493                     ((microtask == (microtask_t)__kmp_teams_master)
1494                          ? ompt_parallel_league
1495                          : ompt_parallel_team);
1496         ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1497             parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1498             return_address);
1499       }
1500       master_th->th.ompt_thread_info.state = ompt_state_overhead;
1501     }
1502 #endif
1503 
1504     master_th->th.th_ident = loc;
1505 
1506     if (master_th->th.th_teams_microtask && ap &&
1507         microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
      // AC: This is the start of a parallel region nested inside a teams
      // construct. The team already exists (it is hot) and all workers are
      // waiting at the fork barrier, so no lock is needed to initialize the
      // team a bit and then release the workers.
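      // Illustrative user code reaching this path (added comment):
      //   #pragma omp teams num_teams(2)
      //   #pragma omp parallel num_threads(4)   // <-- this fork
      //   { ... }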
1511       parent_team->t.t_ident = loc;
1512       __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1513       parent_team->t.t_argc = argc;
1514       argv = (void **)parent_team->t.t_argv;
1515       for (i = argc - 1; i >= 0; --i)
1516         *argv++ = va_arg(kmp_va_deref(ap), void *);
      // Increment our nested depth level, but do not increase the
      // serialization count
1518       if (parent_team == master_th->th.th_serial_team) {
1519         // AC: we are in serialized parallel
1520         __kmpc_serialized_parallel(loc, gtid);
1521         KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1522 
1523         if (call_context == fork_context_gnu) {
1524           // AC: need to decrement t_serialized for enquiry functions to work
1525           // correctly, will restore at join time
1526           parent_team->t.t_serialized--;
1527           return TRUE;
1528         }
1529 
1530 #if OMPT_SUPPORT
1531         void *dummy;
1532         void **exit_frame_p;
1533 
1534         ompt_lw_taskteam_t lw_taskteam;
1535 
1536         if (ompt_enabled.enabled) {
1537           __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1538                                   &ompt_parallel_data, return_address);
1539           exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1540 
1541           __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
          // don't use lw_taskteam after linking; its content was swapped
1543 
1544           /* OMPT implicit task begin */
1545           implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1546           if (ompt_enabled.ompt_callback_implicit_task) {
1547             OMPT_CUR_TASK_INFO(master_th)
1548                 ->thread_num = __kmp_tid_from_gtid(gtid);
1549             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1550                 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1551                 implicit_task_data, 1,
1552                 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1553           }
1554 
1555           /* OMPT state */
1556           master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1557         } else {
1558           exit_frame_p = &dummy;
1559         }
1560 #endif
1561         // AC: need to decrement t_serialized for enquiry functions to work
1562         // correctly, will restore at join time
1563         parent_team->t.t_serialized--;
1564 
1565         {
1566           KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1567           KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1568           __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1569 #if OMPT_SUPPORT
1570                                  ,
1571                                  exit_frame_p
1572 #endif
1573                                  );
1574         }
1575 
1576 #if OMPT_SUPPORT
1577         if (ompt_enabled.enabled) {
1578           *exit_frame_p = NULL;
1579           OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1580           if (ompt_enabled.ompt_callback_implicit_task) {
1581             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1582                 ompt_scope_end, NULL, implicit_task_data, 1,
1583                 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1584           }
1585           ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1586           __ompt_lw_taskteam_unlink(master_th);
1587           if (ompt_enabled.ompt_callback_parallel_end) {
1588             ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1589                 &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1590                 OMPT_INVOKER(call_context) | ompt_parallel_team,
1591                 return_address);
1592           }
1593           master_th->th.ompt_thread_info.state = ompt_state_overhead;
1594         }
1595 #endif
1596         return TRUE;
1597       }
1598 
1599       parent_team->t.t_pkfn = microtask;
1600       parent_team->t.t_invoke = invoker;
1601       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1602       parent_team->t.t_active_level++;
1603       parent_team->t.t_level++;
1604       parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1605 
1606 #if OMPT_SUPPORT
1607       if (ompt_enabled.enabled) {
1608         ompt_lw_taskteam_t lw_taskteam;
1609         __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1610                                 &ompt_parallel_data, return_address);
1611         __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1612       }
1613 #endif
1614 
1615       /* Change number of threads in the team if requested */
1616       if (master_set_numthreads) { // The parallel has num_threads clause
1617         if (master_set_numthreads < master_th->th.th_teams_size.nth) {
          // AC: the number of threads can only be reduced dynamically, not
          // increased
1619           kmp_info_t **other_threads = parent_team->t.t_threads;
1620           parent_team->t.t_nproc = master_set_numthreads;
1621           for (i = 0; i < master_set_numthreads; ++i) {
1622             other_threads[i]->th.th_team_nproc = master_set_numthreads;
1623           }
1624           // Keep extra threads hot in the team for possible next parallels
1625         }
1626         master_th->th.th_set_nproc = 0;
1627       }
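      // Example (added, assuming thread_limit(8) on the enclosing teams):
      //   #pragma omp parallel num_threads(4)
      // reduces t_nproc to 4 for this parallel region, while the remaining
      // threads stay hot in the team for later regions.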
1628 
1629 #if USE_DEBUGGER
1630       if (__kmp_debugging) { // Let debugger override number of threads.
1631         int nth = __kmp_omp_num_threads(loc);
1632         if (nth > 0) { // 0 means debugger doesn't want to change num threads
1633           master_set_numthreads = nth;
1634         }
1635       }
1636 #endif
1637 
1638 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1639       if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1640            KMP_ITT_DEBUG) &&
1641           __kmp_forkjoin_frames_mode == 3 &&
1642           parent_team->t.t_active_level == 1 // only report frames at level 1
1643           && master_th->th.th_teams_size.nteams == 1) {
1644         kmp_uint64 tmp_time = __itt_get_timestamp();
1645         master_th->th.th_frame_time = tmp_time;
1646         parent_team->t.t_region_time = tmp_time;
1647       }
1648       if (__itt_stack_caller_create_ptr) {
1649         // create new stack stitching id before entering fork barrier
1650         parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1651       }
1652 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1653 
1654       KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1655                     "master_th=%p, gtid=%d\n",
1656                     root, parent_team, master_th, gtid));
1657       __kmp_internal_fork(loc, gtid, parent_team);
1658       KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1659                     "master_th=%p, gtid=%d\n",
1660                     root, parent_team, master_th, gtid));
1661 
1662       if (call_context == fork_context_gnu)
1663         return TRUE;
1664 
1665       /* Invoke microtask for MASTER thread */
1666       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1667                     parent_team->t.t_id, parent_team->t.t_pkfn));
1668 
1669       if (!parent_team->t.t_invoke(gtid)) {
1670         KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
1671       }
1672       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1673                     parent_team->t.t_id, parent_team->t.t_pkfn));
1674       KMP_MB(); /* Flush all pending memory write invalidates.  */
1675 
1676       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1677 
1678       return TRUE;
1679     } // Parallel closely nested in teams construct
1680 
1681 #if KMP_DEBUG
1682     if (__kmp_tasking_mode != tskm_immediate_exec) {
1683       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1684                        parent_team->t.t_task_team[master_th->th.th_task_state]);
1685     }
1686 #endif
1687 
1688     if (parent_team->t.t_active_level >=
1689         master_th->th.th_current_task->td_icvs.max_active_levels) {
1690       nthreads = 1;
1691     } else {
1692       int enter_teams = ((ap == NULL && active_level == 0) ||
1693                          (ap && teams_level > 0 && teams_level == level));
1694       nthreads =
1695           master_set_numthreads
1696               ? master_set_numthreads
1697               : get__nproc_2(
1698                     parent_team,
1699                     master_tid); // TODO: get nproc directly from current task
1700 
      // Check whether we need to take the forkjoin lock (there is no need for
      // a serialized parallel outside of a teams construct). This code was
      // moved here from __kmp_reserve_threads() to speed up nested serialized
      // parallel regions.
1704       if (nthreads > 1) {
1705         if ((get__max_active_levels(master_th) == 1 &&
1706              (root->r.r_in_parallel && !enter_teams)) ||
1707             (__kmp_library == library_serial)) {
1708           KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1709                         " threads\n",
1710                         gtid, nthreads));
1711           nthreads = 1;
1712         }
1713       }
1714       if (nthreads > 1) {
1715         /* determine how many new threads we can use */
1716         __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
        /* AC: If teams is executed from a parallel region (on the host), then
           the teams should be created, but each can only have 1 thread if
           nesting is disabled. If teams is called from a serial region, then
           the teams and their threads should be created regardless of the
           nesting setting. */
1721         nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
1722                                          nthreads, enter_teams);
1723         if (nthreads == 1) {
          // Free the lock for single-threaded execution here; for
          // multi-threaded execution it will be freed later, after the team
          // of threads has been created and initialized
1727           __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1728         }
1729       }
1730     }
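    // Summary (added): nthreads == 1 at this point means the region will be
    // serialized, e.g. because max_active_levels was reached, the library is
    // in serial mode, nesting is effectively disabled while already inside an
    // active parallel region, or __kmp_reserve_threads() could not reserve
    // more than one thread.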
1731     KMP_DEBUG_ASSERT(nthreads > 0);
1732 
1733     // If we temporarily changed the set number of threads then restore it now
1734     master_th->th.th_set_nproc = 0;
1735 
1736     /* create a serialized parallel region? */
1737     if (nthreads == 1) {
1738 /* josh todo: hypothetical question: what do we do for OS X*? */
1739 #if KMP_OS_LINUX &&                                                            \
1740     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1741       void *args[argc];
1742 #else
1743       void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1744 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1745           KMP_ARCH_AARCH64) */
1746 
1747       KA_TRACE(20,
1748                ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1749 
1750       __kmpc_serialized_parallel(loc, gtid);
1751 
1752       if (call_context == fork_context_intel) {
        /* TODO: this is clumsy; have the compiler itself pass the args. */
1754         master_th->th.th_serial_team->t.t_ident = loc;
1755         if (!ap) {
1756           // revert change made in __kmpc_serialized_parallel()
1757           master_th->th.th_serial_team->t.t_level--;
1758 // Get args from parent team for teams construct
1759 
1760 #if OMPT_SUPPORT
1761           void *dummy;
1762           void **exit_frame_p;
1763           ompt_task_info_t *task_info;
1764 
1765           ompt_lw_taskteam_t lw_taskteam;
1766 
1767           if (ompt_enabled.enabled) {
1768             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1769                                     &ompt_parallel_data, return_address);
1770 
1771             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
            // don't use lw_taskteam after linking; its content was swapped
1773 
1774             task_info = OMPT_CUR_TASK_INFO(master_th);
1775             exit_frame_p = &(task_info->frame.exit_frame.ptr);
1776             if (ompt_enabled.ompt_callback_implicit_task) {
1777               OMPT_CUR_TASK_INFO(master_th)
1778                   ->thread_num = __kmp_tid_from_gtid(gtid);
1779               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1780                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1781                   &(task_info->task_data), 1,
1782                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1783                   ompt_task_implicit);
1784             }
1785 
1786             /* OMPT state */
1787             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1788           } else {
1789             exit_frame_p = &dummy;
1790           }
1791 #endif
1792 
1793           {
1794             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1795             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1796             __kmp_invoke_microtask(microtask, gtid, 0, argc,
1797                                    parent_team->t.t_argv
1798 #if OMPT_SUPPORT
1799                                    ,
1800                                    exit_frame_p
1801 #endif
1802                                    );
1803           }
1804 
1805 #if OMPT_SUPPORT
1806           if (ompt_enabled.enabled) {
1807             *exit_frame_p = NULL;
1808             if (ompt_enabled.ompt_callback_implicit_task) {
1809               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1810                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1811                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1812                   ompt_task_implicit);
1813             }
1814             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1815             __ompt_lw_taskteam_unlink(master_th);
1816             if (ompt_enabled.ompt_callback_parallel_end) {
1817               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1818                   &ompt_parallel_data, parent_task_data,
1819                   OMPT_INVOKER(call_context) | ompt_parallel_team,
1820                   return_address);
1821             }
1822             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1823           }
1824 #endif
1825         } else if (microtask == (microtask_t)__kmp_teams_master) {
1826           KMP_DEBUG_ASSERT(master_th->th.th_team ==
1827                            master_th->th.th_serial_team);
1828           team = master_th->th.th_team;
1829           // team->t.t_pkfn = microtask;
1830           team->t.t_invoke = invoker;
1831           __kmp_alloc_argv_entries(argc, team, TRUE);
1832           team->t.t_argc = argc;
1833           argv = (void **)team->t.t_argv;
1834           if (ap) {
1835             for (i = argc - 1; i >= 0; --i)
1836               *argv++ = va_arg(kmp_va_deref(ap), void *);
1837           } else {
1838             for (i = 0; i < argc; ++i)
1839               // Get args from parent team for teams construct
1840               argv[i] = parent_team->t.t_argv[i];
1841           }
1842           // AC: revert change made in __kmpc_serialized_parallel()
1843           //     because initial code in teams should have level=0
1844           team->t.t_level--;
1845           // AC: call special invoker for outer "parallel" of teams construct
1846           invoker(gtid);
1847 #if OMPT_SUPPORT
1848           if (ompt_enabled.enabled) {
1849             ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1850             if (ompt_enabled.ompt_callback_implicit_task) {
1851               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1852                   ompt_scope_end, NULL, &(task_info->task_data), 0,
1853                   OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1854             }
1855             if (ompt_enabled.ompt_callback_parallel_end) {
1856               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1857                   &ompt_parallel_data, parent_task_data,
1858                   OMPT_INVOKER(call_context) | ompt_parallel_league,
1859                   return_address);
1860             }
1861             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1862           }
1863 #endif
1864         } else {
1865           argv = args;
1866           for (i = argc - 1; i >= 0; --i)
1867             *argv++ = va_arg(kmp_va_deref(ap), void *);
1868           KMP_MB();
1869 
1870 #if OMPT_SUPPORT
1871           void *dummy;
1872           void **exit_frame_p;
1873           ompt_task_info_t *task_info;
1874 
1875           ompt_lw_taskteam_t lw_taskteam;
1876 
1877           if (ompt_enabled.enabled) {
1878             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1879                                     &ompt_parallel_data, return_address);
1880             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
            // don't use lw_taskteam after linking; its content was swapped
1882             task_info = OMPT_CUR_TASK_INFO(master_th);
1883             exit_frame_p = &(task_info->frame.exit_frame.ptr);
1884 
1885             /* OMPT implicit task begin */
1886             implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1887             if (ompt_enabled.ompt_callback_implicit_task) {
1888               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1889                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1890                   implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1891                   ompt_task_implicit);
1892               OMPT_CUR_TASK_INFO(master_th)
1893                   ->thread_num = __kmp_tid_from_gtid(gtid);
1894             }
1895 
1896             /* OMPT state */
1897             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1898           } else {
1899             exit_frame_p = &dummy;
1900           }
1901 #endif
1902 
1903           {
1904             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1905             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1906             __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1907 #if OMPT_SUPPORT
1908                                    ,
1909                                    exit_frame_p
1910 #endif
1911                                    );
1912           }
1913 
1914 #if OMPT_SUPPORT
1915           if (ompt_enabled.enabled) {
1916             *exit_frame_p = NULL;
1917             if (ompt_enabled.ompt_callback_implicit_task) {
1918               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1919                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1920                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1921                   ompt_task_implicit);
1922             }
1923 
1924             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1925             __ompt_lw_taskteam_unlink(master_th);
1926             if (ompt_enabled.ompt_callback_parallel_end) {
1927               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1928                   &ompt_parallel_data, parent_task_data,
1929                   OMPT_INVOKER(call_context) | ompt_parallel_team,
1930                   return_address);
1931             }
1932             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1933           }
1934 #endif
1935         }
1936       } else if (call_context == fork_context_gnu) {
1937 #if OMPT_SUPPORT
1938         ompt_lw_taskteam_t lwt;
1939         __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1940                                 return_address);
1941 
1942         lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1943         __ompt_lw_taskteam_link(&lwt, master_th, 1);
// don't use lw_taskteam after linking; its content was swapped
1945 #endif
1946 
1947         // we were called from GNU native code
1948         KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1949         return FALSE;
1950       } else {
1951         KMP_ASSERT2(call_context < fork_context_last,
1952                     "__kmp_fork_call: unknown fork_context parameter");
1953       }
1954 
1955       KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1956       KMP_MB();
1957       return FALSE;
1958     } // if (nthreads == 1)
1959 
    // GEH: only modify the executing flag when not serialized; the serialized
    //      case is handled in __kmpc_serialized_parallel
1962     KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1963                   "curtask=%p, curtask_max_aclevel=%d\n",
1964                   parent_team->t.t_active_level, master_th,
1965                   master_th->th.th_current_task,
1966                   master_th->th.th_current_task->td_icvs.max_active_levels));
1967     // TODO: GEH - cannot do this assertion because root thread not set up as
1968     // executing
1969     // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1970     master_th->th.th_current_task->td_flags.executing = 0;
1971 
1972     if (!master_th->th.th_teams_microtask || level > teams_level) {
1973       /* Increment our nested depth level */
1974       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1975     }
1976 
1977     // See if we need to make a copy of the ICVs.
1978     int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1979     if ((level + 1 < __kmp_nested_nth.used) &&
1980         (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1981       nthreads_icv = __kmp_nested_nth.nth[level + 1];
1982     } else {
1983       nthreads_icv = 0; // don't update
1984     }
1985 
1986     // Figure out the proc_bind_policy for the new team.
1987     kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1988     kmp_proc_bind_t proc_bind_icv =
1989         proc_bind_default; // proc_bind_default means don't update
1990     if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1991       proc_bind = proc_bind_false;
1992     } else {
1993       if (proc_bind == proc_bind_default) {
1994         // No proc_bind clause specified; use current proc-bind-var for this
1995         // parallel region
1996         proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1997       }
1998       /* else: The proc_bind policy was specified explicitly on parallel clause.
1999          This overrides proc-bind-var for this parallel region, but does not
2000          change proc-bind-var. */
2001       // Figure the value of proc-bind-var for the child threads.
2002       if ((level + 1 < __kmp_nested_proc_bind.used) &&
2003           (__kmp_nested_proc_bind.bind_types[level + 1] !=
2004            master_th->th.th_current_task->td_icvs.proc_bind)) {
2005         proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2006       }
2007     }
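    // Example (added): with OMP_PROC_BIND=spread,close the bind_types array
    // holds {spread, close}; an explicit proc_bind(master) clause on this
    // parallel overrides proc-bind-var for the region without changing the
    // ICV itself.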
2008 
2009     // Reset for next parallel region
2010     master_th->th.th_set_proc_bind = proc_bind_default;
2011 
2012     if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
2013       kmp_internal_control_t new_icvs;
2014       copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2015       new_icvs.next = NULL;
2016       if (nthreads_icv > 0) {
2017         new_icvs.nproc = nthreads_icv;
2018       }
2019       if (proc_bind_icv != proc_bind_default) {
2020         new_icvs.proc_bind = proc_bind_icv;
2021       }
2022 
2023       /* allocate a new parallel team */
2024       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2025       team = __kmp_allocate_team(root, nthreads, nthreads,
2026 #if OMPT_SUPPORT
2027                                  ompt_parallel_data,
2028 #endif
2029                                  proc_bind, &new_icvs,
2030                                  argc USE_NESTED_HOT_ARG(master_th));
2031     } else {
2032       /* allocate a new parallel team */
2033       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2034       team = __kmp_allocate_team(root, nthreads, nthreads,
2035 #if OMPT_SUPPORT
2036                                  ompt_parallel_data,
2037 #endif
2038                                  proc_bind,
2039                                  &master_th->th.th_current_task->td_icvs,
2040                                  argc USE_NESTED_HOT_ARG(master_th));
2041     }
2042     KF_TRACE(
2043         10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2044 
2045     /* setup the new team */
2046     KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2047     KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2048     KMP_CHECK_UPDATE(team->t.t_ident, loc);
2049     KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2050     KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2051 #if OMPT_SUPPORT
2052     KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2053                           return_address);
2054 #endif
2055     KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2056     // TODO: parent_team->t.t_level == INT_MAX ???
2057     if (!master_th->th.th_teams_microtask || level > teams_level) {
2058       int new_level = parent_team->t.t_level + 1;
2059       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2060       new_level = parent_team->t.t_active_level + 1;
2061       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2062     } else {
2063       // AC: Do not increase parallel level at start of the teams construct
2064       int new_level = parent_team->t.t_level;
2065       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2066       new_level = parent_team->t.t_active_level;
2067       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2068     }
2069     kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2070     // set master's schedule as new run-time schedule
2071     KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2072 
2073     KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2074     KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2075 
2076     // Update the floating point rounding in the team if required.
2077     propagateFPControl(team);
2078 
2079     if (__kmp_tasking_mode != tskm_immediate_exec) {
      // Set the master's task team to the team's task team. Unless this is a
      // hot team, it should be NULL.
2082       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2083                        parent_team->t.t_task_team[master_th->th.th_task_state]);
2084       KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
2085                     "%p, new task_team %p / team %p\n",
2086                     __kmp_gtid_from_thread(master_th),
2087                     master_th->th.th_task_team, parent_team,
2088                     team->t.t_task_team[master_th->th.th_task_state], team));
2089 
2090       if (active_level || master_th->th.th_task_team) {
2091         // Take a memo of master's task_state
2092         KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2093         if (master_th->th.th_task_state_top >=
2094             master_th->th.th_task_state_stack_sz) { // increase size
2095           kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2096           kmp_uint8 *old_stack, *new_stack;
2097           kmp_uint32 i;
2098           new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2099           for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2100             new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2101           }
2102           for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2103                ++i) { // zero-init rest of stack
2104             new_stack[i] = 0;
2105           }
2106           old_stack = master_th->th.th_task_state_memo_stack;
2107           master_th->th.th_task_state_memo_stack = new_stack;
2108           master_th->th.th_task_state_stack_sz = new_size;
2109           __kmp_free(old_stack);
2110         }
2111         // Store master's task_state on stack
2112         master_th->th
2113             .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2114             master_th->th.th_task_state;
2115         master_th->th.th_task_state_top++;
2116 #if KMP_NESTED_HOT_TEAMS
2117         if (master_th->th.th_hot_teams &&
2118             active_level < __kmp_hot_teams_max_level &&
2119             team == master_th->th.th_hot_teams[active_level].hot_team) {
2120           // Restore master's nested state if nested hot team
2121           master_th->th.th_task_state =
2122               master_th->th
2123                   .th_task_state_memo_stack[master_th->th.th_task_state_top];
2124         } else {
2125 #endif
2126           master_th->th.th_task_state = 0;
2127 #if KMP_NESTED_HOT_TEAMS
2128         }
2129 #endif
2130       }
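      // Note (added): the task_state pushed above is popped again in
      // __kmp_join_call() (see the th_task_state_memo_stack handling there)
      // once this parallel region joins.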
2131 #if !KMP_NESTED_HOT_TEAMS
2132       KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2133                        (team == root->r.r_hot_team));
2134 #endif
2135     }
2136 
2137     KA_TRACE(
2138         20,
2139         ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2140          gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2141          team->t.t_nproc));
2142     KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2143                      (team->t.t_master_tid == 0 &&
2144                       (team->t.t_parent == root->r.r_root_team ||
2145                        team->t.t_parent->t.t_serialized)));
2146     KMP_MB();
2147 
2148     /* now, setup the arguments */
2149     argv = (void **)team->t.t_argv;
2150     if (ap) {
2151       for (i = argc - 1; i >= 0; --i) {
2152         void *new_argv = va_arg(kmp_va_deref(ap), void *);
2153         KMP_CHECK_UPDATE(*argv, new_argv);
2154         argv++;
2155       }
2156     } else {
2157       for (i = 0; i < argc; ++i) {
2158         // Get args from parent team for teams construct
2159         KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2160       }
2161     }
2162 
2163     /* now actually fork the threads */
2164     KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2165     if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2166       root->r.r_active = TRUE;
2167 
2168     __kmp_fork_team_threads(root, team, master_th, gtid);
2169     __kmp_setup_icv_copy(team, nthreads,
2170                          &master_th->th.th_current_task->td_icvs, loc);
2171 
2172 #if OMPT_SUPPORT
2173     master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2174 #endif
2175 
2176     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2177 
2178 #if USE_ITT_BUILD
2179     if (team->t.t_active_level == 1 // only report frames at level 1
2180         && !master_th->th.th_teams_microtask) { // not in teams construct
2181 #if USE_ITT_NOTIFY
2182       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2183           (__kmp_forkjoin_frames_mode == 3 ||
2184            __kmp_forkjoin_frames_mode == 1)) {
2185         kmp_uint64 tmp_time = 0;
2186         if (__itt_get_timestamp_ptr)
2187           tmp_time = __itt_get_timestamp();
2188         // Internal fork - report frame begin
2189         master_th->th.th_frame_time = tmp_time;
2190         if (__kmp_forkjoin_frames_mode == 3)
2191           team->t.t_region_time = tmp_time;
2192       } else
2193 // only one notification scheme (either "submit" or "forking/joined", not both)
2194 #endif /* USE_ITT_NOTIFY */
2195           if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2196               __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2197         // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2198         __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2199       }
2200     }
2201 #endif /* USE_ITT_BUILD */
2202 
2203     /* now go on and do the work */
2204     KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2205     KMP_MB();
2206     KF_TRACE(10,
2207              ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2208               root, team, master_th, gtid));
2209 
2210 #if USE_ITT_BUILD
2211     if (__itt_stack_caller_create_ptr) {
2212       team->t.t_stack_id =
2213           __kmp_itt_stack_caller_create(); // create new stack stitching id
2214       // before entering fork barrier
2215     }
2216 #endif /* USE_ITT_BUILD */
2217 
    // AC: skip __kmp_internal_fork for the teams construct; let only the
    // master threads execute
2220     if (ap) {
2221       __kmp_internal_fork(loc, gtid, team);
2222       KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2223                     "master_th=%p, gtid=%d\n",
2224                     root, team, master_th, gtid));
2225     }
2226 
2227     if (call_context == fork_context_gnu) {
2228       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2229       return TRUE;
2230     }
2231 
2232     /* Invoke microtask for MASTER thread */
2233     KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2234                   team->t.t_id, team->t.t_pkfn));
2235   } // END of timer KMP_fork_call block
2236 
2237 #if KMP_STATS_ENABLED
2238   // If beginning a teams construct, then change thread state
2239   stats_state_e previous_state = KMP_GET_THREAD_STATE();
2240   if (!ap) {
2241     KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2242   }
2243 #endif
2244 
2245   if (!team->t.t_invoke(gtid)) {
2246     KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
2247   }
2248 
2249 #if KMP_STATS_ENABLED
2250   // If was beginning of a teams construct, then reset thread state
2251   if (!ap) {
2252     KMP_SET_THREAD_STATE(previous_state);
2253   }
2254 #endif
2255 
2256   KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2257                 team->t.t_id, team->t.t_pkfn));
2258   KMP_MB(); /* Flush all pending memory write invalidates.  */
2259 
2260   KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2261 
2262 #if OMPT_SUPPORT
2263   if (ompt_enabled.enabled) {
2264     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2265   }
2266 #endif
2267 
2268   return TRUE;
2269 }
2270 
2271 #if OMPT_SUPPORT
2272 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2273                                             kmp_team_t *team) {
2274   // restore state outside the region
2275   thread->th.ompt_thread_info.state =
2276       ((team->t.t_serialized) ? ompt_state_work_serial
2277                               : ompt_state_work_parallel);
2278 }
2279 
2280 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2281                                    kmp_team_t *team, ompt_data_t *parallel_data,
2282                                    int flags, void *codeptr) {
2283   ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2284   if (ompt_enabled.ompt_callback_parallel_end) {
2285     ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2286         parallel_data, &(task_info->task_data), flags, codeptr);
2287   }
2288 
2289   task_info->frame.enter_frame = ompt_data_none;
2290   __kmp_join_restore_state(thread, team);
2291 }
2292 #endif
2293 
2294 void __kmp_join_call(ident_t *loc, int gtid
2295 #if OMPT_SUPPORT
2296                      ,
2297                      enum fork_context_e fork_context
2298 #endif
2299                      ,
2300                      int exit_teams) {
2301   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2302   kmp_team_t *team;
2303   kmp_team_t *parent_team;
2304   kmp_info_t *master_th;
2305   kmp_root_t *root;
2306   int master_active;
2307 
2308   KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2309 
2310   /* setup current data */
2311   master_th = __kmp_threads[gtid];
2312   root = master_th->th.th_root;
2313   team = master_th->th.th_team;
2314   parent_team = team->t.t_parent;
2315 
2316   master_th->th.th_ident = loc;
2317 
2318 #if OMPT_SUPPORT
2319   void *team_microtask = (void *)team->t.t_pkfn;
  // For the GOMP interface with a serialized parallel region, we need
  // __kmpc_end_serialized_parallel to invoke the hooks for the OMPT
  // end-implicit-task and end-parallel events.
2323   if (ompt_enabled.enabled &&
2324       !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2325     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2326   }
2327 #endif
2328 
2329 #if KMP_DEBUG
2330   if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2331     KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2332                   "th_task_team = %p\n",
2333                   __kmp_gtid_from_thread(master_th), team,
2334                   team->t.t_task_team[master_th->th.th_task_state],
2335                   master_th->th.th_task_team));
2336     KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2337                      team->t.t_task_team[master_th->th.th_task_state]);
2338   }
2339 #endif
2340 
2341   if (team->t.t_serialized) {
2342     if (master_th->th.th_teams_microtask) {
2343       // We are in teams construct
2344       int level = team->t.t_level;
2345       int tlevel = master_th->th.th_teams_level;
2346       if (level == tlevel) {
        // AC: we did not increment it earlier, at the start of the teams
        //     construct, so do it here, at the end of the teams construct
2349         team->t.t_level++;
2350       } else if (level == tlevel + 1) {
2351         // AC: we are exiting parallel inside teams, need to increment
2352         // serialization in order to restore it in the next call to
2353         // __kmpc_end_serialized_parallel
2354         team->t.t_serialized++;
2355       }
2356     }
2357     __kmpc_end_serialized_parallel(loc, gtid);
2358 
2359 #if OMPT_SUPPORT
2360     if (ompt_enabled.enabled) {
2361       __kmp_join_restore_state(master_th, parent_team);
2362     }
2363 #endif
2364 
2365     return;
2366   }
2367 
2368   master_active = team->t.t_master_active;
2369 
2370   if (!exit_teams) {
    // AC: No barrier for the internal teams at exit from the teams construct,
    //     but there is a barrier for the external team (the league).
2373     __kmp_internal_join(loc, gtid, team);
2374   } else {
2375     master_th->th.th_task_state =
2376         0; // AC: no tasking in teams (out of any parallel)
2377   }
2378 
2379   KMP_MB();
2380 
2381 #if OMPT_SUPPORT
2382   ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2383   void *codeptr = team->t.ompt_team_info.master_return_address;
2384 #endif
2385 
2386 #if USE_ITT_BUILD
2387   if (__itt_stack_caller_create_ptr) {
2388     // destroy the stack stitching id after join barrier
2389     __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2390   }
2391   // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2392   if (team->t.t_active_level == 1 &&
2393       (!master_th->th.th_teams_microtask || /* not in teams construct */
2394        master_th->th.th_teams_size.nteams == 1)) {
2395     master_th->th.th_ident = loc;
2396     // only one notification scheme (either "submit" or "forking/joined", not
2397     // both)
2398     if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2399         __kmp_forkjoin_frames_mode == 3)
2400       __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2401                              master_th->th.th_frame_time, 0, loc,
2402                              master_th->th.th_team_nproc, 1);
2403     else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2404              !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2405       __kmp_itt_region_joined(gtid);
2406   } // active_level == 1
2407 #endif /* USE_ITT_BUILD */
2408 
2409   if (master_th->th.th_teams_microtask && !exit_teams &&
2410       team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2411       team->t.t_level == master_th->th.th_teams_level + 1) {
// AC: We need to leave the team structure intact at the end of a parallel
// region inside the teams construct, so that the same (hot) team can be used
// by the next parallel region; only the nesting levels are adjusted.
2415 #if OMPT_SUPPORT
2416     ompt_data_t ompt_parallel_data = ompt_data_none;
2417     if (ompt_enabled.enabled) {
2418       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2419       if (ompt_enabled.ompt_callback_implicit_task) {
2420         int ompt_team_size = team->t.t_nproc;
2421         ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2422             ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2423             OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2424       }
2425       task_info->frame.exit_frame = ompt_data_none;
2426       task_info->task_data = ompt_data_none;
2427       ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2428       __ompt_lw_taskteam_unlink(master_th);
2429     }
2430 #endif
2431     /* Decrement our nested depth level */
2432     team->t.t_level--;
2433     team->t.t_active_level--;
2434     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2435 
    // Restore the number of threads in the team if needed. This relies on
    // th_teams_size.nth being properly adjusted after the fork in
    // __kmp_teams_master on each teams master, in case __kmp_reserve_threads
    // reduced it.
2440     if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2441       int old_num = master_th->th.th_team_nproc;
2442       int new_num = master_th->th.th_teams_size.nth;
2443       kmp_info_t **other_threads = team->t.t_threads;
2444       team->t.t_nproc = new_num;
2445       for (int i = 0; i < old_num; ++i) {
2446         other_threads[i]->th.th_team_nproc = new_num;
2447       }
      // Adjust the state of the unused threads of the team
2449       for (int i = old_num; i < new_num; ++i) {
2450         // Re-initialize thread's barrier data.
2451         KMP_DEBUG_ASSERT(other_threads[i]);
2452         kmp_balign_t *balign = other_threads[i]->th.th_bar;
2453         for (int b = 0; b < bs_last_barrier; ++b) {
2454           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2455           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2456 #if USE_DEBUGGER
2457           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2458 #endif
2459         }
2460         if (__kmp_tasking_mode != tskm_immediate_exec) {
2461           // Synchronize thread's task state
2462           other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2463         }
2464       }
2465     }
2466 
2467 #if OMPT_SUPPORT
2468     if (ompt_enabled.enabled) {
2469       __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2470                       OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2471     }
2472 #endif
2473 
2474     return;
2475   }
2476 
2477   /* do cleanup and restore the parent team */
2478   master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2479   master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2480 
2481   master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2482 
2483   /* jc: The following lock has instructions with REL and ACQ semantics,
2484      separating the parallel user code called in this parallel region
2485      from the serial user code called after this function returns. */
2486   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2487 
2488   if (!master_th->th.th_teams_microtask ||
2489       team->t.t_level > master_th->th.th_teams_level) {
2490     /* Decrement our nested depth level */
2491     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2492   }
2493   KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2494 
2495 #if OMPT_SUPPORT
2496   if (ompt_enabled.enabled) {
2497     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2498     if (ompt_enabled.ompt_callback_implicit_task) {
2499       int flags = (team_microtask == (void *)__kmp_teams_master)
2500                       ? ompt_task_initial
2501                       : ompt_task_implicit;
2502       int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2503       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2504           ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2505           OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2506     }
2507     task_info->frame.exit_frame = ompt_data_none;
2508     task_info->task_data = ompt_data_none;
2509   }
2510 #endif
2511 
2512   KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2513                 master_th, team));
2514   __kmp_pop_current_task_from_thread(master_th);
2515 
2516 #if KMP_AFFINITY_SUPPORTED
2517   // Restore master thread's partition.
2518   master_th->th.th_first_place = team->t.t_first_place;
2519   master_th->th.th_last_place = team->t.t_last_place;
2520 #endif // KMP_AFFINITY_SUPPORTED
2521   master_th->th.th_def_allocator = team->t.t_def_allocator;
2522 
2523   updateHWFPControl(team);
2524 
2525   if (root->r.r_active != master_active)
2526     root->r.r_active = master_active;
2527 
2528   __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2529                             master_th)); // this will free worker threads
2530 
  /* This race was fun to find. Make sure the following stays inside the
     critical region; otherwise assertions may fail occasionally, because the
     old team may be reallocated and the hierarchy then appears inconsistent.
     It is actually safe to run and won't cause any bugs, but it will trigger
     those assertion failures. It is only one dereference and assignment, so we
     might as well keep it in the critical region. */
2536   master_th->th.th_team = parent_team;
2537   master_th->th.th_team_nproc = parent_team->t.t_nproc;
2538   master_th->th.th_team_master = parent_team->t.t_threads[0];
2539   master_th->th.th_team_serialized = parent_team->t.t_serialized;
2540 
2541   /* restore serialized team, if need be */
2542   if (parent_team->t.t_serialized &&
2543       parent_team != master_th->th.th_serial_team &&
2544       parent_team != root->r.r_root_team) {
2545     __kmp_free_team(root,
2546                     master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2547     master_th->th.th_serial_team = parent_team;
2548   }
2549 
2550   if (__kmp_tasking_mode != tskm_immediate_exec) {
2551     if (master_th->th.th_task_state_top >
2552         0) { // Restore task state from memo stack
2553       KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2554       // Remember master's state if we re-use this nested hot team
2555       master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2556           master_th->th.th_task_state;
2557       --master_th->th.th_task_state_top; // pop
2558       // Now restore state at this level
2559       master_th->th.th_task_state =
2560           master_th->th
2561               .th_task_state_memo_stack[master_th->th.th_task_state_top];
2562     }
2563     // Copy the task team from the parent team to the master thread
2564     master_th->th.th_task_team =
2565         parent_team->t.t_task_team[master_th->th.th_task_state];
2566     KA_TRACE(20,
2567              ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2568               __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2569               parent_team));
2570   }
2571 
2572   // TODO: GEH - cannot do this assertion because root thread not set up as
2573   // executing
2574   // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2575   master_th->th.th_current_task->td_flags.executing = 1;
2576 
2577   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2578 
2579 #if OMPT_SUPPORT
2580   int flags =
2581       OMPT_INVOKER(fork_context) |
2582       ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2583                                                       : ompt_parallel_team);
2584   if (ompt_enabled.enabled) {
2585     __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2586                     codeptr);
2587   }
2588 #endif
2589 
2590   KMP_MB();
2591   KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2592 }
2593 
2594 /* Check whether we should push an internal control record onto the
2595    serial team stack.  If so, do it.  */
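/* Example (added): a call such as omp_set_num_threads(2) made inside a nested,
   serialized parallel region reaches this function via the ICV setters below;
   the current ICVs are pushed so that __kmpc_end_serialized_parallel() can
   restore them when that serial nesting level is popped. */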
2596 void __kmp_save_internal_controls(kmp_info_t *thread) {
2597 
2598   if (thread->th.th_team != thread->th.th_serial_team) {
2599     return;
2600   }
2601   if (thread->th.th_team->t.t_serialized > 1) {
2602     int push = 0;
2603 
2604     if (thread->th.th_team->t.t_control_stack_top == NULL) {
2605       push = 1;
2606     } else {
2607       if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2608           thread->th.th_team->t.t_serialized) {
2609         push = 1;
2610       }
2611     }
2612     if (push) { /* push a record on the serial team's stack */
2613       kmp_internal_control_t *control =
2614           (kmp_internal_control_t *)__kmp_allocate(
2615               sizeof(kmp_internal_control_t));
2616 
2617       copy_icvs(control, &thread->th.th_current_task->td_icvs);
2618 
2619       control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2620 
2621       control->next = thread->th.th_team->t.t_control_stack_top;
2622       thread->th.th_team->t.t_control_stack_top = control;
2623     }
2624   }
2625 }
2626 
2627 /* Changes set_nproc */
2628 void __kmp_set_num_threads(int new_nth, int gtid) {
2629   kmp_info_t *thread;
2630   kmp_root_t *root;
2631 
2632   KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2633   KMP_DEBUG_ASSERT(__kmp_init_serial);
2634 
2635   if (new_nth < 1)
2636     new_nth = 1;
2637   else if (new_nth > __kmp_max_nth)
2638     new_nth = __kmp_max_nth;
2639 
2640   KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2641   thread = __kmp_threads[gtid];
2642   if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2643     return; // nothing to do
2644 
2645   __kmp_save_internal_controls(thread);
2646 
2647   set__nproc(thread, new_nth);
2648 
2649   // If this omp_set_num_threads() call will cause the hot team size to be
2650   // reduced (in the absence of a num_threads clause), then reduce it now,
2651   // rather than waiting for the next parallel region.
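  // Example (added): if the previous region ran with 8 threads and the user
  // now calls omp_set_num_threads(2) from serial code, the hot team is shrunk
  // to 2 threads here rather than at the next fork.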
2652   root = thread->th.th_root;
2653   if (__kmp_init_parallel && (!root->r.r_active) &&
2654       (root->r.r_hot_team->t.t_nproc > new_nth)
2655 #if KMP_NESTED_HOT_TEAMS
2656       && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2657 #endif
2658       ) {
2659     kmp_team_t *hot_team = root->r.r_hot_team;
2660     int f;
2661 
2662     __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2663 
2664     // Release the extra threads we don't need any more.
2665     for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2666       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2667       if (__kmp_tasking_mode != tskm_immediate_exec) {
        // When decreasing the team size, threads that are no longer in the
        // team should drop their reference to the task team.
2670         hot_team->t.t_threads[f]->th.th_task_team = NULL;
2671       }
2672       __kmp_free_thread(hot_team->t.t_threads[f]);
2673       hot_team->t.t_threads[f] = NULL;
2674     }
2675     hot_team->t.t_nproc = new_nth;
2676 #if KMP_NESTED_HOT_TEAMS
2677     if (thread->th.th_hot_teams) {
2678       KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2679       thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2680     }
2681 #endif
2682 
2683     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2684 
2685     // Update the t_nproc field in the threads that are still active.
2686     for (f = 0; f < new_nth; f++) {
2687       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2688       hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2689     }
    // Special flag marking that the team size was changed by an
    // omp_set_num_threads() call (as opposed to a num_threads clause)
2691     hot_team->t.t_size_changed = -1;
2692   }
2693 }
2694 
2695 /* Changes max_active_levels */
2696 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2697   kmp_info_t *thread;
2698 
2699   KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2700                 "%d = (%d)\n",
2701                 gtid, max_active_levels));
2702   KMP_DEBUG_ASSERT(__kmp_init_serial);
2703 
2704   // validate max_active_levels
2705   if (max_active_levels < 0) {
2706     KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2707     // We ignore this call if the user has specified a negative value.
2708     // The current setting won't be changed. The last valid setting will be
2709     // used. A warning will be issued (if warnings are allowed as controlled by
2710     // the KMP_WARNINGS env var).
2711     KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2712                   "max_active_levels for thread %d = (%d)\n",
2713                   gtid, max_active_levels));
2714     return;
2715   }
2716   if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
    // OK: max_active_levels is within the valid range
    // [0, KMP_MAX_ACTIVE_LEVELS_LIMIT].
    // A zero value is allowed (implementation-defined behavior).
2720   } else {
2721     KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2722                 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2723     max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
    // The current upper limit is MAX_INT (implementation-defined behavior).
    // If the input exceeds this limit, it is corrected to the limit
    // (implementation-defined behavior).
    // In practice, control should never reach here while the limit is MAX_INT.
2728   }
2729   KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2730                 "max_active_levels for thread %d = (%d)\n",
2731                 gtid, max_active_levels));
2732 
2733   thread = __kmp_threads[gtid];
2734 
2735   __kmp_save_internal_controls(thread);
2736 
2737   set__max_active_levels(thread, max_active_levels);
2738 }
2739 
2740 /* Gets max_active_levels */
2741 int __kmp_get_max_active_levels(int gtid) {
2742   kmp_info_t *thread;
2743 
2744   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2745   KMP_DEBUG_ASSERT(__kmp_init_serial);
2746 
2747   thread = __kmp_threads[gtid];
2748   KMP_DEBUG_ASSERT(thread->th.th_current_task);
2749   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2750                 "curtask_maxaclevel=%d\n",
2751                 gtid, thread->th.th_current_task,
2752                 thread->th.th_current_task->td_icvs.max_active_levels));
2753   return thread->th.th_current_task->td_icvs.max_active_levels;
2754 }
2755 
2756 // nteams-var per-device ICV
2757 void __kmp_set_num_teams(int num_teams) {
2758   if (num_teams > 0)
2759     __kmp_nteams = num_teams;
2760 }
2761 int __kmp_get_max_teams(void) { return __kmp_nteams; }
2762 // teams-thread-limit-var per-device ICV
2763 void __kmp_set_teams_thread_limit(int limit) {
2764   if (limit > 0)
2765     __kmp_teams_thread_limit = limit;
2766 }
2767 int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; }
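
// Hedged note: these per-device ICV helpers are assumed to back
// omp_set_num_teams()/omp_get_max_teams() and
// omp_set_teams_thread_limit()/omp_get_teams_thread_limit(). Non-positive
// inputs are ignored, so (illustrative only)
//
//   omp_set_num_teams(0); // leaves __kmp_nteams unchanged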
2768 
2769 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2770 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2771 
2772 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2773 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2774   kmp_info_t *thread;
2775   kmp_sched_t orig_kind;
2776   //    kmp_team_t *team;
2777 
2778   KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2779                 gtid, (int)kind, chunk));
2780   KMP_DEBUG_ASSERT(__kmp_init_serial);
2781 
2782   // Check if the kind parameter is valid, correct if needed.
2783   // Valid parameters should fit in one of two intervals - standard or extended:
2784   //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2785   // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
2786   orig_kind = kind;
2787   kind = __kmp_sched_without_mods(kind);
2788 
2789   if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2790       (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2791     // TODO: Hint needs attention in case we change the default schedule.
2792     __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2793               KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2794               __kmp_msg_null);
2795     kind = kmp_sched_default;
2796     chunk = 0; // ignore chunk value in case of bad kind
2797   }
2798 
2799   thread = __kmp_threads[gtid];
2800 
2801   __kmp_save_internal_controls(thread);
2802 
2803   if (kind < kmp_sched_upper_std) {
2804     if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
      // differentiate static chunked vs. unchunked: an invalid chunk value
      // indicates the unchunked schedule (which is the default)
2807       thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2808     } else {
2809       thread->th.th_current_task->td_icvs.sched.r_sched_type =
2810           __kmp_sch_map[kind - kmp_sched_lower - 1];
2811     }
2812   } else {
2813     //    __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2814     //    kmp_sched_lower - 2 ];
2815     thread->th.th_current_task->td_icvs.sched.r_sched_type =
2816         __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2817                       kmp_sched_lower - 2];
2818   }
2819   __kmp_sched_apply_mods_intkind(
2820       orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2821   if (kind == kmp_sched_auto || chunk < 1) {
2822     // ignore parameter chunk for schedule auto
2823     thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2824   } else {
2825     thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2826   }
2827 }
2828 
2829 /* Gets def_sched_var ICV values */
2830 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2831   kmp_info_t *thread;
2832   enum sched_type th_type;
2833 
2834   KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2835   KMP_DEBUG_ASSERT(__kmp_init_serial);
2836 
2837   thread = __kmp_threads[gtid];
2838 
2839   th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2840   switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2841   case kmp_sch_static:
2842   case kmp_sch_static_greedy:
2843   case kmp_sch_static_balanced:
2844     *kind = kmp_sched_static;
2845     __kmp_sched_apply_mods_stdkind(kind, th_type);
    *chunk = 0; // chunk was not set; indicate this with a zero value
2847     return;
2848   case kmp_sch_static_chunked:
2849     *kind = kmp_sched_static;
2850     break;
2851   case kmp_sch_dynamic_chunked:
2852     *kind = kmp_sched_dynamic;
2853     break;
2854   case kmp_sch_guided_chunked:
2855   case kmp_sch_guided_iterative_chunked:
2856   case kmp_sch_guided_analytical_chunked:
2857     *kind = kmp_sched_guided;
2858     break;
2859   case kmp_sch_auto:
2860     *kind = kmp_sched_auto;
2861     break;
2862   case kmp_sch_trapezoidal:
2863     *kind = kmp_sched_trapezoidal;
2864     break;
2865 #if KMP_STATIC_STEAL_ENABLED
2866   case kmp_sch_static_steal:
2867     *kind = kmp_sched_static_steal;
2868     break;
2869 #endif
2870   default:
2871     KMP_FATAL(UnknownSchedulingType, th_type);
2872   }
2873 
2874   __kmp_sched_apply_mods_stdkind(kind, th_type);
2875   *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2876 }
2877 
2878 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2879 
2880   int ii, dd;
2881   kmp_team_t *team;
2882   kmp_info_t *thr;
2883 
2884   KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2885   KMP_DEBUG_ASSERT(__kmp_init_serial);
2886 
2887   // validate level
2888   if (level == 0)
2889     return 0;
2890   if (level < 0)
2891     return -1;
2892   thr = __kmp_threads[gtid];
2893   team = thr->th.th_team;
2894   ii = team->t.t_level;
2895   if (level > ii)
2896     return -1;
2897 
2898   if (thr->th.th_teams_microtask) {
    // AC: we are in a teams region where multiple nested teams have the same
    // level
    int tlevel = thr->th.th_teams_level; // the level of the teams construct
    if (level <=
        tlevel) { // otherwise usual algorithm works (will not touch the teams)
      KMP_DEBUG_ASSERT(ii >= tlevel);
      // AC: since we must step past the teams league, we artificially
      // increase ii
      if (ii == tlevel) {
        ii += 2; // three teams have the same level
      } else {
        ii++; // two teams have the same level
2910       }
2911     }
2912   }
2913 
2914   if (ii == level)
2915     return __kmp_tid_from_gtid(gtid);
2916 
2917   dd = team->t.t_serialized;
2918   level++;
2919   while (ii > level) {
2920     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2921     }
2922     if ((team->t.t_serialized) && (!dd)) {
2923       team = team->t.t_parent;
2924       continue;
2925     }
2926     if (ii > level) {
2927       team = team->t.t_parent;
2928       dd = team->t.t_serialized;
2929       ii--;
2930     }
2931   }
2932 
2933   return (dd > 1) ? (0) : (team->t.t_master_tid);
2934 }
2935 
2936 int __kmp_get_team_size(int gtid, int level) {
2937 
2938   int ii, dd;
2939   kmp_team_t *team;
2940   kmp_info_t *thr;
2941 
2942   KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2943   KMP_DEBUG_ASSERT(__kmp_init_serial);
2944 
2945   // validate level
2946   if (level == 0)
2947     return 1;
2948   if (level < 0)
2949     return -1;
2950   thr = __kmp_threads[gtid];
2951   team = thr->th.th_team;
2952   ii = team->t.t_level;
2953   if (level > ii)
2954     return -1;
2955 
2956   if (thr->th.th_teams_microtask) {
    // AC: we are in a teams region where multiple nested teams have the same
    // level
    int tlevel = thr->th.th_teams_level; // the level of the teams construct
    if (level <=
        tlevel) { // otherwise usual algorithm works (will not touch the teams)
      KMP_DEBUG_ASSERT(ii >= tlevel);
      // AC: since we must step past the teams league, we artificially
      // increase ii
      if (ii == tlevel) {
        ii += 2; // three teams have the same level
      } else {
        ii++; // two teams have the same level
2968       }
2969     }
2970   }
2971 
2972   while (ii > level) {
2973     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2974     }
2975     if (team->t.t_serialized && (!dd)) {
2976       team = team->t.t_parent;
2977       continue;
2978     }
2979     if (ii > level) {
2980       team = team->t.t_parent;
2981       ii--;
2982     }
2983   }
2984 
2985   return team->t.t_nproc;
2986 }
2987 
2988 kmp_r_sched_t __kmp_get_schedule_global() {
  // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
  // independently, so the updated schedule can be obtained here.
2992 
2993   kmp_r_sched_t r_sched;
2994 
  // Create the schedule from four globals: __kmp_sched, __kmp_chunk,
  // __kmp_static, and __kmp_guided. __kmp_sched should keep its original
  // value, so that the user can set KMP_SCHEDULE multiple times and thus have
  // different run-time schedules in different roots (even in OMP 2.5).
2999   enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
3000   enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
3001   if (s == kmp_sch_static) {
3002     // replace STATIC with more detailed schedule (balanced or greedy)
3003     r_sched.r_sched_type = __kmp_static;
3004   } else if (s == kmp_sch_guided_chunked) {
3005     // replace GUIDED with more detailed schedule (iterative or analytical)
3006     r_sched.r_sched_type = __kmp_guided;
3007   } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
3008     r_sched.r_sched_type = __kmp_sched;
3009   }
3010   SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
3011 
3012   if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
    // __kmp_chunk may be wrong here (if it was never set)
3014     r_sched.chunk = KMP_DEFAULT_CHUNK;
3015   } else {
3016     r_sched.chunk = __kmp_chunk;
3017   }
3018 
3019   return r_sched;
3020 }
3021 
/* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
   at least argc *t_argv entries for the requested team. */
3024 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3025 
3026   KMP_DEBUG_ASSERT(team);
3027   if (!realloc || argc > team->t.t_max_argc) {
3028 
3029     KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3030                    "current entries=%d\n",
3031                    team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3032     /* if previously allocated heap space for args, free them */
3033     if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3034       __kmp_free((void *)team->t.t_argv);
3035 
3036     if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3037       /* use unused space in the cache line for arguments */
3038       team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3039       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3040                      "argv entries\n",
3041                      team->t.t_id, team->t.t_max_argc));
3042       team->t.t_argv = &team->t.t_inline_argv[0];
3043       if (__kmp_storage_map) {
3044         __kmp_print_storage_map_gtid(
3045             -1, &team->t.t_inline_argv[0],
3046             &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3047             (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3048             team->t.t_id);
3049       }
3050     } else {
3051       /* allocate space for arguments in the heap */
3052       team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3053                                ? KMP_MIN_MALLOC_ARGV_ENTRIES
3054                                : 2 * argc;
3055       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3056                      "argv entries\n",
3057                      team->t.t_id, team->t.t_max_argc));
3058       team->t.t_argv =
3059           (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3060       if (__kmp_storage_map) {
3061         __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3062                                      &team->t.t_argv[team->t.t_max_argc],
3063                                      sizeof(void *) * team->t.t_max_argc,
3064                                      "team_%d.t_argv", team->t.t_id);
3065       }
3066     }
3067   }
3068 }
3069 
3070 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3071   int i;
3072   int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3073   team->t.t_threads =
3074       (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3075   team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3076       sizeof(dispatch_shared_info_t) * num_disp_buff);
3077   team->t.t_dispatch =
3078       (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3079   team->t.t_implicit_task_taskdata =
3080       (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3081   team->t.t_max_nproc = max_nth;
3082 
3083   /* setup dispatch buffers */
3084   for (i = 0; i < num_disp_buff; ++i) {
3085     team->t.t_disp_buffer[i].buffer_index = i;
3086     team->t.t_disp_buffer[i].doacross_buf_idx = i;
3087   }
3088 }
3089 
3090 static void __kmp_free_team_arrays(kmp_team_t *team) {
3091   /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3092   int i;
3093   for (i = 0; i < team->t.t_max_nproc; ++i) {
3094     if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3095       __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3096       team->t.t_dispatch[i].th_disp_buffer = NULL;
3097     }
3098   }
3099 #if KMP_USE_HIER_SCHED
3100   __kmp_dispatch_free_hierarchies(team);
3101 #endif
3102   __kmp_free(team->t.t_threads);
3103   __kmp_free(team->t.t_disp_buffer);
3104   __kmp_free(team->t.t_dispatch);
3105   __kmp_free(team->t.t_implicit_task_taskdata);
3106   team->t.t_threads = NULL;
3107   team->t.t_disp_buffer = NULL;
3108   team->t.t_dispatch = NULL;
  team->t.t_implicit_task_taskdata = NULL;
3110 }
3111 
3112 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3113   kmp_info_t **oldThreads = team->t.t_threads;
3114 
3115   __kmp_free(team->t.t_disp_buffer);
3116   __kmp_free(team->t.t_dispatch);
3117   __kmp_free(team->t.t_implicit_task_taskdata);
3118   __kmp_allocate_team_arrays(team, max_nth);
3119 
3120   KMP_MEMCPY(team->t.t_threads, oldThreads,
3121              team->t.t_nproc * sizeof(kmp_info_t *));
3122 
3123   __kmp_free(oldThreads);
3124 }
3125 
3126 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3127 
3128   kmp_r_sched_t r_sched =
3129       __kmp_get_schedule_global(); // get current state of scheduling globals
3130 
3131   KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3132 
3133   kmp_internal_control_t g_icvs = {
3134     0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3135     (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3136     // adjustment of threads (per thread)
3137     (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3138     // whether blocktime is explicitly set
3139     __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3140 #if KMP_USE_MONITOR
3141     __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3142 // intervals
3143 #endif
3144     __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3145     // next parallel region (per thread)
3146     // (use a max ub on value if __kmp_parallel_initialize not called yet)
3147     __kmp_cg_max_nth, // int thread_limit;
3148     __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3149     // for max_active_levels
3150     r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3151     // {sched,chunk} pair
3152     __kmp_nested_proc_bind.bind_types[0],
3153     __kmp_default_device,
3154     NULL // struct kmp_internal_control *next;
3155   };
3156 
3157   return g_icvs;
3158 }
3159 
3160 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3161 
3162   kmp_internal_control_t gx_icvs;
  gx_icvs.serial_nesting_level =
      0; // probably = team->t.t_serialized, like in save_internal_controls
3165   copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3166   gx_icvs.next = NULL;
3167 
3168   return gx_icvs;
3169 }
3170 
3171 static void __kmp_initialize_root(kmp_root_t *root) {
3172   int f;
3173   kmp_team_t *root_team;
3174   kmp_team_t *hot_team;
3175   int hot_team_max_nth;
3176   kmp_r_sched_t r_sched =
3177       __kmp_get_schedule_global(); // get current state of scheduling globals
3178   kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3179   KMP_DEBUG_ASSERT(root);
3180   KMP_ASSERT(!root->r.r_begin);
3181 
3182   /* setup the root state structure */
3183   __kmp_init_lock(&root->r.r_begin_lock);
3184   root->r.r_begin = FALSE;
3185   root->r.r_active = FALSE;
3186   root->r.r_in_parallel = 0;
3187   root->r.r_blocktime = __kmp_dflt_blocktime;
3188 
3189   /* setup the root team for this task */
3190   /* allocate the root team structure */
3191   KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3192 
3193   root_team =
3194       __kmp_allocate_team(root,
3195                           1, // new_nproc
3196                           1, // max_nproc
3197 #if OMPT_SUPPORT
3198                           ompt_data_none, // root parallel id
3199 #endif
3200                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3201                           0 // argc
3202                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3203                           );
3204 #if USE_DEBUGGER
3205   // Non-NULL value should be assigned to make the debugger display the root
3206   // team.
3207   TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3208 #endif
3209 
3210   KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3211 
3212   root->r.r_root_team = root_team;
3213   root_team->t.t_control_stack_top = NULL;
3214 
3215   /* initialize root team */
3216   root_team->t.t_threads[0] = NULL;
3217   root_team->t.t_nproc = 1;
3218   root_team->t.t_serialized = 1;
3219   // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3220   root_team->t.t_sched.sched = r_sched.sched;
3221   KA_TRACE(
3222       20,
3223       ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3224        root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3225 
3226   /* setup the  hot team for this task */
3227   /* allocate the hot team structure */
3228   KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3229 
3230   hot_team =
3231       __kmp_allocate_team(root,
3232                           1, // new_nproc
3233                           __kmp_dflt_team_nth_ub * 2, // max_nproc
3234 #if OMPT_SUPPORT
3235                           ompt_data_none, // root parallel id
3236 #endif
3237                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3238                           0 // argc
3239                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3240                           );
3241   KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3242 
3243   root->r.r_hot_team = hot_team;
3244   root_team->t.t_control_stack_top = NULL;
3245 
3246   /* first-time initialization */
3247   hot_team->t.t_parent = root_team;
3248 
3249   /* initialize hot team */
3250   hot_team_max_nth = hot_team->t.t_max_nproc;
3251   for (f = 0; f < hot_team_max_nth; ++f) {
3252     hot_team->t.t_threads[f] = NULL;
3253   }
3254   hot_team->t.t_nproc = 1;
3255   // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3256   hot_team->t.t_sched.sched = r_sched.sched;
3257   hot_team->t.t_size_changed = 0;
3258 }
3259 
3260 #ifdef KMP_DEBUG
3261 
3262 typedef struct kmp_team_list_item {
3263   kmp_team_p const *entry;
3264   struct kmp_team_list_item *next;
3265 } kmp_team_list_item_t;
3266 typedef kmp_team_list_item_t *kmp_team_list_t;
3267 
3268 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3269     kmp_team_list_t list, // List of teams.
3270     kmp_team_p const *team // Team to add.
3271     ) {
3272 
3273   // List must terminate with item where both entry and next are NULL.
3274   // Team is added to the list only once.
3275   // List is sorted in ascending order by team id.
3276   // Team id is *not* a key.
3277 
3278   kmp_team_list_t l;
3279 
3280   KMP_DEBUG_ASSERT(list != NULL);
3281   if (team == NULL) {
3282     return;
3283   }
3284 
3285   __kmp_print_structure_team_accum(list, team->t.t_parent);
3286   __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3287 
3288   // Search list for the team.
3289   l = list;
3290   while (l->next != NULL && l->entry != team) {
3291     l = l->next;
3292   }
3293   if (l->next != NULL) {
3294     return; // Team has been added before, exit.
3295   }
3296 
3297   // Team is not found. Search list again for insertion point.
3298   l = list;
3299   while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3300     l = l->next;
3301   }
3302 
3303   // Insert team.
3304   {
3305     kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3306         sizeof(kmp_team_list_item_t));
3307     *item = *l;
3308     l->entry = team;
3309     l->next = item;
3310   }
3311 }
3312 
static void __kmp_print_structure_team(char const *title,
                                       kmp_team_p const *team) {
3316   __kmp_printf("%s", title);
3317   if (team != NULL) {
3318     __kmp_printf("%2x %p\n", team->t.t_id, team);
3319   } else {
3320     __kmp_printf(" - (nil)\n");
3321   }
3322 }
3323 
3324 static void __kmp_print_structure_thread(char const *title,
3325                                          kmp_info_p const *thread) {
3326   __kmp_printf("%s", title);
3327   if (thread != NULL) {
3328     __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3329   } else {
3330     __kmp_printf(" - (nil)\n");
3331   }
3332 }
3333 
3334 void __kmp_print_structure(void) {
3335 
3336   kmp_team_list_t list;
3337 
3338   // Initialize list of teams.
3339   list =
3340       (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3341   list->entry = NULL;
3342   list->next = NULL;
3343 
3344   __kmp_printf("\n------------------------------\nGlobal Thread "
3345                "Table\n------------------------------\n");
3346   {
3347     int gtid;
3348     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3349       __kmp_printf("%2d", gtid);
3350       if (__kmp_threads != NULL) {
3351         __kmp_printf(" %p", __kmp_threads[gtid]);
3352       }
3353       if (__kmp_root != NULL) {
3354         __kmp_printf(" %p", __kmp_root[gtid]);
3355       }
3356       __kmp_printf("\n");
3357     }
3358   }
3359 
3360   // Print out __kmp_threads array.
3361   __kmp_printf("\n------------------------------\nThreads\n--------------------"
3362                "----------\n");
3363   if (__kmp_threads != NULL) {
3364     int gtid;
3365     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3366       kmp_info_t const *thread = __kmp_threads[gtid];
3367       if (thread != NULL) {
3368         __kmp_printf("GTID %2d %p:\n", gtid, thread);
3369         __kmp_printf("    Our Root:        %p\n", thread->th.th_root);
3370         __kmp_print_structure_team("    Our Team:     ", thread->th.th_team);
3371         __kmp_print_structure_team("    Serial Team:  ",
3372                                    thread->th.th_serial_team);
3373         __kmp_printf("    Threads:      %2d\n", thread->th.th_team_nproc);
3374         __kmp_print_structure_thread("    Master:       ",
3375                                      thread->th.th_team_master);
3376         __kmp_printf("    Serialized?:  %2d\n", thread->th.th_team_serialized);
3377         __kmp_printf("    Set NProc:    %2d\n", thread->th.th_set_nproc);
3378         __kmp_printf("    Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3379         __kmp_print_structure_thread("    Next in pool: ",
3380                                      thread->th.th_next_pool);
3381         __kmp_printf("\n");
3382         __kmp_print_structure_team_accum(list, thread->th.th_team);
3383         __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3384       }
3385     }
3386   } else {
3387     __kmp_printf("Threads array is not allocated.\n");
3388   }
3389 
3390   // Print out __kmp_root array.
3391   __kmp_printf("\n------------------------------\nUbers\n----------------------"
3392                "--------\n");
3393   if (__kmp_root != NULL) {
3394     int gtid;
3395     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3396       kmp_root_t const *root = __kmp_root[gtid];
3397       if (root != NULL) {
3398         __kmp_printf("GTID %2d %p:\n", gtid, root);
3399         __kmp_print_structure_team("    Root Team:    ", root->r.r_root_team);
3400         __kmp_print_structure_team("    Hot Team:     ", root->r.r_hot_team);
3401         __kmp_print_structure_thread("    Uber Thread:  ",
3402                                      root->r.r_uber_thread);
3403         __kmp_printf("    Active?:      %2d\n", root->r.r_active);
3404         __kmp_printf("    In Parallel:  %2d\n",
3405                      KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3406         __kmp_printf("\n");
3407         __kmp_print_structure_team_accum(list, root->r.r_root_team);
3408         __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3409       }
3410     }
3411   } else {
3412     __kmp_printf("Ubers array is not allocated.\n");
3413   }
3414 
3415   __kmp_printf("\n------------------------------\nTeams\n----------------------"
3416                "--------\n");
3417   while (list->next != NULL) {
3418     kmp_team_p const *team = list->entry;
3419     int i;
3420     __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3421     __kmp_print_structure_team("    Parent Team:      ", team->t.t_parent);
3422     __kmp_printf("    Master TID:       %2d\n", team->t.t_master_tid);
3423     __kmp_printf("    Max threads:      %2d\n", team->t.t_max_nproc);
3424     __kmp_printf("    Levels of serial: %2d\n", team->t.t_serialized);
3425     __kmp_printf("    Number threads:   %2d\n", team->t.t_nproc);
3426     for (i = 0; i < team->t.t_nproc; ++i) {
3427       __kmp_printf("    Thread %2d:      ", i);
3428       __kmp_print_structure_thread("", team->t.t_threads[i]);
3429     }
3430     __kmp_print_structure_team("    Next in pool:     ", team->t.t_next_pool);
3431     __kmp_printf("\n");
3432     list = list->next;
3433   }
3434 
3435   // Print out __kmp_thread_pool and __kmp_team_pool.
3436   __kmp_printf("\n------------------------------\nPools\n----------------------"
3437                "--------\n");
3438   __kmp_print_structure_thread("Thread pool:          ",
3439                                CCAST(kmp_info_t *, __kmp_thread_pool));
3440   __kmp_print_structure_team("Team pool:            ",
3441                              CCAST(kmp_team_t *, __kmp_team_pool));
3442   __kmp_printf("\n");
3443 
3444   // Free team list.
3445   while (list != NULL) {
3446     kmp_team_list_item_t *item = list;
3447     list = list->next;
3448     KMP_INTERNAL_FREE(item);
3449   }
3450 }
3451 
3452 #endif
3453 
3454 //---------------------------------------------------------------------------
//  Support for the per-thread fast random number generator
3456 //  Table of primes
3457 static const unsigned __kmp_primes[] = {
3458     0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3459     0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3460     0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3461     0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3462     0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3463     0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3464     0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3465     0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3466     0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3467     0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3468     0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3469 
3470 //---------------------------------------------------------------------------
3471 //  __kmp_get_random: Get a random number using a linear congruential method.
3472 unsigned short __kmp_get_random(kmp_info_t *thread) {
3473   unsigned x = thread->th.th_x;
3474   unsigned short r = (unsigned short)(x >> 16);
3475 
3476   thread->th.th_x = x * thread->th.th_a + 1;
3477 
3478   KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3479                 thread->th.th_info.ds.ds_tid, r));
3480 
3481   return r;
3482 }
3483 //--------------------------------------------------------
3484 // __kmp_init_random: Initialize a random number generator
3485 void __kmp_init_random(kmp_info_t *thread) {
3486   unsigned seed = thread->th.th_info.ds.ds_tid;
3487 
3488   thread->th.th_a =
3489       __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3490   thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3491   KA_TRACE(30,
3492            ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3493 }
3494 
3495 #if KMP_OS_WINDOWS
3496 /* reclaim array entries for root threads that are already dead, returns number
3497  * reclaimed */
3498 static int __kmp_reclaim_dead_roots(void) {
3499   int i, r = 0;
3500 
3501   for (i = 0; i < __kmp_threads_capacity; ++i) {
3502     if (KMP_UBER_GTID(i) &&
3503         !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3504         !__kmp_root[i]
3505              ->r.r_active) { // AC: reclaim only roots died in non-active state
3506       r += __kmp_unregister_root_other_thread(i);
3507     }
3508   }
3509   return r;
3510 }
3511 #endif
3512 
3513 /* This function attempts to create free entries in __kmp_threads and
3514    __kmp_root, and returns the number of free entries generated.
3515 
3516    For Windows* OS static library, the first mechanism used is to reclaim array
3517    entries for root threads that are already dead.
3518 
   On all platforms, expansion is attempted on the arrays __kmp_threads and
3520    __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3521    capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3522    threadprivate cache array has been created. Synchronization with
3523    __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3524 
3525    After any dead root reclamation, if the clipping value allows array expansion
3526    to result in the generation of a total of nNeed free slots, the function does
3527    that expansion. If not, nothing is done beyond the possible initial root
3528    thread reclamation.
3529 
3530    If any argument is negative, the behavior is undefined. */
3531 static int __kmp_expand_threads(int nNeed) {
3532   int added = 0;
3533   int minimumRequiredCapacity;
3534   int newCapacity;
3535   kmp_info_t **newThreads;
3536   kmp_root_t **newRoot;
3537 
3538 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3539 // resizing __kmp_threads does not need additional protection if foreign
3540 // threads are present
3541 
3542 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3543   /* only for Windows static library */
3544   /* reclaim array entries for root threads that are already dead */
3545   added = __kmp_reclaim_dead_roots();
3546 
3547   if (nNeed) {
3548     nNeed -= added;
3549     if (nNeed < 0)
3550       nNeed = 0;
3551   }
3552 #endif
3553   if (nNeed <= 0)
3554     return added;
3555 
3556   // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3557   // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3558   // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3559   // > __kmp_max_nth in one of two ways:
3560   //
3561   // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
3562   //    may not be reused by another thread, so we may need to increase
3563   //    __kmp_threads_capacity to __kmp_max_nth + 1.
3564   //
3565   // 2) New foreign root(s) are encountered.  We always register new foreign
3566   //    roots. This may cause a smaller # of threads to be allocated at
3567   //    subsequent parallel regions, but the worker threads hang around (and
3568   //    eventually go to sleep) and need slots in the __kmp_threads[] array.
3569   //
3570   // Anyway, that is the reason for moving the check to see if
3571   // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3572   // instead of having it performed here. -BB
3573 
3574   KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3575 
3576   /* compute expansion headroom to check if we can expand */
3577   if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3578     /* possible expansion too small -- give up */
3579     return added;
3580   }
3581   minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3582 
3583   newCapacity = __kmp_threads_capacity;
3584   do {
3585     newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3586                                                           : __kmp_sys_max_nth;
3587   } while (newCapacity < minimumRequiredCapacity);
3588   newThreads = (kmp_info_t **)__kmp_allocate(
3589       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3590   newRoot =
3591       (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3592   KMP_MEMCPY(newThreads, __kmp_threads,
3593              __kmp_threads_capacity * sizeof(kmp_info_t *));
3594   KMP_MEMCPY(newRoot, __kmp_root,
3595              __kmp_threads_capacity * sizeof(kmp_root_t *));
3596 
3597   kmp_info_t **temp_threads = __kmp_threads;
3598   *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3599   *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3600   __kmp_free(temp_threads);
3601   added += newCapacity - __kmp_threads_capacity;
3602   *(volatile int *)&__kmp_threads_capacity = newCapacity;
3603 
3604   if (newCapacity > __kmp_tp_capacity) {
3605     __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3606     if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3607       __kmp_threadprivate_resize_cache(newCapacity);
3608     } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3609       *(volatile int *)&__kmp_tp_capacity = newCapacity;
3610     }
3611     __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3612   }
3613 
3614   return added;
3615 }
3616 
3617 /* Register the current thread as a root thread and obtain our gtid. We must
   have the __kmp_initz_lock held at this point. Argument TRUE only if we are
   the thread that calls from __kmp_do_serial_initialize() */
3620 int __kmp_register_root(int initial_thread) {
3621   kmp_info_t *root_thread;
3622   kmp_root_t *root;
3623   int gtid;
3624   int capacity;
3625   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3626   KA_TRACE(20, ("__kmp_register_root: entered\n"));
3627   KMP_MB();
3628 
3629   /* 2007-03-02:
     If the initial thread has not invoked the OpenMP RTL yet, and this thread
     is not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity"
     condition does not work as expected -- it may return false (meaning there
     is at least one empty slot in the __kmp_threads array), but it is
     possible that the only free slot is #0, which is reserved for the initial
     thread and so cannot be used for this one. The following code works
     around this bug.

     However, the right solution seems to be not reserving slot #0 for the
     initial thread, because:
     (1) there is no magic in slot #0,
     (2) we cannot detect the initial thread reliably (the first thread that
        performs serial initialization may not be a real initial thread).
3642   */
3643   capacity = __kmp_threads_capacity;
3644   if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3645     --capacity;
3646   }
3647 
3648   /* see if there are too many threads */
3649   if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3650     if (__kmp_tp_cached) {
3651       __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3652                   KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3653                   KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3654     } else {
3655       __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3656                   __kmp_msg_null);
3657     }
3658   }
3659 
3660   // When hidden helper task is enabled, __kmp_threads is organized as follows:
3661   // 0: initial thread, also a regular OpenMP thread.
3662   // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3663   // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3664   // regular OpenMP threads.
3665   if (TCR_4(__kmp_init_hidden_helper_threads)) {
    // Find an available thread slot for a hidden helper thread. Hidden helper
    // threads occupy slots 1 through __kmp_hidden_helper_threads_num.
3668     for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3669                    gtid <= __kmp_hidden_helper_threads_num;
3670          gtid++)
3671       ;
3672     KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3673     KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3674                  "hidden helper thread: T#%d\n",
3675                  gtid));
3676   } else {
3677     /* find an available thread slot */
    // Don't reassign the zero slot since we need that to be used only by the
    // initial thread. Slots for hidden helper threads should also be skipped.
3680     if (initial_thread && __kmp_threads[0] == NULL) {
3681       gtid = 0;
3682     } else {
3683       for (gtid = __kmp_hidden_helper_threads_num + 1;
3684            TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3685         ;
3686     }
3687     KA_TRACE(
3688         1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3689     KMP_ASSERT(gtid < __kmp_threads_capacity);
3690   }
3691 
3692   /* update global accounting */
3693   __kmp_all_nth++;
3694   TCW_4(__kmp_nth, __kmp_nth + 1);
3695 
3696   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3697   // numbers of procs, and method #2 (keyed API call) for higher numbers.
3698   if (__kmp_adjust_gtid_mode) {
3699     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3700       if (TCR_4(__kmp_gtid_mode) != 2) {
3701         TCW_4(__kmp_gtid_mode, 2);
3702       }
3703     } else {
3704       if (TCR_4(__kmp_gtid_mode) != 1) {
3705         TCW_4(__kmp_gtid_mode, 1);
3706       }
3707     }
3708   }
3709 
3710 #ifdef KMP_ADJUST_BLOCKTIME
3711   /* Adjust blocktime to zero if necessary            */
3712   /* Middle initialization might not have occurred yet */
3713   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3714     if (__kmp_nth > __kmp_avail_proc) {
3715       __kmp_zero_bt = TRUE;
3716     }
3717   }
3718 #endif /* KMP_ADJUST_BLOCKTIME */
3719 
3720   /* setup this new hierarchy */
3721   if (!(root = __kmp_root[gtid])) {
3722     root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3723     KMP_DEBUG_ASSERT(!root->r.r_root_team);
3724   }
3725 
3726 #if KMP_STATS_ENABLED
3727   // Initialize stats as soon as possible (right after gtid assignment).
3728   __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3729   __kmp_stats_thread_ptr->startLife();
3730   KMP_SET_THREAD_STATE(SERIAL_REGION);
3731   KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3732 #endif
3733   __kmp_initialize_root(root);
3734 
3735   /* setup new root thread structure */
3736   if (root->r.r_uber_thread) {
3737     root_thread = root->r.r_uber_thread;
3738   } else {
3739     root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3740     if (__kmp_storage_map) {
3741       __kmp_print_thread_storage_map(root_thread, gtid);
3742     }
3743     root_thread->th.th_info.ds.ds_gtid = gtid;
3744 #if OMPT_SUPPORT
3745     root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3746 #endif
3747     root_thread->th.th_root = root;
3748     if (__kmp_env_consistency_check) {
3749       root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3750     }
3751 #if USE_FAST_MEMORY
3752     __kmp_initialize_fast_memory(root_thread);
3753 #endif /* USE_FAST_MEMORY */
3754 
3755 #if KMP_USE_BGET
3756     KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3757     __kmp_initialize_bget(root_thread);
3758 #endif
3759     __kmp_init_random(root_thread); // Initialize random number generator
3760   }
3761 
3762   /* setup the serial team held in reserve by the root thread */
3763   if (!root_thread->th.th_serial_team) {
3764     kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3765     KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3766     root_thread->th.th_serial_team = __kmp_allocate_team(
3767         root, 1, 1,
3768 #if OMPT_SUPPORT
3769         ompt_data_none, // root parallel id
3770 #endif
3771         proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3772   }
3773   KMP_ASSERT(root_thread->th.th_serial_team);
3774   KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3775                 root_thread->th.th_serial_team));
3776 
3777   /* drop root_thread into place */
3778   TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3779 
3780   root->r.r_root_team->t.t_threads[0] = root_thread;
3781   root->r.r_hot_team->t.t_threads[0] = root_thread;
3782   root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
  // AC: the team is created in reserve, not for execution (unused for now).
3784   root_thread->th.th_serial_team->t.t_serialized = 0;
3785   root->r.r_uber_thread = root_thread;
3786 
3787   /* initialize the thread, get it ready to go */
3788   __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3789   TCW_4(__kmp_init_gtid, TRUE);
3790 
3791   /* prepare the master thread for get_gtid() */
3792   __kmp_gtid_set_specific(gtid);
3793 
3794 #if USE_ITT_BUILD
3795   __kmp_itt_thread_name(gtid);
3796 #endif /* USE_ITT_BUILD */
3797 
3798 #ifdef KMP_TDATA_GTID
3799   __kmp_gtid = gtid;
3800 #endif
3801   __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3802   KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3803 
3804   KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3805                 "plain=%u\n",
3806                 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3807                 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3808                 KMP_INIT_BARRIER_STATE));
3809   { // Initialize barrier data.
3810     int b;
3811     for (b = 0; b < bs_last_barrier; ++b) {
3812       root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3813 #if USE_DEBUGGER
3814       root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3815 #endif
3816     }
3817   }
3818   KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3819                    KMP_INIT_BARRIER_STATE);
3820 
3821 #if KMP_AFFINITY_SUPPORTED
3822   root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3823   root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3824   root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3825   root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3826   if (TCR_4(__kmp_init_middle)) {
3827     __kmp_affinity_set_init_mask(gtid, TRUE);
3828   }
3829 #endif /* KMP_AFFINITY_SUPPORTED */
3830   root_thread->th.th_def_allocator = __kmp_def_allocator;
3831   root_thread->th.th_prev_level = 0;
3832   root_thread->th.th_prev_num_threads = 1;
3833 
3834   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3835   tmp->cg_root = root_thread;
3836   tmp->cg_thread_limit = __kmp_cg_max_nth;
3837   tmp->cg_nthreads = 1;
3838   KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3839                  " cg_nthreads init to 1\n",
3840                  root_thread, tmp));
3841   tmp->up = NULL;
3842   root_thread->th.th_cg_roots = tmp;
3843 
3844   __kmp_root_counter++;
3845 
3846 #if OMPT_SUPPORT
3847   if (!initial_thread && ompt_enabled.enabled) {
3848 
3849     kmp_info_t *root_thread = ompt_get_thread();
3850 
3851     ompt_set_thread_state(root_thread, ompt_state_overhead);
3852 
3853     if (ompt_enabled.ompt_callback_thread_begin) {
3854       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3855           ompt_thread_initial, __ompt_get_thread_data_internal());
3856     }
3857     ompt_data_t *task_data;
3858     ompt_data_t *parallel_data;
3859     __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
3860     if (ompt_enabled.ompt_callback_implicit_task) {
3861       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3862           ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
3863     }
3864 
3865     ompt_set_thread_state(root_thread, ompt_state_work_serial);
3866   }
3867 #endif
3868 
3869   KMP_MB();
3870   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3871 
3872   return gtid;
3873 }
3874 
3875 #if KMP_NESTED_HOT_TEAMS
3876 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3877                                 const int max_level) {
3878   int i, n, nth;
3879   kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3880   if (!hot_teams || !hot_teams[level].hot_team) {
3881     return 0;
3882   }
3883   KMP_DEBUG_ASSERT(level < max_level);
3884   kmp_team_t *team = hot_teams[level].hot_team;
3885   nth = hot_teams[level].hot_team_nth;
3886   n = nth - 1; // master is not freed
3887   if (level < max_level - 1) {
3888     for (i = 0; i < nth; ++i) {
3889       kmp_info_t *th = team->t.t_threads[i];
3890       n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3891       if (i > 0 && th->th.th_hot_teams) {
3892         __kmp_free(th->th.th_hot_teams);
3893         th->th.th_hot_teams = NULL;
3894       }
3895     }
3896   }
3897   __kmp_free_team(root, team, NULL);
3898   return n;
3899 }
3900 #endif
3901 
// Resets a root thread and clears its root and hot teams.
3903 // Returns the number of __kmp_threads entries directly and indirectly freed.
3904 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3905   kmp_team_t *root_team = root->r.r_root_team;
3906   kmp_team_t *hot_team = root->r.r_hot_team;
3907   int n = hot_team->t.t_nproc;
3908   int i;
3909 
3910   KMP_DEBUG_ASSERT(!root->r.r_active);
3911 
3912   root->r.r_root_team = NULL;
3913   root->r.r_hot_team = NULL;
  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
  // before the call to __kmp_free_team().
3916   __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3917 #if KMP_NESTED_HOT_TEAMS
3918   if (__kmp_hot_teams_max_level >
3919       0) { // need to free nested hot teams and their threads if any
3920     for (i = 0; i < hot_team->t.t_nproc; ++i) {
3921       kmp_info_t *th = hot_team->t.t_threads[i];
3922       if (__kmp_hot_teams_max_level > 1) {
3923         n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3924       }
3925       if (th->th.th_hot_teams) {
3926         __kmp_free(th->th.th_hot_teams);
3927         th->th.th_hot_teams = NULL;
3928       }
3929     }
3930   }
3931 #endif
3932   __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3933 
3934   // Before we can reap the thread, we need to make certain that all other
3935   // threads in the teams that had this root as ancestor have stopped trying to
3936   // steal tasks.
3937   if (__kmp_tasking_mode != tskm_immediate_exec) {
3938     __kmp_wait_to_unref_task_teams();
3939   }
3940 
3941 #if KMP_OS_WINDOWS
3942   /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3943   KA_TRACE(
3944       10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3945            "\n",
3946            (LPVOID) & (root->r.r_uber_thread->th),
3947            root->r.r_uber_thread->th.th_info.ds.ds_thread));
3948   __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3949 #endif /* KMP_OS_WINDOWS */
3950 
3951 #if OMPT_SUPPORT
3952   ompt_data_t *task_data;
3953   ompt_data_t *parallel_data;
3954   __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
3955   if (ompt_enabled.ompt_callback_implicit_task) {
3956     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3957         ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
3958   }
3959   if (ompt_enabled.ompt_callback_thread_end) {
3960     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3961         &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3962   }
3963 #endif
3964 
3965   TCW_4(__kmp_nth,
3966         __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3967   i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
3968   KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
3969                  " to %d\n",
3970                  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
3971                  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
3972   if (i == 1) {
3973     // need to free contention group structure
3974     KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
3975                      root->r.r_uber_thread->th.th_cg_roots->cg_root);
3976     KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
3977     __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
3978     root->r.r_uber_thread->th.th_cg_roots = NULL;
3979   }
3980   __kmp_reap_thread(root->r.r_uber_thread, 1);
3981 
  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
  // it instead of freeing it.
3984   root->r.r_uber_thread = NULL;
3985   /* mark root as no longer in use */
3986   root->r.r_begin = FALSE;
3987 
3988   return n;
3989 }
3990 
3991 void __kmp_unregister_root_current_thread(int gtid) {
3992   KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
  /* This lock should be OK, since unregister_root_current_thread is never
     called during an abort, only during a normal close. Furthermore, if you
     hold the forkjoin lock, you should never try to get the initz lock. */
3996   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3997   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
3998     KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
3999                   "exiting T#%d\n",
4000                   gtid));
4001     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4002     return;
4003   }
4004   kmp_root_t *root = __kmp_root[gtid];
4005 
4006   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4007   KMP_ASSERT(KMP_UBER_GTID(gtid));
4008   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4009   KMP_ASSERT(root->r.r_active == FALSE);
4010 
4011   KMP_MB();
4012 
4013   kmp_info_t *thread = __kmp_threads[gtid];
4014   kmp_team_t *team = thread->th.th_team;
4015   kmp_task_team_t *task_team = thread->th.th_task_team;
4016 
4017   // we need to wait for the proxy tasks before finishing the thread
4018   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
4019 #if OMPT_SUPPORT
4020     // the runtime is shutting down so we won't report any events
4021     thread->th.ompt_thread_info.state = ompt_state_undefined;
4022 #endif
4023     __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4024   }
4025 
4026   __kmp_reset_root(gtid, root);
4027 
4028   KMP_MB();
4029   KC_TRACE(10,
4030            ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4031 
4032   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4033 }
4034 
4035 #if KMP_OS_WINDOWS
/* __kmp_forkjoin_lock must already be held.
   Unregisters a root thread that is not the current thread. Returns the number
   of __kmp_threads entries freed as a result. */
4039 static int __kmp_unregister_root_other_thread(int gtid) {
4040   kmp_root_t *root = __kmp_root[gtid];
4041   int r;
4042 
4043   KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4044   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4045   KMP_ASSERT(KMP_UBER_GTID(gtid));
4046   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4047   KMP_ASSERT(root->r.r_active == FALSE);
4048 
4049   r = __kmp_reset_root(gtid, root);
4050   KC_TRACE(10,
4051            ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4052   return r;
4053 }
4054 #endif
4055 
4056 #if KMP_DEBUG
4057 void __kmp_task_info() {
4058 
4059   kmp_int32 gtid = __kmp_entry_gtid();
4060   kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4061   kmp_info_t *this_thr = __kmp_threads[gtid];
4062   kmp_team_t *steam = this_thr->th.th_serial_team;
4063   kmp_team_t *team = this_thr->th.th_team;
4064 
4065   __kmp_printf(
4066       "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4067       "ptask=%p\n",
4068       gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4069       team->t.t_implicit_task_taskdata[tid].td_parent);
4070 }
4071 #endif // KMP_DEBUG
4072 
4073 /* TODO optimize with one big memclr, take out what isn't needed, split
4074    responsibility to workers as much as possible, and delay initialization of
4075    features as much as possible  */
4076 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4077                                   int tid, int gtid) {
4078   /* this_thr->th.th_info.ds.ds_gtid is setup in
4079      kmp_allocate_thread/create_worker.
4080      this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4081   kmp_info_t *master = team->t.t_threads[0];
4082   KMP_DEBUG_ASSERT(this_thr != NULL);
4083   KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4084   KMP_DEBUG_ASSERT(team);
4085   KMP_DEBUG_ASSERT(team->t.t_threads);
4086   KMP_DEBUG_ASSERT(team->t.t_dispatch);
4087   KMP_DEBUG_ASSERT(master);
4088   KMP_DEBUG_ASSERT(master->th.th_root);
4089 
4090   KMP_MB();
4091 
4092   TCW_SYNC_PTR(this_thr->th.th_team, team);
4093 
4094   this_thr->th.th_info.ds.ds_tid = tid;
4095   this_thr->th.th_set_nproc = 0;
4096   if (__kmp_tasking_mode != tskm_immediate_exec)
4097     // When tasking is possible, threads are not safe to reap until they are
4098     // done tasking; this will be set when tasking code is exited in wait
4099     this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4100   else // no tasking --> always safe to reap
4101     this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4102   this_thr->th.th_set_proc_bind = proc_bind_default;
4103 #if KMP_AFFINITY_SUPPORTED
4104   this_thr->th.th_new_place = this_thr->th.th_current_place;
4105 #endif
4106   this_thr->th.th_root = master->th.th_root;
4107 
4108   /* setup the thread's cache of the team structure */
4109   this_thr->th.th_team_nproc = team->t.t_nproc;
4110   this_thr->th.th_team_master = master;
4111   this_thr->th.th_team_serialized = team->t.t_serialized;
4112   TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4113 
4114   KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4115 
4116   KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4117                 tid, gtid, this_thr, this_thr->th.th_current_task));
4118 
4119   __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4120                            team, tid, TRUE);
4121 
4122   KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4123                 tid, gtid, this_thr, this_thr->th.th_current_task));
4124   // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4125   // __kmp_initialize_team()?
4126 
4127   /* TODO no worksharing in speculative threads */
4128   this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4129 
4130   this_thr->th.th_local.this_construct = 0;
4131 
4132   if (!this_thr->th.th_pri_common) {
4133     this_thr->th.th_pri_common =
4134         (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4135     if (__kmp_storage_map) {
4136       __kmp_print_storage_map_gtid(
4137           gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4138           sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4139     }
4140     this_thr->th.th_pri_head = NULL;
4141   }
4142 
4143   if (this_thr != master && // Master's CG root is initialized elsewhere
4144       this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4145     // Make new thread's CG root same as master's
4146     KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4147     kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4148     if (tmp) {
4149       // worker changes CG, need to check if old CG should be freed
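           // Note: cg_nthreads is post-decremented, so i receives the previous
           // count; i == 1 therefore means this thread was the last member of
           // the old CG and the node can be freed.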
4150       int i = tmp->cg_nthreads--;
4151       KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4152                      " on node %p of thread %p to %d\n",
4153                      this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4154       if (i == 1) {
4155         __kmp_free(tmp); // last thread left CG --> free it
4156       }
4157     }
4158     this_thr->th.th_cg_roots = master->th.th_cg_roots;
4159     // Increment new thread's CG root's counter to add the new thread
4160     this_thr->th.th_cg_roots->cg_nthreads++;
4161     KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4162                    " node %p of thread %p to %d\n",
4163                    this_thr, this_thr->th.th_cg_roots,
4164                    this_thr->th.th_cg_roots->cg_root,
4165                    this_thr->th.th_cg_roots->cg_nthreads));
4166     this_thr->th.th_current_task->td_icvs.thread_limit =
4167         this_thr->th.th_cg_roots->cg_thread_limit;
4168   }
4169 
4170   /* Initialize dynamic dispatch */
4171   {
4172     volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4173     // Use team max_nproc since this will never change for the team.
4174     size_t disp_size =
4175         sizeof(dispatch_private_info_t) *
4176         (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
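         // A serialized team (t_max_nproc == 1) only ever needs one buffer;
         // otherwise a ring of __kmp_dispatch_num_buffers buffers lets
         // consecutive nowait loop dispatches proceed without waiting for
         // earlier buffers to be retired.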
4177     KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4178                   team->t.t_max_nproc));
4179     KMP_ASSERT(dispatch);
4180     KMP_DEBUG_ASSERT(team->t.t_dispatch);
4181     KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4182 
4183     dispatch->th_disp_index = 0;
4184     dispatch->th_doacross_buf_idx = 0;
4185     if (!dispatch->th_disp_buffer) {
4186       dispatch->th_disp_buffer =
4187           (dispatch_private_info_t *)__kmp_allocate(disp_size);
4188 
4189       if (__kmp_storage_map) {
4190         __kmp_print_storage_map_gtid(
4191             gtid, &dispatch->th_disp_buffer[0],
4192             &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4193                                           ? 1
4194                                           : __kmp_dispatch_num_buffers],
4195             disp_size, "th_%d.th_dispatch.th_disp_buffer "
4196                        "(team_%d.t_dispatch[%d].th_disp_buffer)",
4197             gtid, team->t.t_id, gtid);
4198       }
4199     } else {
4200       memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4201     }
4202 
4203     dispatch->th_dispatch_pr_current = 0;
4204     dispatch->th_dispatch_sh_current = 0;
4205 
4206     dispatch->th_deo_fcn = 0; /* ORDERED     */
4207     dispatch->th_dxo_fcn = 0; /* END ORDERED */
4208   }
4209 
4210   this_thr->th.th_next_pool = NULL;
4211 
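       // Lazily create the small stack used to save th_task_state across nested
       // parallel regions; it starts with room for 4 levels and is grown
       // elsewhere if deeper nesting is encountered.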
4212   if (!this_thr->th.th_task_state_memo_stack) {
4213     size_t i;
4214     this_thr->th.th_task_state_memo_stack =
4215         (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4216     this_thr->th.th_task_state_top = 0;
4217     this_thr->th.th_task_state_stack_sz = 4;
4218     for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4219          ++i) // zero init the stack
4220       this_thr->th.th_task_state_memo_stack[i] = 0;
4221   }
4222 
4223   KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4224   KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4225 
4226   KMP_MB();
4227 }
4228 
4229 /* allocate a new thread for the requesting team. this is only called from
4230    within a forkjoin critical section. we first try to get an available
4231    thread from the thread pool. if none is available, we fork a new one;
4232    the caller is expected to have checked capacity beforehand, so creating
4233    a new thread here should always succeed. */
4234 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4235                                   int new_tid) {
4236   kmp_team_t *serial_team;
4237   kmp_info_t *new_thr;
4238   int new_gtid;
4239 
4240   KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4241   KMP_DEBUG_ASSERT(root && team);
4242 #if !KMP_NESTED_HOT_TEAMS
4243   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4244 #endif
4245   KMP_MB();
4246 
4247   /* first, try to get one from the thread pool */
4248   if (__kmp_thread_pool) {
4249     new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4250     __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4251     if (new_thr == __kmp_thread_pool_insert_pt) {
4252       __kmp_thread_pool_insert_pt = NULL;
4253     }
4254     TCW_4(new_thr->th.th_in_pool, FALSE);
4255     __kmp_suspend_initialize_thread(new_thr);
4256     __kmp_lock_suspend_mx(new_thr);
4257     if (new_thr->th.th_active_in_pool == TRUE) {
4258       KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4259       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4260       new_thr->th.th_active_in_pool = FALSE;
4261     }
4262     __kmp_unlock_suspend_mx(new_thr);
4263 
4264     KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4265                   __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4266     KMP_ASSERT(!new_thr->th.th_team);
4267     KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4268 
4269     /* setup the thread structure */
4270     __kmp_initialize_info(new_thr, team, new_tid,
4271                           new_thr->th.th_info.ds.ds_gtid);
4272     KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4273 
4274     TCW_4(__kmp_nth, __kmp_nth + 1);
4275 
4276     new_thr->th.th_task_state = 0;
4277     new_thr->th.th_task_state_top = 0;
4278     new_thr->th.th_task_state_stack_sz = 4;
4279 
4280 #ifdef KMP_ADJUST_BLOCKTIME
4281     /* Adjust blocktime back to zero if necessary */
4282     /* Middle initialization might not have occurred yet */
4283     if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4284       if (__kmp_nth > __kmp_avail_proc) {
4285         __kmp_zero_bt = TRUE;
4286       }
4287     }
4288 #endif /* KMP_ADJUST_BLOCKTIME */
4289 
4290 #if KMP_DEBUG
4291     // If thread entered pool via __kmp_free_thread, wait_flag should !=
4292     // KMP_BARRIER_PARENT_FLAG.
4293     int b;
4294     kmp_balign_t *balign = new_thr->th.th_bar;
4295     for (b = 0; b < bs_last_barrier; ++b)
4296       KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4297 #endif
4298 
4299     KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4300                   __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4301 
4302     KMP_MB();
4303     return new_thr;
4304   }
4305 
4306   /* no thread available in the pool, so we'll fork a new one */
4307   KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4308   KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4309 
4310 #if KMP_USE_MONITOR
4311   // If this is the first worker thread the RTL is creating, then also
4312   // launch the monitor thread.  We try to do this as early as possible.
4313   if (!TCR_4(__kmp_init_monitor)) {
4314     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4315     if (!TCR_4(__kmp_init_monitor)) {
4316       KF_TRACE(10, ("before __kmp_create_monitor\n"));
4317       TCW_4(__kmp_init_monitor, 1);
4318       __kmp_create_monitor(&__kmp_monitor);
4319       KF_TRACE(10, ("after __kmp_create_monitor\n"));
4320 #if KMP_OS_WINDOWS
4321       // AC: wait until monitor has started. This is a fix for CQ232808.
4322       // The reason is that if the library is loaded/unloaded in a loop with
4323       // small (parallel) work in between, then there is a high probability
4324       // that the monitor thread starts only after the library has shut down.
4325       // At shutdown it is too late to cope with the problem, because when the
4326       // master is in DllMain (process detach) the monitor has no chance to
4327       // start (it is blocked), and the master has no means to inform the
4328       // monitor that the library has gone, because all the memory the monitor
4329       // could access is about to be released/reset.
4330       while (TCR_4(__kmp_init_monitor) < 2) {
4331         KMP_YIELD(TRUE);
4332       }
4333       KF_TRACE(10, ("after monitor thread has started\n"));
4334 #endif
4335     }
4336     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4337   }
4338 #endif
4339 
4340   KMP_MB();
4341 
4342   {
4343     int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4344                              ? 1
4345                              : __kmp_hidden_helper_threads_num + 1;
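         // While the hidden helper threads themselves are being initialized they
         // occupy gtids 1..__kmp_hidden_helper_threads_num, so regular workers
         // start searching for a free slot just past that reserved range.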
4346 
4347     for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4348          ++new_gtid) {
4349       KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4350     }
4351 
4352     if (TCR_4(__kmp_init_hidden_helper_threads)) {
4353       KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4354     }
4355   }
4356 
4357   /* allocate space for it. */
4358   new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4359 
4360   TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4361 
4362 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4363   // suppress race conditions detection on synchronization flags in debug mode
4364   // this helps to analyze library internals eliminating false positives
4365   __itt_suppress_mark_range(
4366       __itt_suppress_range, __itt_suppress_threading_errors,
4367       &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4368   __itt_suppress_mark_range(
4369       __itt_suppress_range, __itt_suppress_threading_errors,
4370       &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4371 #if KMP_OS_WINDOWS
4372   __itt_suppress_mark_range(
4373       __itt_suppress_range, __itt_suppress_threading_errors,
4374       &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4375 #else
4376   __itt_suppress_mark_range(__itt_suppress_range,
4377                             __itt_suppress_threading_errors,
4378                             &new_thr->th.th_suspend_init_count,
4379                             sizeof(new_thr->th.th_suspend_init_count));
4380 #endif
4381   // TODO: check if we need to also suppress b_arrived flags
4382   __itt_suppress_mark_range(__itt_suppress_range,
4383                             __itt_suppress_threading_errors,
4384                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4385                             sizeof(new_thr->th.th_bar[0].bb.b_go));
4386   __itt_suppress_mark_range(__itt_suppress_range,
4387                             __itt_suppress_threading_errors,
4388                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4389                             sizeof(new_thr->th.th_bar[1].bb.b_go));
4390   __itt_suppress_mark_range(__itt_suppress_range,
4391                             __itt_suppress_threading_errors,
4392                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4393                             sizeof(new_thr->th.th_bar[2].bb.b_go));
4394 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4395   if (__kmp_storage_map) {
4396     __kmp_print_thread_storage_map(new_thr, new_gtid);
4397   }
4398 
4399   // add the reserve serialized team, initialized from the team's master thread
4400   {
4401     kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4402     KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4403     new_thr->th.th_serial_team = serial_team =
4404         (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4405 #if OMPT_SUPPORT
4406                                           ompt_data_none, // root parallel id
4407 #endif
4408                                           proc_bind_default, &r_icvs,
4409                                           0 USE_NESTED_HOT_ARG(NULL));
4410   }
4411   KMP_ASSERT(serial_team);
4412   serial_team->t.t_serialized = 0; // AC: the team is created in reserve, not
4413   // for execution (it is unused for now).
4414   serial_team->t.t_threads[0] = new_thr;
4415   KF_TRACE(10,
4416            ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4417             new_thr));
4418 
4419   /* setup the thread structures */
4420   __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4421 
4422 #if USE_FAST_MEMORY
4423   __kmp_initialize_fast_memory(new_thr);
4424 #endif /* USE_FAST_MEMORY */
4425 
4426 #if KMP_USE_BGET
4427   KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4428   __kmp_initialize_bget(new_thr);
4429 #endif
4430 
4431   __kmp_init_random(new_thr); // Initialize random number generator
4432 
4433   /* Initialize these only once when thread is grabbed for a team allocation */
4434   KA_TRACE(20,
4435            ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4436             __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4437 
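       // Reset every barrier's per-thread state: go flag back to its initial
       // value, no team attached yet, not waiting on a parent flag, and the
       // on-core barrier path disabled.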
4438   int b;
4439   kmp_balign_t *balign = new_thr->th.th_bar;
4440   for (b = 0; b < bs_last_barrier; ++b) {
4441     balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4442     balign[b].bb.team = NULL;
4443     balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4444     balign[b].bb.use_oncore_barrier = 0;
4445   }
4446 
4447   new_thr->th.th_spin_here = FALSE;
4448   new_thr->th.th_next_waiting = 0;
4449 #if KMP_OS_UNIX
4450   new_thr->th.th_blocking = false;
4451 #endif
4452 
4453 #if KMP_AFFINITY_SUPPORTED
4454   new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4455   new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4456   new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4457   new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4458 #endif
4459   new_thr->th.th_def_allocator = __kmp_def_allocator;
4460   new_thr->th.th_prev_level = 0;
4461   new_thr->th.th_prev_num_threads = 1;
4462 
4463   TCW_4(new_thr->th.th_in_pool, FALSE);
4464   new_thr->th.th_active_in_pool = FALSE;
4465   TCW_4(new_thr->th.th_active, TRUE);
4466 
4467   /* adjust the global counters */
4468   __kmp_all_nth++;
4469   __kmp_nth++;
4470 
4471   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4472   // thread counts, and method #2 (keyed API call) for higher thread counts.
4473   if (__kmp_adjust_gtid_mode) {
4474     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4475       if (TCR_4(__kmp_gtid_mode) != 2) {
4476         TCW_4(__kmp_gtid_mode, 2);
4477       }
4478     } else {
4479       if (TCR_4(__kmp_gtid_mode) != 1) {
4480         TCW_4(__kmp_gtid_mode, 1);
4481       }
4482     }
4483   }
4484 
4485 #ifdef KMP_ADJUST_BLOCKTIME
4486   /* Adjust blocktime back to zero if necessary       */
4487   /* Middle initialization might not have occurred yet */
4488   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4489     if (__kmp_nth > __kmp_avail_proc) {
4490       __kmp_zero_bt = TRUE;
4491     }
4492   }
4493 #endif /* KMP_ADJUST_BLOCKTIME */
4494 
4495   /* actually fork it and create the new worker thread */
4496   KF_TRACE(
4497       10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4498   __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4499   KF_TRACE(10,
4500            ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4501 
4502   KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4503                 new_gtid));
4504   KMP_MB();
4505   return new_thr;
4506 }
4507 
4508 /* Reinitialize team for reuse.
4509    The hot team code calls this routine at every fork barrier, so the EPCC
4510    barrier tests are extremely sensitive to changes in it, especially writes
4511    to the team struct, which cause a cache invalidation in all threads.
4512    IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4513 static void __kmp_reinitialize_team(kmp_team_t *team,
4514                                     kmp_internal_control_t *new_icvs,
4515                                     ident_t *loc) {
4516   KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4517                 team->t.t_threads[0], team));
4518   KMP_DEBUG_ASSERT(team && new_icvs);
4519   KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4520   KMP_CHECK_UPDATE(team->t.t_ident, loc);
4521 
4522   KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4523   // Copy ICVs to the master thread's implicit taskdata
4524   __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4525   copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4526 
4527   KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4528                 team->t.t_threads[0], team));
4529 }
4530 
4531 /* Initialize the team data structure.
4532    This assumes the t_threads and t_max_nproc are already set.
4533    Also, we don't touch the arguments */
4534 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4535                                   kmp_internal_control_t *new_icvs,
4536                                   ident_t *loc) {
4537   KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4538 
4539   /* verify */
4540   KMP_DEBUG_ASSERT(team);
4541   KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4542   KMP_DEBUG_ASSERT(team->t.t_threads);
4543   KMP_MB();
4544 
4545   team->t.t_master_tid = 0; /* not needed */
4546   /* team->t.t_master_bar;        not needed */
4547   team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4548   team->t.t_nproc = new_nproc;
4549 
4550   /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
4551   team->t.t_next_pool = NULL;
4552   /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4553    * up hot team */
4554 
4555   TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4556   team->t.t_invoke = NULL; /* not needed */
4557 
4558   // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
4559   team->t.t_sched.sched = new_icvs->sched.sched;
4560 
4561 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4562   team->t.t_fp_control_saved = FALSE; /* not needed */
4563   team->t.t_x87_fpu_control_word = 0; /* not needed */
4564   team->t.t_mxcsr = 0; /* not needed */
4565 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4566 
4567   team->t.t_construct = 0;
4568 
4569   team->t.t_ordered.dt.t_value = 0;
4570   team->t.t_master_active = FALSE;
4571 
4572 #ifdef KMP_DEBUG
4573   team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4574 #endif
4575 #if KMP_OS_WINDOWS
4576   team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4577 #endif
4578 
4579   team->t.t_control_stack_top = NULL;
4580 
4581   __kmp_reinitialize_team(team, new_icvs, loc);
4582 
4583   KMP_MB();
4584   KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4585 }
4586 
4587 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
4588 /* Sets the full affinity mask for the calling thread; the previous mask is
4589    saved into old_mask if it is non-NULL. No runtime structures are changed. */
4590 static void __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4591   if (KMP_AFFINITY_CAPABLE()) {
4592     int status;
4593     if (old_mask != NULL) {
4594       status = __kmp_get_system_affinity(old_mask, TRUE);
4595       int error = errno;
4596       if (status != 0) {
4597         __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4598                     __kmp_msg_null);
4599       }
4600     }
4601     __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4602   }
4603 }
4604 #endif
4605 
4606 #if KMP_AFFINITY_SUPPORTED
4607 
4608 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4609 // It calculates the worker + master thread's partition based upon the parent
4610 // thread's partition, and binds each worker to a thread in their partition.
4611 // The master thread's partition should already include its current binding.
4612 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4613   // Copy the master thread's place partition to the team struct
4614   kmp_info_t *master_th = team->t.t_threads[0];
4615   KMP_DEBUG_ASSERT(master_th != NULL);
4616   kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4617   int first_place = master_th->th.th_first_place;
4618   int last_place = master_th->th.th_last_place;
4619   int masters_place = master_th->th.th_current_place;
4620   team->t.t_first_place = first_place;
4621   team->t.t_last_place = last_place;
4622 
4623   KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4624                 "bound to place %d partition = [%d,%d]\n",
4625                 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4626                 team->t.t_id, masters_place, first_place, last_place));
4627 
4628   switch (proc_bind) {
4629 
4630   case proc_bind_default:
4631     // serial teams might have the proc_bind policy set to proc_bind_default. It
4632     // doesn't matter, as we don't rebind master thread for any proc_bind policy
4633     KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4634     break;
4635 
4636   case proc_bind_master: {
4637     int f;
4638     int n_th = team->t.t_nproc;
4639     for (f = 1; f < n_th; f++) {
4640       kmp_info_t *th = team->t.t_threads[f];
4641       KMP_DEBUG_ASSERT(th != NULL);
4642       th->th.th_first_place = first_place;
4643       th->th.th_last_place = last_place;
4644       th->th.th_new_place = masters_place;
4645       if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4646           team->t.t_display_affinity != 1) {
4647         team->t.t_display_affinity = 1;
4648       }
4649 
4650       KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
4651                      "partition = [%d,%d]\n",
4652                      __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4653                      f, masters_place, first_place, last_place));
4654     }
4655   } break;
4656 
4657   case proc_bind_close: {
4658     int f;
4659     int n_th = team->t.t_nproc;
4660     int n_places;
4661     if (first_place <= last_place) {
4662       n_places = last_place - first_place + 1;
4663     } else {
4664       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4665     }
4666     if (n_th <= n_places) {
4667       int place = masters_place;
4668       for (f = 1; f < n_th; f++) {
4669         kmp_info_t *th = team->t.t_threads[f];
4670         KMP_DEBUG_ASSERT(th != NULL);
4671 
4672         if (place == last_place) {
4673           place = first_place;
4674         } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4675           place = 0;
4676         } else {
4677           place++;
4678         }
4679         th->th.th_first_place = first_place;
4680         th->th.th_last_place = last_place;
4681         th->th.th_new_place = place;
4682         if (__kmp_display_affinity && place != th->th.th_current_place &&
4683             team->t.t_display_affinity != 1) {
4684           team->t.t_display_affinity = 1;
4685         }
4686 
4687         KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4688                        "partition = [%d,%d]\n",
4689                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4690                        team->t.t_id, f, place, first_place, last_place));
4691       }
4692     } else {
4693       int S, rem, gap, s_count;
4694       S = n_th / n_places;
4695       s_count = 0;
4696       rem = n_th - (S * n_places);
4697       gap = rem > 0 ? n_places / rem : n_places;
4698       int place = masters_place;
4699       int gap_ct = gap;
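           // More threads than places: each place receives S threads, and the
           // rem leftover threads are handed out one extra per place, roughly
           // every gap-th place, starting from the master's place.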
4700       for (f = 0; f < n_th; f++) {
4701         kmp_info_t *th = team->t.t_threads[f];
4702         KMP_DEBUG_ASSERT(th != NULL);
4703 
4704         th->th.th_first_place = first_place;
4705         th->th.th_last_place = last_place;
4706         th->th.th_new_place = place;
4707         if (__kmp_display_affinity && place != th->th.th_current_place &&
4708             team->t.t_display_affinity != 1) {
4709           team->t.t_display_affinity = 1;
4710         }
4711         s_count++;
4712 
4713         if ((s_count == S) && rem && (gap_ct == gap)) {
4714           // do nothing, add an extra thread to place on next iteration
4715         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4716           // we added an extra thread to this place; move to next place
4717           if (place == last_place) {
4718             place = first_place;
4719           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4720             place = 0;
4721           } else {
4722             place++;
4723           }
4724           s_count = 0;
4725           gap_ct = 1;
4726           rem--;
4727         } else if (s_count == S) { // place full; don't add extra
4728           if (place == last_place) {
4729             place = first_place;
4730           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4731             place = 0;
4732           } else {
4733             place++;
4734           }
4735           gap_ct++;
4736           s_count = 0;
4737         }
4738 
4739         KA_TRACE(100,
4740                  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4741                   "partition = [%d,%d]\n",
4742                   __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4743                   th->th.th_new_place, first_place, last_place));
4744       }
4745       KMP_DEBUG_ASSERT(place == masters_place);
4746     }
4747   } break;
4748 
4749   case proc_bind_spread: {
4750     int f;
4751     int n_th = team->t.t_nproc;
4752     int n_places;
4753     int thidx;
4754     if (first_place <= last_place) {
4755       n_places = last_place - first_place + 1;
4756     } else {
4757       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4758     }
4759     if (n_th <= n_places) {
4760       int place = -1;
4761 
4762       if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4763         int S = n_places / n_th;
4764         int s_count, rem, gap, gap_ct;
4765 
4766         place = masters_place;
4767         rem = n_places - n_th * S;
4768         gap = rem ? n_th / rem : 1;
4769         gap_ct = gap;
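             // Each thread receives a contiguous sub-partition of about S
             // places; the rem leftover places widen one sub-partition by a
             // single place roughly every gap-th thread.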
4770         thidx = n_th;
4771         if (update_master_only == 1)
4772           thidx = 1;
4773         for (f = 0; f < thidx; f++) {
4774           kmp_info_t *th = team->t.t_threads[f];
4775           KMP_DEBUG_ASSERT(th != NULL);
4776 
4777           th->th.th_first_place = place;
4778           th->th.th_new_place = place;
4779           if (__kmp_display_affinity && place != th->th.th_current_place &&
4780               team->t.t_display_affinity != 1) {
4781             team->t.t_display_affinity = 1;
4782           }
4783           s_count = 1;
4784           while (s_count < S) {
4785             if (place == last_place) {
4786               place = first_place;
4787             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4788               place = 0;
4789             } else {
4790               place++;
4791             }
4792             s_count++;
4793           }
4794           if (rem && (gap_ct == gap)) {
4795             if (place == last_place) {
4796               place = first_place;
4797             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4798               place = 0;
4799             } else {
4800               place++;
4801             }
4802             rem--;
4803             gap_ct = 0;
4804           }
4805           th->th.th_last_place = place;
4806           gap_ct++;
4807 
4808           if (place == last_place) {
4809             place = first_place;
4810           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4811             place = 0;
4812           } else {
4813             place++;
4814           }
4815 
4816           KA_TRACE(100,
4817                    ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4818                     "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4819                     __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4820                     f, th->th.th_new_place, th->th.th_first_place,
4821                     th->th.th_last_place, __kmp_affinity_num_masks));
4822         }
4823       } else {
4824         /* With a uniform space of available computation places we can create
4825            T partitions of roughly P/T places each and put each thread into
4826            the first place of its own partition. */
4827         double current = static_cast<double>(masters_place);
4828         double spacing =
4829             (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
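             // spacing is the fractional width of each thread's sub-partition;
             // first/last are obtained by truncation so adjacent sub-partitions
             // tile the place list without overlapping.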
4830         int first, last;
4831         kmp_info_t *th;
4832 
4833         thidx = n_th + 1;
4834         if (update_master_only == 1)
4835           thidx = 1;
4836         for (f = 0; f < thidx; f++) {
4837           first = static_cast<int>(current);
4838           last = static_cast<int>(current + spacing) - 1;
4839           KMP_DEBUG_ASSERT(last >= first);
4840           if (first >= n_places) {
4841             if (masters_place) {
4842               first -= n_places;
4843               last -= n_places;
4844               if (first == (masters_place + 1)) {
4845                 KMP_DEBUG_ASSERT(f == n_th);
4846                 first--;
4847               }
4848               if (last == masters_place) {
4849                 KMP_DEBUG_ASSERT(f == (n_th - 1));
4850                 last--;
4851               }
4852             } else {
4853               KMP_DEBUG_ASSERT(f == n_th);
4854               first = 0;
4855               last = 0;
4856             }
4857           }
4858           if (last >= n_places) {
4859             last = (n_places - 1);
4860           }
4861           place = first;
4862           current += spacing;
4863           if (f < n_th) {
4864             KMP_DEBUG_ASSERT(0 <= first);
4865             KMP_DEBUG_ASSERT(n_places > first);
4866             KMP_DEBUG_ASSERT(0 <= last);
4867             KMP_DEBUG_ASSERT(n_places > last);
4868             KMP_DEBUG_ASSERT(last_place >= first_place);
4869             th = team->t.t_threads[f];
4870             KMP_DEBUG_ASSERT(th);
4871             th->th.th_first_place = first;
4872             th->th.th_new_place = place;
4873             th->th.th_last_place = last;
4874             if (__kmp_display_affinity && place != th->th.th_current_place &&
4875                 team->t.t_display_affinity != 1) {
4876               team->t.t_display_affinity = 1;
4877             }
4878             KA_TRACE(100,
4879                      ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4880                       "partition = [%d,%d], spacing = %.4f\n",
4881                       __kmp_gtid_from_thread(team->t.t_threads[f]),
4882                       team->t.t_id, f, th->th.th_new_place,
4883                       th->th.th_first_place, th->th.th_last_place, spacing));
4884           }
4885         }
4886       }
4887       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4888     } else {
4889       int S, rem, gap, s_count;
4890       S = n_th / n_places;
4891       s_count = 0;
4892       rem = n_th - (S * n_places);
4893       gap = rem > 0 ? n_places / rem : n_places;
4894       int place = masters_place;
4895       int gap_ct = gap;
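           // Same block-distribution scheme as the over-subscribed
           // proc_bind_close case above: S threads per place, with rem places
           // receiving one extra thread every gap-th place.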
4896       thidx = n_th;
4897       if (update_master_only == 1)
4898         thidx = 1;
4899       for (f = 0; f < thidx; f++) {
4900         kmp_info_t *th = team->t.t_threads[f];
4901         KMP_DEBUG_ASSERT(th != NULL);
4902 
4903         th->th.th_first_place = place;
4904         th->th.th_last_place = place;
4905         th->th.th_new_place = place;
4906         if (__kmp_display_affinity && place != th->th.th_current_place &&
4907             team->t.t_display_affinity != 1) {
4908           team->t.t_display_affinity = 1;
4909         }
4910         s_count++;
4911 
4912         if ((s_count == S) && rem && (gap_ct == gap)) {
4913           // do nothing, add an extra thread to place on next iteration
4914         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4915           // we added an extra thread to this place; move on to next place
4916           if (place == last_place) {
4917             place = first_place;
4918           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4919             place = 0;
4920           } else {
4921             place++;
4922           }
4923           s_count = 0;
4924           gap_ct = 1;
4925           rem--;
4926         } else if (s_count == S) { // place is full; don't add extra thread
4927           if (place == last_place) {
4928             place = first_place;
4929           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4930             place = 0;
4931           } else {
4932             place++;
4933           }
4934           gap_ct++;
4935           s_count = 0;
4936         }
4937 
4938         KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4939                        "partition = [%d,%d]\n",
4940                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4941                        team->t.t_id, f, th->th.th_new_place,
4942                        th->th.th_first_place, th->th.th_last_place));
4943       }
4944       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4945     }
4946   } break;
4947 
4948   default:
4949     break;
4950   }
4951 
4952   KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4953 }
4954 
4955 #endif // KMP_AFFINITY_SUPPORTED
4956 
4957 /* allocate a new team data structure to use.  take one off of the free pool if
4958    available */
4959 kmp_team_t *
4960 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4961 #if OMPT_SUPPORT
4962                     ompt_data_t ompt_parallel_data,
4963 #endif
4964                     kmp_proc_bind_t new_proc_bind,
4965                     kmp_internal_control_t *new_icvs,
4966                     int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4967   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4968   int f;
4969   kmp_team_t *team;
4970   int use_hot_team = !root->r.r_active;
4971   int level = 0;
4972 
4973   KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4974   KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4975   KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4976   KMP_MB();
4977 
4978 #if KMP_NESTED_HOT_TEAMS
4979   kmp_hot_team_ptr_t *hot_teams;
4980   if (master) {
4981     team = master->th.th_team;
4982     level = team->t.t_active_level;
4983     if (master->th.th_teams_microtask) { // in teams construct?
4984       if (master->th.th_teams_size.nteams > 1 &&
4985           ( // #teams > 1
4986               team->t.t_pkfn ==
4987                   (microtask_t)__kmp_teams_master || // inner fork of the teams
4988               master->th.th_teams_level <
4989                   team->t.t_level)) { // or nested parallel inside the teams
4990         ++level; // don't increment if #teams==1 or for the outer fork of
4991         // the teams; increment otherwise
4992       }
4993     }
4994     hot_teams = master->th.th_hot_teams;
4995     if (level < __kmp_hot_teams_max_level && hot_teams &&
4996         hot_teams[level].hot_team) {
4997       // hot team has already been allocated for given level
4998       use_hot_team = 1;
4999     } else {
5000       use_hot_team = 0;
5001     }
5002   } else {
5003     // check we won't access uninitialized hot_teams, just in case
5004     KMP_DEBUG_ASSERT(new_nproc == 1);
5005   }
5006 #endif
5007   // Optimization to use a "hot" team
5008   if (use_hot_team && new_nproc > 1) {
5009     KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
5010 #if KMP_NESTED_HOT_TEAMS
5011     team = hot_teams[level].hot_team;
5012 #else
5013     team = root->r.r_hot_team;
5014 #endif
5015 #if KMP_DEBUG
5016     if (__kmp_tasking_mode != tskm_immediate_exec) {
5017       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5018                     "task_team[1] = %p before reinit\n",
5019                     team->t.t_task_team[0], team->t.t_task_team[1]));
5020     }
5021 #endif
5022 
5023     // Has the number of threads changed?
5024     /* Let's assume the most common case is that the number of threads is
5025        unchanged, and put that case first. */
5026     if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5027       KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5028       // This case can mean that omp_set_num_threads() was called and the hot
5029       // team size was already reduced, so we check the special flag
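           // (t_size_changed == -1 marks a shrink requested via
           // omp_set_num_threads() since the last fork, so record it here as a
           // real size change.)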
5030       if (team->t.t_size_changed == -1) {
5031         team->t.t_size_changed = 1;
5032       } else {
5033         KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5034       }
5035 
5036       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5037       kmp_r_sched_t new_sched = new_icvs->sched;
5038       // set master's schedule as new run-time schedule
5039       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5040 
5041       __kmp_reinitialize_team(team, new_icvs,
5042                               root->r.r_uber_thread->th.th_ident);
5043 
5044       KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5045                     team->t.t_threads[0], team));
5046       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5047 
5048 #if KMP_AFFINITY_SUPPORTED
5049       if ((team->t.t_size_changed == 0) &&
5050           (team->t.t_proc_bind == new_proc_bind)) {
5051         if (new_proc_bind == proc_bind_spread) {
5052           __kmp_partition_places(
5053               team, 1); // add flag to update only master for spread
5054         }
5055         KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5056                        "proc_bind = %d, partition = [%d,%d]\n",
5057                        team->t.t_id, new_proc_bind, team->t.t_first_place,
5058                        team->t.t_last_place));
5059       } else {
5060         KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5061         __kmp_partition_places(team);
5062       }
5063 #else
5064       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5065 #endif /* KMP_AFFINITY_SUPPORTED */
5066     } else if (team->t.t_nproc > new_nproc) {
5067       KA_TRACE(20,
5068                ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5069                 new_nproc));
5070 
5071       team->t.t_size_changed = 1;
5072 #if KMP_NESTED_HOT_TEAMS
5073       if (__kmp_hot_teams_mode == 0) {
5074         // AC: the saved thread count should match the team's value in this
5075         // mode; it can be bigger in mode 1, when the hot team has threads in reserve
5076         KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5077         hot_teams[level].hot_team_nth = new_nproc;
5078 #endif // KMP_NESTED_HOT_TEAMS
5079         /* release the extra threads we don't need any more */
5080         for (f = new_nproc; f < team->t.t_nproc; f++) {
5081           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5082           if (__kmp_tasking_mode != tskm_immediate_exec) {
5083             // When decreasing team size, threads no longer in the team should
5084             // unref task team.
5085             team->t.t_threads[f]->th.th_task_team = NULL;
5086           }
5087           __kmp_free_thread(team->t.t_threads[f]);
5088           team->t.t_threads[f] = NULL;
5089         }
5090 #if KMP_NESTED_HOT_TEAMS
5091       } // (__kmp_hot_teams_mode == 0)
5092       else {
5093         // When keeping extra threads in team, switch threads to wait on own
5094         // b_go flag
5095         for (f = new_nproc; f < team->t.t_nproc; ++f) {
5096           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5097           kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5098           for (int b = 0; b < bs_last_barrier; ++b) {
5099             if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5100               balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5101             }
5102             KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5103           }
5104         }
5105       }
5106 #endif // KMP_NESTED_HOT_TEAMS
5107       team->t.t_nproc = new_nproc;
5108       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5109       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5110       __kmp_reinitialize_team(team, new_icvs,
5111                               root->r.r_uber_thread->th.th_ident);
5112 
5113       // Update remaining threads
5114       for (f = 0; f < new_nproc; ++f) {
5115         team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5116       }
5117 
5118       // restore the current task state of the master thread: should be the
5119       // implicit task
5120       KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5121                     team->t.t_threads[0], team));
5122 
5123       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5124 
5125 #ifdef KMP_DEBUG
5126       for (f = 0; f < team->t.t_nproc; f++) {
5127         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5128                          team->t.t_threads[f]->th.th_team_nproc ==
5129                              team->t.t_nproc);
5130       }
5131 #endif
5132 
5133       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5134 #if KMP_AFFINITY_SUPPORTED
5135       __kmp_partition_places(team);
5136 #endif
5137     } else { // team->t.t_nproc < new_nproc
5138 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5139       kmp_affin_mask_t *old_mask;
5140       if (KMP_AFFINITY_CAPABLE()) {
5141         KMP_CPU_ALLOC(old_mask);
5142       }
5143 #endif
5144 
5145       KA_TRACE(20,
5146                ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5147                 new_nproc));
5148 
5149       team->t.t_size_changed = 1;
5150 
5151 #if KMP_NESTED_HOT_TEAMS
5152       int avail_threads = hot_teams[level].hot_team_nth;
5153       if (new_nproc < avail_threads)
5154         avail_threads = new_nproc;
5155       kmp_info_t **other_threads = team->t.t_threads;
5156       for (f = team->t.t_nproc; f < avail_threads; ++f) {
5157         // Adjust barrier data of reserved threads (if any) of the team
5158         // Other data will be set in __kmp_initialize_info() below.
5159         int b;
5160         kmp_balign_t *balign = other_threads[f]->th.th_bar;
5161         for (b = 0; b < bs_last_barrier; ++b) {
5162           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5163           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5164 #if USE_DEBUGGER
5165           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5166 #endif
5167         }
5168       }
5169       if (hot_teams[level].hot_team_nth >= new_nproc) {
5170         // we have all needed threads in reserve, no need to allocate any
5171         // this is only possible in mode 1; mode 0 cannot have reserved threads
5172         KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5173         team->t.t_nproc = new_nproc; // just get reserved threads involved
5174       } else {
5175         // we may have some threads in reserve, but not enough
5176         team->t.t_nproc =
5177             hot_teams[level]
5178                 .hot_team_nth; // get reserved threads involved if any
5179         hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5180 #endif // KMP_NESTED_HOT_TEAMS
5181         if (team->t.t_max_nproc < new_nproc) {
5182           /* reallocate larger arrays */
5183           __kmp_reallocate_team_arrays(team, new_nproc);
5184           __kmp_reinitialize_team(team, new_icvs, NULL);
5185         }
5186 
5187 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5188         /* Temporarily set full mask for master thread before creation of
5189            workers. The reason is that workers inherit the affinity from master,
5190            so if a lot of workers are created on a single core quickly, they
5191            don't get a chance to set their own affinity for a long time. */
5192         __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5193 #endif
5194 
5195         /* allocate new threads for the hot team */
5196         for (f = team->t.t_nproc; f < new_nproc; f++) {
5197           kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5198           KMP_DEBUG_ASSERT(new_worker);
5199           team->t.t_threads[f] = new_worker;
5200 
5201           KA_TRACE(20,
5202                    ("__kmp_allocate_team: team %d init T#%d arrived: "
5203                     "join=%llu, plain=%llu\n",
5204                     team->t.t_id, __kmp_gtid_from_tid(f, team),
5205                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5206                     team->t.t_bar[bs_plain_barrier].b_arrived));
5207 
5208           { // Initialize barrier data for new threads.
5209             int b;
5210             kmp_balign_t *balign = new_worker->th.th_bar;
5211             for (b = 0; b < bs_last_barrier; ++b) {
5212               balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5213               KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5214                                KMP_BARRIER_PARENT_FLAG);
5215 #if USE_DEBUGGER
5216               balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5217 #endif
5218             }
5219           }
5220         }
5221 
5222 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5223         if (KMP_AFFINITY_CAPABLE()) {
5224           /* Restore initial master thread's affinity mask */
5225           __kmp_set_system_affinity(old_mask, TRUE);
5226           KMP_CPU_FREE(old_mask);
5227         }
5228 #endif
5229 #if KMP_NESTED_HOT_TEAMS
5230       } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5231 #endif // KMP_NESTED_HOT_TEAMS
5232       /* make sure everyone is synchronized */
5233       int old_nproc = team->t.t_nproc; // save old value and use to update only
5234       // new threads below
5235       __kmp_initialize_team(team, new_nproc, new_icvs,
5236                             root->r.r_uber_thread->th.th_ident);
5237 
5238       /* reinitialize the threads */
5239       KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5240       for (f = 0; f < team->t.t_nproc; ++f)
5241         __kmp_initialize_info(team->t.t_threads[f], team, f,
5242                               __kmp_gtid_from_tid(f, team));
5243 
5244       if (level) { // set th_task_state for new threads in nested hot team
5245         // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5246         // only need to set the th_task_state for the new threads. th_task_state
5247         // for master thread will not be accurate until after this in
5248         // __kmp_fork_call(), so we look to the master's memo_stack to get the
5249         // correct value.
5250         for (f = old_nproc; f < team->t.t_nproc; ++f)
5251           team->t.t_threads[f]->th.th_task_state =
5252               team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5253       } else { // set th_task_state for new threads in non-nested hot team
5254         kmp_uint8 old_state =
5255             team->t.t_threads[0]->th.th_task_state; // copy master's state
5256         for (f = old_nproc; f < team->t.t_nproc; ++f)
5257           team->t.t_threads[f]->th.th_task_state = old_state;
5258       }
5259 
5260 #ifdef KMP_DEBUG
5261       for (f = 0; f < team->t.t_nproc; ++f) {
5262         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5263                          team->t.t_threads[f]->th.th_team_nproc ==
5264                              team->t.t_nproc);
5265       }
5266 #endif
5267 
5268       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5269 #if KMP_AFFINITY_SUPPORTED
5270       __kmp_partition_places(team);
5271 #endif
5272     } // Check changes in number of threads
5273 
5274     kmp_info_t *master = team->t.t_threads[0];
5275     if (master->th.th_teams_microtask) {
5276       for (f = 1; f < new_nproc; ++f) {
5277         // propagate teams construct specific info to workers
5278         kmp_info_t *thr = team->t.t_threads[f];
5279         thr->th.th_teams_microtask = master->th.th_teams_microtask;
5280         thr->th.th_teams_level = master->th.th_teams_level;
5281         thr->th.th_teams_size = master->th.th_teams_size;
5282       }
5283     }
5284 #if KMP_NESTED_HOT_TEAMS
5285     if (level) {
5286       // Sync barrier state for nested hot teams, not needed for outermost hot
5287       // team.
5288       for (f = 1; f < new_nproc; ++f) {
5289         kmp_info_t *thr = team->t.t_threads[f];
5290         int b;
5291         kmp_balign_t *balign = thr->th.th_bar;
5292         for (b = 0; b < bs_last_barrier; ++b) {
5293           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5294           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5295 #if USE_DEBUGGER
5296           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5297 #endif
5298         }
5299       }
5300     }
5301 #endif // KMP_NESTED_HOT_TEAMS
5302 
5303     /* reallocate space for arguments if necessary */
5304     __kmp_alloc_argv_entries(argc, team, TRUE);
5305     KMP_CHECK_UPDATE(team->t.t_argc, argc);
5306     // The hot team re-uses the previous task team,
5307     // if untouched during the previous release->gather phase.
5308 
5309     KF_TRACE(10, (" hot_team = %p\n", team));
5310 
5311 #if KMP_DEBUG
5312     if (__kmp_tasking_mode != tskm_immediate_exec) {
5313       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5314                     "task_team[1] = %p after reinit\n",
5315                     team->t.t_task_team[0], team->t.t_task_team[1]));
5316     }
5317 #endif
5318 
5319 #if OMPT_SUPPORT
5320     __ompt_team_assign_id(team, ompt_parallel_data);
5321 #endif
5322 
5323     KMP_MB();
5324 
5325     return team;
5326   }
5327 
5328   /* next, let's try to take one from the team pool */
5329   KMP_MB();
5330   for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5331     /* TODO: consider resizing undersized teams instead of reaping them, now
5332        that we have a resizing mechanism */
5333     if (team->t.t_max_nproc >= max_nproc) {
5334       /* take this team from the team pool */
5335       __kmp_team_pool = team->t.t_next_pool;
5336 
5337       /* setup the team for fresh use */
5338       __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5339 
5340       KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5341                     "task_team[1] %p to NULL\n",
5342                     &team->t.t_task_team[0], &team->t.t_task_team[1]));
5343       team->t.t_task_team[0] = NULL;
5344       team->t.t_task_team[1] = NULL;
5345 
5346       /* reallocate space for arguments if necessary */
5347       __kmp_alloc_argv_entries(argc, team, TRUE);
5348       KMP_CHECK_UPDATE(team->t.t_argc, argc);
5349 
5350       KA_TRACE(
5351           20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5352                team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5353       { // Initialize barrier data.
5354         int b;
5355         for (b = 0; b < bs_last_barrier; ++b) {
5356           team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5357 #if USE_DEBUGGER
5358           team->t.t_bar[b].b_master_arrived = 0;
5359           team->t.t_bar[b].b_team_arrived = 0;
5360 #endif
5361         }
5362       }
5363 
5364       team->t.t_proc_bind = new_proc_bind;
5365 
5366       KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5367                     team->t.t_id));
5368 
5369 #if OMPT_SUPPORT
5370       __ompt_team_assign_id(team, ompt_parallel_data);
5371 #endif
5372 
5373       KMP_MB();
5374 
5375       return team;
5376     }
5377 
5378     /* reap team if it is too small, then loop back and check the next one */
5379     // not sure if this is wise, but it will be redone during the hot-teams
5380     // rewrite.
5381     /* TODO: Use technique to find the right size hot-team, don't reap them */
5382     team = __kmp_reap_team(team);
5383     __kmp_team_pool = team;
5384   }
5385 
5386   /* nothing available in the pool, no matter, make a new team! */
5387   KMP_MB();
5388   team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5389 
5390   /* and set it up */
5391   team->t.t_max_nproc = max_nproc;
5392   /* NOTE well, for some reason allocating one big buffer and dividing it up
5393      seems to hurt performance badly on the P4, so let's not use this */
5394   __kmp_allocate_team_arrays(team, max_nproc);
5395 
5396   KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5397   __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5398 
5399   KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5400                 "%p to NULL\n",
5401                 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5402   team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5403   // memory, no need to duplicate
5404   team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5405   // memory, no need to duplicate
5406 
5407   if (__kmp_storage_map) {
5408     __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5409   }
5410 
5411   /* allocate space for arguments */
5412   __kmp_alloc_argv_entries(argc, team, FALSE);
5413   team->t.t_argc = argc;
5414 
5415   KA_TRACE(20,
5416            ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5417             team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5418   { // Initialize barrier data.
5419     int b;
5420     for (b = 0; b < bs_last_barrier; ++b) {
5421       team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5422 #if USE_DEBUGGER
5423       team->t.t_bar[b].b_master_arrived = 0;
5424       team->t.t_bar[b].b_team_arrived = 0;
5425 #endif
5426     }
5427   }
5428 
5429   team->t.t_proc_bind = new_proc_bind;
5430 
5431 #if OMPT_SUPPORT
5432   __ompt_team_assign_id(team, ompt_parallel_data);
5433   team->t.ompt_serialized_team_info = NULL;
5434 #endif
5435 
5436   KMP_MB();
5437 
5438   KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5439                 team->t.t_id));
5440 
5441   return team;
5442 }
5443 
5444 /* TODO implement hot-teams at all levels */
5445 /* TODO implement lazy thread release on demand (disband request) */
5446 
5447 /* free the team.  return it to the team pool.  release all the threads
5448  * associated with it */
5449 void __kmp_free_team(kmp_root_t *root,
5450                      kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5451   int f;
5452   KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5453                 team->t.t_id));
5454 
5455   /* verify state */
5456   KMP_DEBUG_ASSERT(root);
5457   KMP_DEBUG_ASSERT(team);
5458   KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5459   KMP_DEBUG_ASSERT(team->t.t_threads);
5460 
5461   int use_hot_team = team == root->r.r_hot_team;
5462 #if KMP_NESTED_HOT_TEAMS
5463   int level;
5464   kmp_hot_team_ptr_t *hot_teams;
5465   if (master) {
5466     level = team->t.t_active_level - 1;
5467     if (master->th.th_teams_microtask) { // in teams construct?
5468       if (master->th.th_teams_size.nteams > 1) {
5469         ++level; // level was not increased in teams construct for
5470         // team_of_masters
5471       }
5472       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5473           master->th.th_teams_level == team->t.t_level) {
5474         ++level; // level was not increased in teams construct for
5475         // team_of_workers before the parallel
5476       } // team->t.t_level will be increased inside parallel
5477     }
5478     hot_teams = master->th.th_hot_teams;
5479     if (level < __kmp_hot_teams_max_level) {
5480       KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5481       use_hot_team = 1;
5482     }
5483   }
5484 #endif // KMP_NESTED_HOT_TEAMS
5485 
5486   /* team is done working */
5487   TCW_SYNC_PTR(team->t.t_pkfn,
5488                NULL); // Important for Debugging Support Library.
5489 #if KMP_OS_WINDOWS
5490   team->t.t_copyin_counter = 0; // init counter for possible reuse
5491 #endif
5492   // Do not reset pointer to parent team to NULL for hot teams.
5493 
5494   /* if we are non-hot team, release our threads */
5495   if (!use_hot_team) {
5496     if (__kmp_tasking_mode != tskm_immediate_exec) {
5497       // Wait for threads to reach reapable state
5498       for (f = 1; f < team->t.t_nproc; ++f) {
5499         KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5500         kmp_info_t *th = team->t.t_threads[f];
5501         volatile kmp_uint32 *state = &th->th.th_reap_state;
5502         while (*state != KMP_SAFE_TO_REAP) {
5503 #if KMP_OS_WINDOWS
5504           // On Windows a thread can be killed at any time, check this
5505           DWORD ecode;
5506           if (!__kmp_is_thread_alive(th, &ecode)) {
5507             *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5508             break;
5509           }
5510 #endif
5511           // first check if thread is sleeping
5512           kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5513           if (fl.is_sleeping())
5514             fl.resume(__kmp_gtid_from_thread(th));
5515           KMP_CPU_PAUSE();
5516         }
5517       }
5518 
5519       // Delete task teams
5520       int tt_idx;
5521       for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5522         kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5523         if (task_team != NULL) {
5524           for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5525             KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5526             team->t.t_threads[f]->th.th_task_team = NULL;
5527           }
5528           KA_TRACE(
5529               20,
5530               ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5531                __kmp_get_gtid(), task_team, team->t.t_id));
5532 #if KMP_NESTED_HOT_TEAMS
5533           __kmp_free_task_team(master, task_team);
5534 #endif
5535           team->t.t_task_team[tt_idx] = NULL;
5536         }
5537       }
5538     }
5539 
5540     // Reset pointer to parent team only for non-hot teams.
5541     team->t.t_parent = NULL;
5542     team->t.t_level = 0;
5543     team->t.t_active_level = 0;
5544 
5545     /* free the worker threads */
5546     for (f = 1; f < team->t.t_nproc; ++f) {
5547       KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5548       __kmp_free_thread(team->t.t_threads[f]);
5549       team->t.t_threads[f] = NULL;
5550     }
5551 
5552     /* put the team back in the team pool */
5553     /* TODO limit size of team pool, call reap_team if pool too large */
5554     team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5555     __kmp_team_pool = (volatile kmp_team_t *)team;
5556   } else { // Check if team was created for the masters in a teams construct
5557     // See if first worker is a CG root
5558     KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5559                      team->t.t_threads[1]->th.th_cg_roots);
5560     if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5561       // Clean up the CG root nodes on workers so that this team can be re-used
5562       for (f = 1; f < team->t.t_nproc; ++f) {
5563         kmp_info_t *thr = team->t.t_threads[f];
5564         KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5565                          thr->th.th_cg_roots->cg_root == thr);
5566         // Pop current CG root off list
5567         kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5568         thr->th.th_cg_roots = tmp->up;
5569         KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5570                        " up to node %p. cg_nthreads was %d\n",
5571                        thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5572         int i = tmp->cg_nthreads--;
5573         if (i == 1) {
5574           __kmp_free(tmp); // free CG if we are the last thread in it
5575         }
5576         // Restore current task's thread_limit from CG root
5577         if (thr->th.th_cg_roots)
5578           thr->th.th_current_task->td_icvs.thread_limit =
5579               thr->th.th_cg_roots->cg_thread_limit;
5580       }
5581     }
5582   }
5583 
5584   KMP_MB();
5585 }
5586 
5587 /* reap the team.  destroy it, reclaim all its resources and free its memory */
5588 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5589   kmp_team_t *next_pool = team->t.t_next_pool;
5590 
5591   KMP_DEBUG_ASSERT(team);
5592   KMP_DEBUG_ASSERT(team->t.t_dispatch);
5593   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5594   KMP_DEBUG_ASSERT(team->t.t_threads);
5595   KMP_DEBUG_ASSERT(team->t.t_argv);
5596 
5597   /* TODO clean the threads that are a part of this? */
5598 
5599   /* free stuff */
5600   __kmp_free_team_arrays(team);
5601   if (team->t.t_argv != &team->t.t_inline_argv[0])
5602     __kmp_free((void *)team->t.t_argv);
5603   __kmp_free(team);
5604 
5605   KMP_MB();
5606   return next_pool;
5607 }
5608 
5609 // Free the thread.  Don't reap it, just place it on the pool of available
5610 // threads.
5611 //
5612 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5613 // binding for the affinity mechanism to be useful.
5614 //
5615 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5616 // However, we want to avoid a potential performance problem by always
5617 // scanning through the list to find the correct point at which to insert
5618 // the thread (potential N**2 behavior).  To do this we keep track of the
5619 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5620 // With single-level parallelism, threads will always be added to the tail
5621 // of the list, kept track of by __kmp_thread_pool_insert_pt.  With nested
5622 // parallelism, all bets are off and we may need to scan through the entire
5623 // free list.
5624 //
5625 // This change also has a potentially large performance benefit, for some
5626 // applications.  Previously, as threads were freed from the hot team, they
5627 // would be placed back on the free list in inverse order.  If the hot team
// grew back to its original size, then the freed threads would be placed
5629 // back on the hot team in reverse order.  This could cause bad cache
5630 // locality problems on programs where the size of the hot team regularly
5631 // grew and shrunk.
5632 //
5633 // Now, for single-level parallelism, the OMP tid is always == gtid.
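//
// Editorial sketch (not part of the runtime): the sorted-insert-with-hint
// scheme described above, shown in isolation. The names node_t, pool_head,
// and insert_hint are illustrative only; the code below operates on
// kmp_info_t and __kmp_thread_pool instead.
//
//   struct node_t { int gtid; node_t *next; };
//   static node_t *pool_head = nullptr;   // list kept sorted by gtid
//   static node_t *insert_hint = nullptr; // last insertion point
//
//   static void pool_insert(node_t *n) {
//     // If the cached hint is already past the new key, fall back to a scan
//     // from the head of the list.
//     if (insert_hint != nullptr && insert_hint->gtid > n->gtid)
//       insert_hint = nullptr;
//     node_t **scan = insert_hint ? &insert_hint->next : &pool_head;
//     while (*scan != nullptr && (*scan)->gtid < n->gtid)
//       scan = &(*scan)->next; // walk the link pointers, not the nodes
//     n->next = *scan;         // splice in, preserving sorted order
//     *scan = n;
//     insert_hint = n;         // remember where we inserted
//   }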
5634 void __kmp_free_thread(kmp_info_t *this_th) {
5635   int gtid;
5636   kmp_info_t **scan;
5637 
5638   KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5639                 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5640 
5641   KMP_DEBUG_ASSERT(this_th);
5642 
  // When moving the thread to the pool, switch it to waiting on its own b_go
  // flag and to an uninitialized (NULL) team.
5645   int b;
5646   kmp_balign_t *balign = this_th->th.th_bar;
5647   for (b = 0; b < bs_last_barrier; ++b) {
5648     if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5649       balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5650     balign[b].bb.team = NULL;
5651     balign[b].bb.leaf_kids = 0;
5652   }
5653   this_th->th.th_task_state = 0;
5654   this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5655 
5656   /* put thread back on the free pool */
5657   TCW_PTR(this_th->th.th_team, NULL);
5658   TCW_PTR(this_th->th.th_root, NULL);
5659   TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5660 
5661   while (this_th->th.th_cg_roots) {
5662     this_th->th.th_cg_roots->cg_nthreads--;
5663     KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5664                    " %p of thread  %p to %d\n",
5665                    this_th, this_th->th.th_cg_roots,
5666                    this_th->th.th_cg_roots->cg_root,
5667                    this_th->th.th_cg_roots->cg_nthreads));
5668     kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5669     if (tmp->cg_root == this_th) { // Thread is a cg_root
5670       KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5671       KA_TRACE(
5672           5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5673       this_th->th.th_cg_roots = tmp->up;
5674       __kmp_free(tmp);
5675     } else { // Worker thread
5676       if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5677         __kmp_free(tmp);
5678       }
5679       this_th->th.th_cg_roots = NULL;
5680       break;
5681     }
5682   }
5683 
  /* The implicit task assigned to this thread may still be usable by other
   * threads, in which case multiple threads can share the task data and try to
   * free it in __kmp_reap_thread at exit. This duplicate use of the task data
   * is more likely when the hot team is disabled, but it can occur even when
   * the hot team is enabled. */
5689   __kmp_free_implicit_task(this_th);
5690   this_th->th.th_current_task = NULL;
5691 
5692   // If the __kmp_thread_pool_insert_pt is already past the new insert
5693   // point, then we need to re-scan the entire list.
5694   gtid = this_th->th.th_info.ds.ds_gtid;
5695   if (__kmp_thread_pool_insert_pt != NULL) {
5696     KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5697     if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5698       __kmp_thread_pool_insert_pt = NULL;
5699     }
5700   }
5701 
5702   // Scan down the list to find the place to insert the thread.
5703   // scan is the address of a link in the list, possibly the address of
5704   // __kmp_thread_pool itself.
5705   //
5706   // In the absence of nested parallelism, the for loop will have 0 iterations.
5707   if (__kmp_thread_pool_insert_pt != NULL) {
5708     scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5709   } else {
5710     scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5711   }
5712   for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5713        scan = &((*scan)->th.th_next_pool))
5714     ;
5715 
5716   // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5717   // to its address.
5718   TCW_PTR(this_th->th.th_next_pool, *scan);
5719   __kmp_thread_pool_insert_pt = *scan = this_th;
5720   KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5721                    (this_th->th.th_info.ds.ds_gtid <
5722                     this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5723   TCW_4(this_th->th.th_in_pool, TRUE);
5724   __kmp_suspend_initialize_thread(this_th);
5725   __kmp_lock_suspend_mx(this_th);
5726   if (this_th->th.th_active == TRUE) {
5727     KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5728     this_th->th.th_active_in_pool = TRUE;
5729   }
5730 #if KMP_DEBUG
5731   else {
5732     KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5733   }
5734 #endif
5735   __kmp_unlock_suspend_mx(this_th);
5736 
5737   TCW_4(__kmp_nth, __kmp_nth - 1);
5738 
5739 #ifdef KMP_ADJUST_BLOCKTIME
5740   /* Adjust blocktime back to user setting or default if necessary */
5741   /* Middle initialization might never have occurred                */
5742   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5743     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5744     if (__kmp_nth <= __kmp_avail_proc) {
5745       __kmp_zero_bt = FALSE;
5746     }
5747   }
5748 #endif /* KMP_ADJUST_BLOCKTIME */
5749 
5750   KMP_MB();
5751 }
5752 
5753 /* ------------------------------------------------------------------------ */
5754 
5755 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5756 #if OMP_PROFILING_SUPPORT
5757   ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
5758   // TODO: add a configuration option for time granularity
5759   if (ProfileTraceFile)
5760     llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
5761 #endif
5762 
5763   int gtid = this_thr->th.th_info.ds.ds_gtid;
5764   /*    void                 *stack_data;*/
5765   kmp_team_t **volatile pteam;
5766 
5767   KMP_MB();
5768   KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5769 
5770   if (__kmp_env_consistency_check) {
5771     this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5772   }
5773 
5774 #if OMPT_SUPPORT
5775   ompt_data_t *thread_data;
5776   if (ompt_enabled.enabled) {
5777     thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5778     *thread_data = ompt_data_none;
5779 
5780     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5781     this_thr->th.ompt_thread_info.wait_id = 0;
5782     this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5783     this_thr->th.ompt_thread_info.parallel_flags = 0;
5784     if (ompt_enabled.ompt_callback_thread_begin) {
5785       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5786           ompt_thread_worker, thread_data);
5787     }
5788     this_thr->th.ompt_thread_info.state = ompt_state_idle;
5789   }
5790 #endif
5791 
5792   /* This is the place where threads wait for work */
5793   while (!TCR_4(__kmp_global.g.g_done)) {
5794     KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5795     KMP_MB();
5796 
5797     /* wait for work to do */
5798     KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5799 
5800     /* No tid yet since not part of a team */
5801     __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5802 
5803 #if OMPT_SUPPORT
5804     if (ompt_enabled.enabled) {
5805       this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5806     }
5807 #endif
5808 
5809     pteam = &this_thr->th.th_team;
5810 
5811     /* have we been allocated? */
5812     if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5813       /* we were just woken up, so run our new task */
5814       if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5815         int rc;
5816         KA_TRACE(20,
5817                  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5818                   gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5819                   (*pteam)->t.t_pkfn));
5820 
5821         updateHWFPControl(*pteam);
5822 
5823 #if OMPT_SUPPORT
5824         if (ompt_enabled.enabled) {
5825           this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5826         }
5827 #endif
5828 
5829         rc = (*pteam)->t.t_invoke(gtid);
5830         KMP_ASSERT(rc);
5831 
5832         KMP_MB();
5833         KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5834                       gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5835                       (*pteam)->t.t_pkfn));
5836       }
5837 #if OMPT_SUPPORT
5838       if (ompt_enabled.enabled) {
5839         /* no frame set while outside task */
5840         __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
5841 
5842         this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5843       }
5844 #endif
5845       /* join barrier after parallel region */
5846       __kmp_join_barrier(gtid);
5847     }
5848   }
5849   TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5850 
5851 #if OMPT_SUPPORT
5852   if (ompt_enabled.ompt_callback_thread_end) {
5853     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5854   }
5855 #endif
5856 
5857   this_thr->th.th_task_team = NULL;
5858   /* run the destructors for the threadprivate data for this thread */
5859   __kmp_common_destroy_gtid(gtid);
5860 
5861   KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5862   KMP_MB();
5863 
5864 #if OMP_PROFILING_SUPPORT
5865   llvm::timeTraceProfilerFinishThread();
5866 #endif
5867   return this_thr;
5868 }
5869 
5870 /* ------------------------------------------------------------------------ */
5871 
5872 void __kmp_internal_end_dest(void *specific_gtid) {
5873   // Make sure no significant bits are lost
5874   int gtid;
5875   __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
5876 
5877   KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
  /* NOTE: the gtid is stored as gtid+1 in thread-local storage;
   * this is because 0 is reserved for the nothing-stored case */
5880 
5881   __kmp_internal_end_thread(gtid);
5882 }
5883 
5884 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5885 
5886 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5887   __kmp_internal_end_atexit();
5888 }
5889 
5890 #endif
5891 
5892 /* [Windows] josh: when the atexit handler is called, there may still be more
5893    than one thread alive */
5894 void __kmp_internal_end_atexit(void) {
5895   KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5896   /* [Windows]
5897      josh: ideally, we want to completely shutdown the library in this atexit
5898      handler, but stat code that depends on thread specific data for gtid fails
5899      because that data becomes unavailable at some point during the shutdown, so
5900      we call __kmp_internal_end_thread instead. We should eventually remove the
5901      dependency on __kmp_get_specific_gtid in the stat code and use
5902      __kmp_internal_end_library to cleanly shutdown the library.
5903 
5904      // TODO: Can some of this comment about GVS be removed?
5905      I suspect that the offending stat code is executed when the calling thread
5906      tries to clean up a dead root thread's data structures, resulting in GVS
5907      code trying to close the GVS structures for that thread, but since the stat
5908      code uses __kmp_get_specific_gtid to get the gtid with the assumption that
     the calling thread is cleaning up itself instead of another thread, it gets
     confused. This happens because allowing a thread to unregister and clean up
     another thread is a recent modification for addressing an issue.
5912      Based on the current design (20050722), a thread may end up
5913      trying to unregister another thread only if thread death does not trigger
5914      the calling of __kmp_internal_end_thread.  For Linux* OS, there is the
5915      thread specific data destructor function to detect thread death. For
5916      Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
5917      is nothing.  Thus, the workaround is applicable only for Windows static
5918      stat library. */
5919   __kmp_internal_end_library(-1);
5920 #if KMP_OS_WINDOWS
5921   __kmp_close_console();
5922 #endif
5923 }
5924 
5925 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5926   // It is assumed __kmp_forkjoin_lock is acquired.
5927 
5928   int gtid;
5929 
5930   KMP_DEBUG_ASSERT(thread != NULL);
5931 
5932   gtid = thread->th.th_info.ds.ds_gtid;
5933 
5934   if (!is_root) {
5935     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5936       /* Assume the threads are at the fork barrier here */
5937       KA_TRACE(
5938           20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5939                gtid));
5940       /* Need release fence here to prevent seg faults for tree forkjoin barrier
5941        * (GEH) */
5942       ANNOTATE_HAPPENS_BEFORE(thread);
5943       kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
5944                          thread);
5945       __kmp_release_64(&flag);
5946     }
5947 
5948     // Terminate OS thread.
5949     __kmp_reap_worker(thread);
5950 
5951     // The thread was killed asynchronously.  If it was actively
5952     // spinning in the thread pool, decrement the global count.
5953     //
5954     // There is a small timing hole here - if the worker thread was just waking
    // up after sleeping in the pool, had reset its th_active_in_pool flag but
    // had not yet decremented the global counter __kmp_thread_pool_active_nth, then
5957     // the global counter might not get updated.
5958     //
5959     // Currently, this can only happen as the library is unloaded,
5960     // so there are no harmful side effects.
5961     if (thread->th.th_active_in_pool) {
5962       thread->th.th_active_in_pool = FALSE;
5963       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
5964       KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
5965     }
5966   }
5967 
5968   __kmp_free_implicit_task(thread);
5969 
5970 // Free the fast memory for tasking
5971 #if USE_FAST_MEMORY
5972   __kmp_free_fast_memory(thread);
5973 #endif /* USE_FAST_MEMORY */
5974 
5975   __kmp_suspend_uninitialize_thread(thread);
5976 
5977   KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
5978   TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5979 
5980   --__kmp_all_nth;
5981 // __kmp_nth was decremented when thread is added to the pool.
5982 
5983 #ifdef KMP_ADJUST_BLOCKTIME
5984   /* Adjust blocktime back to user setting or default if necessary */
5985   /* Middle initialization might never have occurred                */
5986   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5987     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5988     if (__kmp_nth <= __kmp_avail_proc) {
5989       __kmp_zero_bt = FALSE;
5990     }
5991   }
5992 #endif /* KMP_ADJUST_BLOCKTIME */
5993 
5994   /* free the memory being used */
5995   if (__kmp_env_consistency_check) {
5996     if (thread->th.th_cons) {
5997       __kmp_free_cons_stack(thread->th.th_cons);
5998       thread->th.th_cons = NULL;
5999     }
6000   }
6001 
6002   if (thread->th.th_pri_common != NULL) {
6003     __kmp_free(thread->th.th_pri_common);
6004     thread->th.th_pri_common = NULL;
6005   }
6006 
6007   if (thread->th.th_task_state_memo_stack != NULL) {
6008     __kmp_free(thread->th.th_task_state_memo_stack);
6009     thread->th.th_task_state_memo_stack = NULL;
6010   }
6011 
6012 #if KMP_USE_BGET
6013   if (thread->th.th_local.bget_data != NULL) {
6014     __kmp_finalize_bget(thread);
6015   }
6016 #endif
6017 
6018 #if KMP_AFFINITY_SUPPORTED
6019   if (thread->th.th_affin_mask != NULL) {
6020     KMP_CPU_FREE(thread->th.th_affin_mask);
6021     thread->th.th_affin_mask = NULL;
6022   }
6023 #endif /* KMP_AFFINITY_SUPPORTED */
6024 
6025 #if KMP_USE_HIER_SCHED
6026   if (thread->th.th_hier_bar_data != NULL) {
6027     __kmp_free(thread->th.th_hier_bar_data);
6028     thread->th.th_hier_bar_data = NULL;
6029   }
6030 #endif
6031 
6032   __kmp_reap_team(thread->th.th_serial_team);
6033   thread->th.th_serial_team = NULL;
6034   __kmp_free(thread);
6035 
6036   KMP_MB();
6037 
6038 } // __kmp_reap_thread
6039 
6040 static void __kmp_internal_end(void) {
6041   int i;
6042 
6043   /* First, unregister the library */
6044   __kmp_unregister_library();
6045 
6046 #if KMP_OS_WINDOWS
6047   /* In Win static library, we can't tell when a root actually dies, so we
6048      reclaim the data structures for any root threads that have died but not
6049      unregistered themselves, in order to shut down cleanly.
6050      In Win dynamic library we also can't tell when a thread dies.  */
6051   __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6052 // dead roots
6053 #endif
6054 
6055   for (i = 0; i < __kmp_threads_capacity; i++)
6056     if (__kmp_root[i])
6057       if (__kmp_root[i]->r.r_active)
6058         break;
6059   KMP_MB(); /* Flush all pending memory write invalidates.  */
6060   TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6061 
6062   if (i < __kmp_threads_capacity) {
6063 #if KMP_USE_MONITOR
6064     // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6065     KMP_MB(); /* Flush all pending memory write invalidates.  */
6066 
6067     // Need to check that monitor was initialized before reaping it. If we are
    // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6069     // __kmp_monitor will appear to contain valid data, but it is only valid in
6070     // the parent process, not the child.
6071     // New behavior (201008): instead of keying off of the flag
6072     // __kmp_init_parallel, the monitor thread creation is keyed off
6073     // of the new flag __kmp_init_monitor.
6074     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6075     if (TCR_4(__kmp_init_monitor)) {
6076       __kmp_reap_monitor(&__kmp_monitor);
6077       TCW_4(__kmp_init_monitor, 0);
6078     }
6079     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6080     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6081 #endif // KMP_USE_MONITOR
6082   } else {
6083 /* TODO move this to cleanup code */
6084 #ifdef KMP_DEBUG
6085     /* make sure that everything has properly ended */
6086     for (i = 0; i < __kmp_threads_capacity; i++) {
6087       if (__kmp_root[i]) {
6088         //                    KMP_ASSERT( ! KMP_UBER_GTID( i ) );         // AC:
6089         //                    there can be uber threads alive here
6090         KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6091       }
6092     }
6093 #endif
6094 
6095     KMP_MB();
6096 
6097     // Reap the worker threads.
6098     // This is valid for now, but be careful if threads are reaped sooner.
    while (__kmp_thread_pool != NULL) { // Loop thru all the threads in the pool.
6100       // Get the next thread from the pool.
6101       kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6102       __kmp_thread_pool = thread->th.th_next_pool;
6103       // Reap it.
6104       KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6105       thread->th.th_next_pool = NULL;
6106       thread->th.th_in_pool = FALSE;
6107       __kmp_reap_thread(thread, 0);
6108     }
6109     __kmp_thread_pool_insert_pt = NULL;
6110 
6111     // Reap teams.
6112     while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
6113       // Get the next team from the pool.
6114       kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6115       __kmp_team_pool = team->t.t_next_pool;
6116       // Reap it.
6117       team->t.t_next_pool = NULL;
6118       __kmp_reap_team(team);
6119     }
6120 
6121     __kmp_reap_task_teams();
6122 
6123 #if KMP_OS_UNIX
6124     // Threads that are not reaped should not access any resources since they
6125     // are going to be deallocated soon, so the shutdown sequence should wait
6126     // until all threads either exit the final spin-waiting loop or begin
6127     // sleeping after the given blocktime.
6128     for (i = 0; i < __kmp_threads_capacity; i++) {
6129       kmp_info_t *thr = __kmp_threads[i];
6130       while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6131         KMP_CPU_PAUSE();
6132     }
6133 #endif
6134 
6135     for (i = 0; i < __kmp_threads_capacity; ++i) {
6136       // TBD: Add some checking...
6137       // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6138     }
6139 
6140     /* Make sure all threadprivate destructors get run by joining with all
6141        worker threads before resetting this flag */
6142     TCW_SYNC_4(__kmp_init_common, FALSE);
6143 
6144     KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6145     KMP_MB();
6146 
6147 #if KMP_USE_MONITOR
6148     // See note above: One of the possible fixes for CQ138434 / CQ140126
6149     //
6150     // FIXME: push both code fragments down and CSE them?
6151     // push them into __kmp_cleanup() ?
6152     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6153     if (TCR_4(__kmp_init_monitor)) {
6154       __kmp_reap_monitor(&__kmp_monitor);
6155       TCW_4(__kmp_init_monitor, 0);
6156     }
6157     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6158     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6159 #endif
6160   } /* else !__kmp_global.t_active */
6161   TCW_4(__kmp_init_gtid, FALSE);
6162   KMP_MB(); /* Flush all pending memory write invalidates.  */
6163 
6164   __kmp_cleanup();
6165 #if OMPT_SUPPORT
6166   ompt_fini();
6167 #endif
6168 }
6169 
6170 void __kmp_internal_end_library(int gtid_req) {
6171   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6172   /* this shouldn't be a race condition because __kmp_internal_end() is the
6173      only place to clear __kmp_serial_init */
6174   /* we'll check this later too, after we get the lock */
6175   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6176   // redundant, because the next check will work in any case.
6177   if (__kmp_global.g.g_abort) {
6178     KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6179     /* TODO abort? */
6180     return;
6181   }
6182   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6183     KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6184     return;
6185   }
6186 
6187   KMP_MB(); /* Flush all pending memory write invalidates.  */
6188   /* find out who we are and what we should do */
6189   {
6190     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6191     KA_TRACE(
6192         10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req));
6193     if (gtid == KMP_GTID_SHUTDOWN) {
6194       KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6195                     "already shutdown\n"));
6196       return;
6197     } else if (gtid == KMP_GTID_MONITOR) {
6198       KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6199                     "registered, or system shutdown\n"));
6200       return;
6201     } else if (gtid == KMP_GTID_DNE) {
6202       KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6203                     "shutdown\n"));
      /* we don't know who we are, but we may still shut down the library */
6205     } else if (KMP_UBER_GTID(gtid)) {
6206       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6207       if (__kmp_root[gtid]->r.r_active) {
6208         __kmp_global.g.g_abort = -1;
6209         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6210         __kmp_unregister_library();
6211         KA_TRACE(10,
6212                  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6213                   gtid));
6214         return;
6215       } else {
6216         KA_TRACE(
6217             10,
6218             ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6219         __kmp_unregister_root_current_thread(gtid);
6220       }
6221     } else {
6222 /* worker threads may call this function through the atexit handler, if they
6223  * call exit() */
6224 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6225    TODO: do a thorough shutdown instead */
6226 #ifdef DUMP_DEBUG_ON_EXIT
6227       if (__kmp_debug_buf)
6228         __kmp_dump_debug_buffer();
6229 #endif
      // The unregister-library call was added here when we switched to shared
      // memory on Linux; without it, lots of files would be left in /dev/shm.
      // Clean up the shared memory file before exiting.
6233       __kmp_unregister_library();
6234       return;
6235     }
6236   }
6237   /* synchronize the termination process */
6238   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6239 
6240   /* have we already finished */
6241   if (__kmp_global.g.g_abort) {
6242     KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6243     /* TODO abort? */
6244     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6245     return;
6246   }
6247   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6248     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6249     return;
6250   }
6251 
6252   /* We need this lock to enforce mutex between this reading of
6253      __kmp_threads_capacity and the writing by __kmp_register_root.
6254      Alternatively, we can use a counter of roots that is atomically updated by
6255      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6256      __kmp_internal_end_*.  */
6257   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6258 
6259   /* now we can safely conduct the actual termination */
6260   __kmp_internal_end();
6261 
6262   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6263   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6264 
6265   KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6266 
6267 #ifdef DUMP_DEBUG_ON_EXIT
6268   if (__kmp_debug_buf)
6269     __kmp_dump_debug_buffer();
6270 #endif
6271 
6272 #if KMP_OS_WINDOWS
6273   __kmp_close_console();
6274 #endif
6275 
6276   __kmp_fini_allocator();
6277 
6278 } // __kmp_internal_end_library
6279 
6280 void __kmp_internal_end_thread(int gtid_req) {
6281   int i;
6282 
6283   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6284   /* this shouldn't be a race condition because __kmp_internal_end() is the
6285    * only place to clear __kmp_serial_init */
6286   /* we'll check this later too, after we get the lock */
6287   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6288   // redundant, because the next check will work in any case.
6289   if (__kmp_global.g.g_abort) {
6290     KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6291     /* TODO abort? */
6292     return;
6293   }
6294   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6295     KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6296     return;
6297   }
6298 
6299   // If hidden helper team has been initialized, we need to deinit it
6300   if (TCR_4(__kmp_init_hidden_helper)) {
6301     TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6302     // First release the main thread to let it continue its work
6303     __kmp_hidden_helper_main_thread_release();
6304     // Wait until the hidden helper team has been destroyed
6305     __kmp_hidden_helper_threads_deinitz_wait();
6306   }
6307 
6308   KMP_MB(); /* Flush all pending memory write invalidates.  */
6309 
6310   /* find out who we are and what we should do */
6311   {
6312     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6313     KA_TRACE(10,
6314              ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req));
6315     if (gtid == KMP_GTID_SHUTDOWN) {
6316       KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6317                     "already shutdown\n"));
6318       return;
6319     } else if (gtid == KMP_GTID_MONITOR) {
6320       KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6321                     "registered, or system shutdown\n"));
6322       return;
6323     } else if (gtid == KMP_GTID_DNE) {
6324       KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6325                     "shutdown\n"));
6326       return;
6327       /* we don't know who we are */
6328     } else if (KMP_UBER_GTID(gtid)) {
6329       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6330       if (__kmp_root[gtid]->r.r_active) {
6331         __kmp_global.g.g_abort = -1;
6332         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6333         KA_TRACE(10,
6334                  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6335                   gtid));
6336         return;
6337       } else {
6338         KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6339                       gtid));
6340         __kmp_unregister_root_current_thread(gtid);
6341       }
6342     } else {
6343       /* just a worker thread, let's leave */
6344       KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6345 
6346       if (gtid >= 0) {
6347         __kmp_threads[gtid]->th.th_task_team = NULL;
6348       }
6349 
6350       KA_TRACE(10,
6351                ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6352                 gtid));
6353       return;
6354     }
6355   }
6356 #if KMP_DYNAMIC_LIB
6357   if (__kmp_pause_status != kmp_hard_paused)
  // AC: let's not shut down the dynamic library at the exit of an uber thread;
  // it is better to shut down later, in the library destructor.
6360   {
6361     KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6362     return;
6363   }
6364 #endif
6365   /* synchronize the termination process */
6366   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6367 
6368   /* have we already finished */
6369   if (__kmp_global.g.g_abort) {
6370     KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6371     /* TODO abort? */
6372     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6373     return;
6374   }
6375   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6376     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6377     return;
6378   }
6379 
6380   /* We need this lock to enforce mutex between this reading of
6381      __kmp_threads_capacity and the writing by __kmp_register_root.
6382      Alternatively, we can use a counter of roots that is atomically updated by
6383      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6384      __kmp_internal_end_*.  */
6385 
6386   /* should we finish the run-time?  are all siblings done? */
6387   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6388 
6389   for (i = 0; i < __kmp_threads_capacity; ++i) {
6390     if (KMP_UBER_GTID(i)) {
6391       KA_TRACE(
6392           10,
6393           ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6394       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6395       __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6396       return;
6397     }
6398   }
6399 
6400   /* now we can safely conduct the actual termination */
6401 
6402   __kmp_internal_end();
6403 
6404   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6405   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6406 
6407   KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6408 
6409 #ifdef DUMP_DEBUG_ON_EXIT
6410   if (__kmp_debug_buf)
6411     __kmp_dump_debug_buffer();
6412 #endif
6413 } // __kmp_internal_end_thread
6414 
6415 // -----------------------------------------------------------------------------
6416 // Library registration stuff.
6417 
6418 static long __kmp_registration_flag = 0;
6419 // Random value used to indicate library initialization.
6420 static char *__kmp_registration_str = NULL;
6421 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6422 
6423 static inline char *__kmp_reg_status_name() {
6424 /* On RHEL 3u5 if linked statically, getpid() returns different values in
6425    each thread. If registration and unregistration go in different threads
   (omp_misc_other_root_exit.cpp test case), the name of the registered_lib_env
   env var cannot be found, because the name will contain a different pid. */
6428 // macOS* complains about name being too long with additional getuid()
6429 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6430   return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6431                           (int)getuid());
6432 #else
6433   return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6434 #endif
} // __kmp_reg_status_name
6436 
6437 void __kmp_register_library_startup(void) {
6438 
6439   char *name = __kmp_reg_status_name(); // Name of the environment variable.
6440   int done = 0;
6441   union {
6442     double dtime;
6443     long ltime;
6444   } time;
6445 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6446   __kmp_initialize_system_tick();
6447 #endif
6448   __kmp_read_system_time(&time.dtime);
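  // Compose a quasi-unique flag: a recognizable 0xCAFE marker in the high half
  // plus time-derived low bits, making it unlikely that a stale value left
  // behind by a dead process matches a fresh registration by accident.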
6449   __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6450   __kmp_registration_str =
6451       __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6452                        __kmp_registration_flag, KMP_LIBRARY_FILE);
6453 
6454   KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6455                 __kmp_registration_str));
6456 
6457   while (!done) {
6458 
6459     char *value = NULL; // Actual value of the environment variable.
6460 
6461 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6462     char *shm_name = __kmp_str_format("/%s", name);
6463     int shm_preexist = 0;
6464     char *data1;
6465     int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6466     if ((fd1 == -1) && (errno == EEXIST)) {
6467       // file didn't open because it already exists.
6468       // try opening existing file
6469       fd1 = shm_open(shm_name, O_RDWR, 0666);
6470       if (fd1 == -1) { // file didn't open
6471         // error out here
6472         __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0),
6473                     __kmp_msg_null);
6474       } else {
6475         // able to open existing file
6476         shm_preexist = 1;
6477       }
    } else if (fd1 == -1) {
      // SHM didn't open due to an error other than the file already existing;
      // error out here.
6481       __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM2"), KMP_ERR(errno),
6482                   __kmp_msg_null);
6483     }
6484     if (shm_preexist == 0) {
6485       // we created SHM now set size
6486       if (ftruncate(fd1, SHM_SIZE) == -1) {
        // error occurred setting size;
6488         __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"),
6489                     KMP_ERR(errno), __kmp_msg_null);
6490       }
6491     }
6492     data1 =
6493         (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
6494     if (data1 == MAP_FAILED) {
6495       // failed to map shared memory
6496       __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno),
6497                   __kmp_msg_null);
6498     }
6499     if (shm_preexist == 0) { // set data to SHM, set value
6500       KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6501     }
6502     // Read value from either what we just wrote or existing file.
6503     value = __kmp_str_format("%s", data1); // read value from SHM
6504     munmap(data1, SHM_SIZE);
6505     close(fd1);
6506 #else // Windows and unix with static library
    // Set the environment variable, but do not overwrite it if it already exists.
6508     __kmp_env_set(name, __kmp_registration_str, 0);
6509     // read value to see if it got set
6510     value = __kmp_env_get(name);
6511 #endif
6512 
6513     if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6514       done = 1; // Ok, environment variable set successfully, exit the loop.
6515     } else {
      // Oops. Write failed. Another copy of the OpenMP RTL is in memory.
      // Check whether it is alive or dead.
6518       int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6519       char *tail = value;
6520       char *flag_addr_str = NULL;
6521       char *flag_val_str = NULL;
6522       char const *file_name = NULL;
6523       __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6524       __kmp_str_split(tail, '-', &flag_val_str, &tail);
6525       file_name = tail;
6526       if (tail != NULL) {
6527         long *flag_addr = 0;
6528         long flag_val = 0;
6529         KMP_SSCANF(flag_addr_str, "%p", RCAST(void**, &flag_addr));
6530         KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6531         if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6532           // First, check whether environment-encoded address is mapped into
6533           // addr space.
6534           // If so, dereference it to see if it still has the right value.
6535           if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6536             neighbor = 1;
6537           } else {
6538             // If not, then we know the other copy of the library is no longer
6539             // running.
6540             neighbor = 2;
6541           }
6542         }
6543       }
6544       switch (neighbor) {
6545       case 0: // Cannot parse environment variable -- neighbor status unknown.
        // Assume it is the incompatible format of a future version of the
        // library. Assume the other library is alive.
6548         // WARN( ... ); // TODO: Issue a warning.
6549         file_name = "unknown library";
6550         KMP_FALLTHROUGH();
      // Attention! Falling through to the next case. That's intentional.
6552       case 1: { // Neighbor is alive.
6553         // Check it is allowed.
6554         char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6555         if (!__kmp_str_match_true(duplicate_ok)) {
6556           // That's not allowed. Issue fatal error.
6557           __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6558                       KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6559         }
6560         KMP_INTERNAL_FREE(duplicate_ok);
6561         __kmp_duplicate_library_ok = 1;
6562         done = 1; // Exit the loop.
6563       } break;
6564       case 2: { // Neighbor is dead.
6565 
6566 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6567         // close shared memory.
6568         shm_unlink(shm_name); // this removes file in /dev/shm
6569 #else
6570         // Clear the variable and try to register library again.
6571         __kmp_env_unset(name);
6572 #endif
6573       } break;
6574       default: { KMP_DEBUG_ASSERT(0); } break;
6575       }
6576     }
6577     KMP_INTERNAL_FREE((void *)value);
6578 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6579     KMP_INTERNAL_FREE((void *)shm_name);
6580 #endif
6581   } // while
6582   KMP_INTERNAL_FREE((void *)name);
6583 
6584 } // func __kmp_register_library_startup
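
// Editorial sketch (not part of the runtime): the registration value written
// above has the form "<flag address>-<flag value>-<library file>", produced by
// __kmp_str_format("%p-%lx-%s", ...). A second copy of the runtime decides
// whether the registrant is still alive roughly as follows; the helper name
// neighbor_is_alive() and the use of strtok/sscanf are illustrative
// simplifications of __kmp_str_split/KMP_SSCANF.
//
//   #include <cstdio>
//   #include <cstring>
//
//   static bool neighbor_is_alive(char *value, bool (*is_mapped)(void *)) {
//     char *addr_str = std::strtok(value, "-");
//     char *val_str = std::strtok(nullptr, "-");
//     char *file = std::strtok(nullptr, ""); // remainder of the string
//     if (!addr_str || !val_str || !file)
//       return true; // unparseable: assume a (possibly newer) live copy
//     long *flag_addr = nullptr;
//     long flag_val = 0;
//     std::sscanf(addr_str, "%p", reinterpret_cast<void **>(&flag_addr));
//     std::sscanf(val_str, "%lx", &flag_val);
//     // Alive only if the recorded address is still mapped in this process
//     // and still holds the recorded flag value.
//     return is_mapped(flag_addr) && *flag_addr == flag_val;
//   }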
6585 
6586 void __kmp_unregister_library(void) {
6587 
6588   char *name = __kmp_reg_status_name();
6589   char *value = NULL;
6590 
6591 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6592   char *shm_name = __kmp_str_format("/%s", name);
6593   int fd1 = shm_open(shm_name, O_RDONLY, 0666);
6594   if (fd1 == -1) {
6595     // file did not open. return.
6596     return;
6597   }
6598   char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6599   if (data1 != MAP_FAILED) {
6600     value = __kmp_str_format("%s", data1); // read value from SHM
6601     munmap(data1, SHM_SIZE);
6602   }
6603   close(fd1);
6604 #else
6605   value = __kmp_env_get(name);
6606 #endif
6607 
6608   KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6609   KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6610   if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6611 //  Ok, this is our variable. Delete it.
6612 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6613     shm_unlink(shm_name); // this removes file in /dev/shm
6614 #else
6615     __kmp_env_unset(name);
6616 #endif
6617   }
6618 
6619 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6620   KMP_INTERNAL_FREE(shm_name);
6621 #endif
6622 
6623   KMP_INTERNAL_FREE(__kmp_registration_str);
6624   KMP_INTERNAL_FREE(value);
6625   KMP_INTERNAL_FREE(name);
6626 
6627   __kmp_registration_flag = 0;
6628   __kmp_registration_str = NULL;
6629 
6630 } // __kmp_unregister_library
6631 
6632 // End of Library registration stuff.
6633 // -----------------------------------------------------------------------------
6634 
6635 #if KMP_MIC_SUPPORTED
6636 
6637 static void __kmp_check_mic_type() {
6638   kmp_cpuid_t cpuid_state = {0};
6639   kmp_cpuid_t *cs_p = &cpuid_state;
6640   __kmp_x86_cpuid(1, 0, cs_p);
6641   // We don't support mic1 at the moment
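  // (0xB10 decodes to family 0x0B / model 1, i.e. KNC; 0x50670 decodes to
  //  family 6 / model 0x57, i.e. KNL.)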
6642   if ((cs_p->eax & 0xff0) == 0xB10) {
6643     __kmp_mic_type = mic2;
6644   } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6645     __kmp_mic_type = mic3;
6646   } else {
6647     __kmp_mic_type = non_mic;
6648   }
6649 }
6650 
6651 #endif /* KMP_MIC_SUPPORTED */
6652 
6653 #if KMP_HAVE_UMWAIT
6654 static void __kmp_user_level_mwait_init() {
6655   struct kmp_cpuid buf;
6656   __kmp_x86_cpuid(7, 0, &buf);
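  // CPUID.(EAX=07H,ECX=0):ECX bit 5 is the WAITPKG feature flag
  // (umonitor/umwait/tpause), which gates user-level mwait support here.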
6657   __kmp_umwait_enabled = ((buf.ecx >> 5) & 1) && __kmp_user_level_mwait;
6658   KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
6659                 __kmp_umwait_enabled));
6660 }
6661 #elif KMP_HAVE_MWAIT
6662 #ifndef AT_INTELPHIUSERMWAIT
// Spurious, non-existent value that should always fail to return anything.
// Will be replaced with the correct value once it is known.
6665 #define AT_INTELPHIUSERMWAIT 10000
6666 #endif
// The getauxval() function is available in RHEL7 and SLES12. If a system with an
6668 // earlier OS is used to build the RTL, we'll use the following internal
6669 // function when the entry is not found.
6670 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
6671 unsigned long getauxval(unsigned long) { return 0; }
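// Being declared weak, this stub only takes effect when the C library does not
// provide a real getauxval; otherwise the libc definition is used.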
6672 
6673 static void __kmp_user_level_mwait_init() {
6674   // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available
6675   // use them to find if the user-level mwait is enabled. Otherwise, forcibly
6676   // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable
6677   // KMP_USER_LEVEL_MWAIT was set to TRUE.
6678   if (__kmp_mic_type == mic3) {
6679     unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
6680     if ((res & 0x1) || __kmp_user_level_mwait) {
6681       __kmp_mwait_enabled = TRUE;
6682       if (__kmp_user_level_mwait) {
6683         KMP_INFORM(EnvMwaitWarn);
6684       }
6685     } else {
6686       __kmp_mwait_enabled = FALSE;
6687     }
6688   }
6689   KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
6690                 "__kmp_mwait_enabled = %d\n",
6691                 __kmp_mic_type, __kmp_mwait_enabled));
6692 }
6693 #endif /* KMP_HAVE_UMWAIT */
6694 
6695 static void __kmp_do_serial_initialize(void) {
6696   int i, gtid;
6697   size_t size;
6698 
6699   KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6700 
6701   KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6702   KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6703   KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6704   KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6705   KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6706 
6707 #if OMPT_SUPPORT
6708   ompt_pre_init();
6709 #endif
6710 
6711   __kmp_validate_locks();
6712 
6713   /* Initialize internal memory allocator */
6714   __kmp_init_allocator();
6715 
6716   /* Register the library startup via an environment variable and check to see
6717      whether another copy of the library is already registered. */
6718 
6719   __kmp_register_library_startup();
6720 
6721   /* TODO reinitialization of library */
6722   if (TCR_4(__kmp_global.g.g_done)) {
6723     KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6724   }
6725 
6726   __kmp_global.g.g_abort = 0;
6727   TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6728 
6729 /* initialize the locks */
6730 #if KMP_USE_ADAPTIVE_LOCKS
6731 #if KMP_DEBUG_ADAPTIVE_LOCKS
6732   __kmp_init_speculative_stats();
6733 #endif
6734 #endif
6735 #if KMP_STATS_ENABLED
6736   __kmp_stats_init();
6737 #endif
6738   __kmp_init_lock(&__kmp_global_lock);
6739   __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6740   __kmp_init_lock(&__kmp_debug_lock);
6741   __kmp_init_atomic_lock(&__kmp_atomic_lock);
6742   __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6743   __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6744   __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6745   __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6746   __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6747   __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6748   __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6749   __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6750   __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6751   __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6752   __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6753   __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6754   __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6755   __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6756 #if KMP_USE_MONITOR
6757   __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6758 #endif
6759   __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6760 
6761   /* conduct initialization and initial setup of configuration */
6762 
6763   __kmp_runtime_initialize();
6764 
6765 #if KMP_MIC_SUPPORTED
6766   __kmp_check_mic_type();
6767 #endif
6768 
6769 // Some global variable initialization moved here from kmp_env_initialize()
6770 #ifdef KMP_DEBUG
6771   kmp_diag = 0;
6772 #endif
6773   __kmp_abort_delay = 0;
6774 
6775   // From __kmp_init_dflt_team_nth()
6776   /* assume the entire machine will be used */
6777   __kmp_dflt_team_nth_ub = __kmp_xproc;
6778   if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6779     __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6780   }
6781   if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6782     __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6783   }
6784   __kmp_max_nth = __kmp_sys_max_nth;
6785   __kmp_cg_max_nth = __kmp_sys_max_nth;
6786   __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6787   if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6788     __kmp_teams_max_nth = __kmp_sys_max_nth;
6789   }
6790 
6791   // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6792   // part
6793   __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6794 #if KMP_USE_MONITOR
6795   __kmp_monitor_wakeups =
6796       KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6797   __kmp_bt_intervals =
6798       KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6799 #endif
6800   // From "KMP_LIBRARY" part of __kmp_env_initialize()
6801   __kmp_library = library_throughput;
6802   // From KMP_SCHEDULE initialization
6803   __kmp_static = kmp_sch_static_balanced;
// AC: do not use analytical here, because it is non-monotonic
6805 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6806 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6807 // need to repeat assignment
6808 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6809 // bit control and barrier method control parts
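// (A barrier's branch bits give the fan-out of the tree/hyper barrier
//  algorithms as 2^bits; the reduction-barrier settings below, for example,
//  request a branching factor of 2^1 = 2.)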
6810 #if KMP_FAST_REDUCTION_BARRIER
6811 #define kmp_reduction_barrier_gather_bb ((int)1)
6812 #define kmp_reduction_barrier_release_bb ((int)1)
6813 #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6814 #define kmp_reduction_barrier_release_pat bp_hyper_bar
6815 #endif // KMP_FAST_REDUCTION_BARRIER
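  // Note: the gather/release branch bits select the fan-out of the barrier
  // tree (roughly 2^bits children per node) and the pattern selects the
  // algorithm (e.g. linear, tree, hyper, hierarchical). The defaults set here
  // can still be overridden when __kmp_env_initialize() parses the
  // barrier-related settings below.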
6816   for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6817     __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6818     __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6819     __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6820     __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6821 #if KMP_FAST_REDUCTION_BARRIER
    if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only
      // (lin_64): hyper,1
6824       __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6825       __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6826       __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6827       __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6828     }
6829 #endif // KMP_FAST_REDUCTION_BARRIER
6830   }
6831 #if KMP_FAST_REDUCTION_BARRIER
6832 #undef kmp_reduction_barrier_release_pat
6833 #undef kmp_reduction_barrier_gather_pat
6834 #undef kmp_reduction_barrier_release_bb
6835 #undef kmp_reduction_barrier_gather_bb
6836 #endif // KMP_FAST_REDUCTION_BARRIER
6837 #if KMP_MIC_SUPPORTED
6838   if (__kmp_mic_type == mic2) { // KNC
6839     // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6840     __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6841     __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6842         1; // forkjoin release
6843     __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6844     __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6845   }
6846 #if KMP_FAST_REDUCTION_BARRIER
6847   if (__kmp_mic_type == mic2) { // KNC
6848     __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6849     __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6850   }
6851 #endif // KMP_FAST_REDUCTION_BARRIER
6852 #endif // KMP_MIC_SUPPORTED
6853 
6854 // From KMP_CHECKS initialization
6855 #ifdef KMP_DEBUG
6856   __kmp_env_checks = TRUE; /* development versions have the extra checks */
6857 #else
6858   __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6859 #endif
6860 
6861   // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6862   __kmp_foreign_tp = TRUE;
6863 
6864   __kmp_global.g.g_dynamic = FALSE;
6865   __kmp_global.g.g_dynamic_mode = dynamic_default;
6866 
6867   __kmp_env_initialize(NULL);
6868 
6869 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
6870   __kmp_user_level_mwait_init();
6871 #endif
6872 // Print all messages in message catalog for testing purposes.
6873 #ifdef KMP_DEBUG
6874   char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6875   if (__kmp_str_match_true(val)) {
6876     kmp_str_buf_t buffer;
6877     __kmp_str_buf_init(&buffer);
6878     __kmp_i18n_dump_catalog(&buffer);
6879     __kmp_printf("%s", buffer.str);
6880     __kmp_str_buf_free(&buffer);
6881   }
6882   __kmp_env_free(&val);
6883 #endif
6884 
6885   __kmp_threads_capacity =
6886       __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6887   // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6888   __kmp_tp_capacity = __kmp_default_tp_capacity(
6889       __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6890 
6891   // If the library is shut down properly, both pools must be NULL. Just in
6892   // case, set them to NULL -- some memory may leak, but subsequent code will
6893   // work even if pools are not freed.
6894   KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6895   KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6896   KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6897   __kmp_thread_pool = NULL;
6898   __kmp_thread_pool_insert_pt = NULL;
6899   __kmp_team_pool = NULL;
6900 
6901   /* Allocate all of the variable sized records */
6902   /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6903    * expandable */
6904   /* Since allocation is cache-aligned, just add extra padding at the end */
6905   size =
6906       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6907       CACHE_LINE;
6908   __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6909   __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6910                                sizeof(kmp_info_t *) * __kmp_threads_capacity);
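  // Layout of the single cache-aligned allocation made above:
  //   __kmp_threads: kmp_info_t *[__kmp_threads_capacity]
  //   __kmp_root:    kmp_root_t *[__kmp_threads_capacity]  (same block)
  //   followed by CACHE_LINE bytes of padding
  // Because both arrays live in one block, __kmp_cleanup() frees only
  // __kmp_threads.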
6911 
6912   /* init thread counts */
6913   KMP_DEBUG_ASSERT(__kmp_all_nth ==
6914                    0); // Asserts fail if the library is reinitializing and
6915   KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
6916   __kmp_all_nth = 0;
6917   __kmp_nth = 0;
6918 
6919   /* setup the uber master thread and hierarchy */
6920   gtid = __kmp_register_root(TRUE);
6921   KA_TRACE(10, ("__kmp_do_serial_initialize  T#%d\n", gtid));
6922   KMP_ASSERT(KMP_UBER_GTID(gtid));
6923   KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6924 
6925   KMP_MB(); /* Flush all pending memory write invalidates.  */
6926 
6927   __kmp_common_initialize();
6928 
6929 #if KMP_OS_UNIX
  /* register the atfork handlers so fork() in child processes is handled */
6931   __kmp_register_atfork();
6932 #endif
6933 
6934 #if !KMP_DYNAMIC_LIB
6935   {
6936     /* Invoke the exit handler when the program finishes, only for static
6937        library. For dynamic library, we already have _fini and DllMain. */
6938     int rc = atexit(__kmp_internal_end_atexit);
6939     if (rc != 0) {
6940       __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6941                   __kmp_msg_null);
6942     }
6943   }
6944 #endif
6945 
6946 #if KMP_HANDLE_SIGNALS
6947 #if KMP_OS_UNIX
6948   /* NOTE: make sure that this is called before the user installs their own
6949      signal handlers so that the user handlers are called first. this way they
6950      can return false, not call our handler, avoid terminating the library, and
6951      continue execution where they left off. */
6952   __kmp_install_signals(FALSE);
6953 #endif /* KMP_OS_UNIX */
6954 #if KMP_OS_WINDOWS
6955   __kmp_install_signals(TRUE);
6956 #endif /* KMP_OS_WINDOWS */
6957 #endif
6958 
6959   /* we have finished the serial initialization */
6960   __kmp_init_counter++;
6961 
6962   __kmp_init_serial = TRUE;
6963 
6964   if (__kmp_settings) {
6965     __kmp_env_print();
6966   }
6967 
6968   if (__kmp_display_env || __kmp_display_env_verbose) {
6969     __kmp_env_print_2();
6970   }
6971 
6972 #if OMPT_SUPPORT
6973   ompt_post_init();
6974 #endif
6975 
6976   KMP_MB();
6977 
6978   KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
6979 }
6980 
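// The *_initialize() entry points below share the same double-checked pattern:
// an unsynchronized fast-path check of the init flag, then a re-check under
// __kmp_initz_lock, so exactly one thread runs the corresponding
// __kmp_do_*_initialize() routine while late callers return immediately.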
6981 void __kmp_serial_initialize(void) {
6982   if (__kmp_init_serial) {
6983     return;
6984   }
6985   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6986   if (__kmp_init_serial) {
6987     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6988     return;
6989   }
6990   __kmp_do_serial_initialize();
6991   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6992 }
6993 
6994 static void __kmp_do_middle_initialize(void) {
6995   int i, j;
6996   int prev_dflt_team_nth;
6997 
6998   if (!__kmp_init_serial) {
6999     __kmp_do_serial_initialize();
7000   }
7001 
7002   KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
7003 
7004   // Save the previous value for the __kmp_dflt_team_nth so that
7005   // we can avoid some reinitialization if it hasn't changed.
7006   prev_dflt_team_nth = __kmp_dflt_team_nth;
7007 
7008 #if KMP_AFFINITY_SUPPORTED
7009   // __kmp_affinity_initialize() will try to set __kmp_ncores to the
7010   // number of cores on the machine.
7011   __kmp_affinity_initialize();
7012 
7013   // Run through the __kmp_threads array and set the affinity mask
7014   // for each root thread that is currently registered with the RTL.
7015   for (i = 0; i < __kmp_threads_capacity; i++) {
7016     if (TCR_PTR(__kmp_threads[i]) != NULL) {
7017       __kmp_affinity_set_init_mask(i, TRUE);
7018     }
7019   }
7020 #endif /* KMP_AFFINITY_SUPPORTED */
7021 
7022   KMP_ASSERT(__kmp_xproc > 0);
7023   if (__kmp_avail_proc == 0) {
7024     __kmp_avail_proc = __kmp_xproc;
7025   }
7026 
7027   // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
7028   // correct them now
7029   j = 0;
7030   while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
7031     __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
7032         __kmp_avail_proc;
7033     j++;
7034   }
7035 
7036   if (__kmp_dflt_team_nth == 0) {
7037 #ifdef KMP_DFLT_NTH_CORES
7038     // Default #threads = #cores
7039     __kmp_dflt_team_nth = __kmp_ncores;
7040     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7041                   "__kmp_ncores (%d)\n",
7042                   __kmp_dflt_team_nth));
7043 #else
7044     // Default #threads = #available OS procs
7045     __kmp_dflt_team_nth = __kmp_avail_proc;
7046     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7047                   "__kmp_avail_proc(%d)\n",
7048                   __kmp_dflt_team_nth));
7049 #endif /* KMP_DFLT_NTH_CORES */
7050   }
7051 
7052   if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
7053     __kmp_dflt_team_nth = KMP_MIN_NTH;
7054   }
7055   if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7056     __kmp_dflt_team_nth = __kmp_sys_max_nth;
7057   }
7058 
7059   // There's no harm in continuing if the following check fails,
7060   // but it indicates an error in the previous logic.
7061   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7062 
7063   if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7064     // Run through the __kmp_threads array and set the num threads icv for each
7065     // root thread that is currently registered with the RTL (which has not
7066     // already explicitly set its nthreads-var with a call to
7067     // omp_set_num_threads()).
7068     for (i = 0; i < __kmp_threads_capacity; i++) {
7069       kmp_info_t *thread = __kmp_threads[i];
7070       if (thread == NULL)
7071         continue;
7072       if (thread->th.th_current_task->td_icvs.nproc != 0)
7073         continue;
7074 
7075       set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7076     }
7077   }
7078   KA_TRACE(
7079       20,
7080       ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7081        __kmp_dflt_team_nth));
7082 
7083 #ifdef KMP_ADJUST_BLOCKTIME
7084   /* Adjust blocktime to zero if necessary  now that __kmp_avail_proc is set */
7085   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7086     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7087     if (__kmp_nth > __kmp_avail_proc) {
7088       __kmp_zero_bt = TRUE;
7089     }
7090   }
7091 #endif /* KMP_ADJUST_BLOCKTIME */
7092 
7093   /* we have finished middle initialization */
7094   TCW_SYNC_4(__kmp_init_middle, TRUE);
7095 
7096   KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7097 }
7098 
7099 void __kmp_middle_initialize(void) {
7100   if (__kmp_init_middle) {
7101     return;
7102   }
7103   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7104   if (__kmp_init_middle) {
7105     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7106     return;
7107   }
7108   __kmp_do_middle_initialize();
7109   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7110 }
7111 
7112 void __kmp_parallel_initialize(void) {
7113   int gtid = __kmp_entry_gtid(); // this might be a new root
7114 
7115   /* synchronize parallel initialization (for sibling) */
7116   if (TCR_4(__kmp_init_parallel))
7117     return;
7118   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7119   if (TCR_4(__kmp_init_parallel)) {
7120     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7121     return;
7122   }
7123 
7124   /* TODO reinitialization after we have already shut down */
7125   if (TCR_4(__kmp_global.g.g_done)) {
7126     KA_TRACE(
7127         10,
7128         ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7129     __kmp_infinite_loop();
7130   }
7131 
  /* jc: The lock __kmp_initz_lock is already held, so calling
     __kmp_serial_initialize or __kmp_middle_initialize would deadlock. So we
     call __kmp_do_middle_initialize directly (it runs the serial
     initialization itself if needed). */
7135   if (!__kmp_init_middle) {
7136     __kmp_do_middle_initialize();
7137   }
7138   __kmp_resume_if_hard_paused();
7139 
7140   /* begin initialization */
7141   KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7142   KMP_ASSERT(KMP_UBER_GTID(gtid));
7143 
7144 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7145   // Save the FP control regs.
7146   // Worker threads will set theirs to these values at thread startup.
7147   __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7148   __kmp_store_mxcsr(&__kmp_init_mxcsr);
7149   __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7150 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7151 
7152 #if KMP_OS_UNIX
7153 #if KMP_HANDLE_SIGNALS
7154   /*  must be after __kmp_serial_initialize  */
7155   __kmp_install_signals(TRUE);
7156 #endif
7157 #endif
7158 
7159   __kmp_suspend_initialize();
7160 
7161 #if defined(USE_LOAD_BALANCE)
7162   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7163     __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7164   }
7165 #else
7166   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7167     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7168   }
7169 #endif
7170 
7171   if (__kmp_version) {
7172     __kmp_print_version_2();
7173   }
7174 
7175   /* we have finished parallel initialization */
7176   TCW_SYNC_4(__kmp_init_parallel, TRUE);
7177 
7178   KMP_MB();
7179   KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7180 
7181   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7182 }
7183 
7184 void __kmp_hidden_helper_initialize() {
7185   if (TCR_4(__kmp_init_hidden_helper))
7186     return;
7187 
7188   // __kmp_parallel_initialize is required before we initialize hidden helper
7189   if (!TCR_4(__kmp_init_parallel))
7190     __kmp_parallel_initialize();
7191 
  // Double check. Note that this double check should not be placed before
  // __kmp_parallel_initialize as it would cause a deadlock.
7194   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7195   if (TCR_4(__kmp_init_hidden_helper)) {
7196     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7197     return;
7198   }
7199 
7200   // Set the count of hidden helper tasks to be executed to zero
7201   KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7202 
7203   // Set the global variable indicating that we're initializing hidden helper
7204   // team/threads
7205   TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7206 
7207   // Platform independent initialization
7208   __kmp_do_initialize_hidden_helper_threads();
7209 
7210   // Wait here for the finish of initialization of hidden helper teams
7211   __kmp_hidden_helper_threads_initz_wait();
7212 
7213   // We have finished hidden helper initialization
7214   TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7215 
7216   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7217 }
7218 
7219 /* ------------------------------------------------------------------------ */
7220 
7221 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7222                                    kmp_team_t *team) {
7223   kmp_disp_t *dispatch;
7224 
7225   KMP_MB();
7226 
7227   /* none of the threads have encountered any constructs, yet. */
7228   this_thr->th.th_local.this_construct = 0;
7229 #if KMP_CACHE_MANAGE
7230   KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7231 #endif /* KMP_CACHE_MANAGE */
7232   dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7233   KMP_DEBUG_ASSERT(dispatch);
7234   KMP_DEBUG_ASSERT(team->t.t_dispatch);
7235   // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7236   // this_thr->th.th_info.ds.ds_tid ] );
7237 
7238   dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7239   dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7240   if (__kmp_env_consistency_check)
7241     __kmp_push_parallel(gtid, team->t.t_ident);
7242 
7243   KMP_MB(); /* Flush all pending memory write invalidates.  */
7244 }
7245 
7246 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7247                                   kmp_team_t *team) {
7248   if (__kmp_env_consistency_check)
7249     __kmp_pop_parallel(gtid, team->t.t_ident);
7250 
7251   __kmp_finish_implicit_task(this_thr);
7252 }
7253 
7254 int __kmp_invoke_task_func(int gtid) {
7255   int rc;
7256   int tid = __kmp_tid_from_gtid(gtid);
7257   kmp_info_t *this_thr = __kmp_threads[gtid];
7258   kmp_team_t *team = this_thr->th.th_team;
7259 
7260   __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7261 #if USE_ITT_BUILD
7262   if (__itt_stack_caller_create_ptr) {
7263     __kmp_itt_stack_callee_enter(
7264         (__itt_caller)
7265             team->t.t_stack_id); // inform ittnotify about entering user's code
7266   }
7267 #endif /* USE_ITT_BUILD */
7268 #if INCLUDE_SSC_MARKS
7269   SSC_MARK_INVOKING();
7270 #endif
7271 
7272 #if OMPT_SUPPORT
7273   void *dummy;
7274   void **exit_frame_p;
7275   ompt_data_t *my_task_data;
7276   ompt_data_t *my_parallel_data;
7277   int ompt_team_size;
7278 
7279   if (ompt_enabled.enabled) {
    exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
                         .ompt_task_info.frame.exit_frame.ptr);
7282   } else {
7283     exit_frame_p = &dummy;
7284   }
7285 
7286   my_task_data =
7287       &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7288   my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7289   if (ompt_enabled.ompt_callback_implicit_task) {
7290     ompt_team_size = team->t.t_nproc;
7291     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7292         ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7293         __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7294     OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7295   }
7296 #endif
7297 
7298 #if KMP_STATS_ENABLED
7299   stats_state_e previous_state = KMP_GET_THREAD_STATE();
7300   if (previous_state == stats_state_e::TEAMS_REGION) {
7301     KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7302   } else {
7303     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7304   }
7305   KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7306 #endif
7307 
7308   rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7309                               tid, (int)team->t.t_argc, (void **)team->t.t_argv
7310 #if OMPT_SUPPORT
7311                               ,
7312                               exit_frame_p
7313 #endif
7314                               );
7315 #if OMPT_SUPPORT
  *exit_frame_p = NULL;
  this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7318 #endif
7319 
7320 #if KMP_STATS_ENABLED
7321   if (previous_state == stats_state_e::TEAMS_REGION) {
7322     KMP_SET_THREAD_STATE(previous_state);
7323   }
7324   KMP_POP_PARTITIONED_TIMER();
7325 #endif
7326 
7327 #if USE_ITT_BUILD
7328   if (__itt_stack_caller_create_ptr) {
7329     __kmp_itt_stack_callee_leave(
7330         (__itt_caller)
7331             team->t.t_stack_id); // inform ittnotify about leaving user's code
7332   }
7333 #endif /* USE_ITT_BUILD */
7334   __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7335 
7336   return rc;
7337 }
7338 
7339 void __kmp_teams_master(int gtid) {
7340   // This routine is called by all master threads in teams construct
7341   kmp_info_t *thr = __kmp_threads[gtid];
7342   kmp_team_t *team = thr->th.th_team;
7343   ident_t *loc = team->t.t_ident;
7344   thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7345   KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7346   KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7347   KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7348                 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7349 
7350   // This thread is a new CG root.  Set up the proper variables.
7351   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7352   tmp->cg_root = thr; // Make thr the CG root
7353   // Init to thread limit that was stored when league masters were forked
7354   tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7355   tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7356   KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7357                  " cg_nthreads to 1\n",
7358                  thr, tmp));
7359   tmp->up = thr->th.th_cg_roots;
7360   thr->th.th_cg_roots = tmp;
7361 
// Launch the league of teams now, but do not let the workers execute
// (they wait on the fork barrier until the next parallel region)
7364 #if INCLUDE_SSC_MARKS
7365   SSC_MARK_FORKING();
7366 #endif
7367   __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7368                   (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7369                   VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7370 #if INCLUDE_SSC_MARKS
7371   SSC_MARK_JOINING();
7372 #endif
7373   // If the team size was reduced from the limit, set it to the new size
7374   if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7375     thr->th.th_teams_size.nth = thr->th.th_team_nproc;
  // AC: last parameter "1" eliminates the join barrier, which won't work here
  // because the worker threads are in a fork barrier waiting for more parallel
  // regions
7378   __kmp_join_call(loc, gtid
7379 #if OMPT_SUPPORT
7380                   ,
7381                   fork_context_intel
7382 #endif
7383                   ,
7384                   1);
7385 }
7386 
7387 int __kmp_invoke_teams_master(int gtid) {
7388   kmp_info_t *this_thr = __kmp_threads[gtid];
7389   kmp_team_t *team = this_thr->th.th_team;
7390 #if KMP_DEBUG
7391   if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7392     KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7393                      (void *)__kmp_teams_master);
7394 #endif
7395   __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7396 #if OMPT_SUPPORT
7397   int tid = __kmp_tid_from_gtid(gtid);
7398   ompt_data_t *task_data =
7399       &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7400   ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7401   if (ompt_enabled.ompt_callback_implicit_task) {
7402     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7403         ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7404         ompt_task_initial);
7405     OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7406   }
7407 #endif
7408   __kmp_teams_master(gtid);
7409 #if OMPT_SUPPORT
7410   this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7411 #endif
7412   __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7413   return 1;
7414 }
7415 
/* This sets the requested number of threads for the next parallel region
   encountered by this team. Since this should be enclosed in the fork/join
   critical section, it should avoid race conditions with asymmetrical nested
   parallelism. */
7420 
7421 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7422   kmp_info_t *thr = __kmp_threads[gtid];
7423 
7424   if (num_threads > 0)
7425     thr->th.th_set_nproc = num_threads;
7426 }
7427 
7428 /* this sets the requested number of teams for the teams region and/or
7429    the number of threads for the next parallel region encountered  */
7430 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7431                           int num_threads) {
7432   kmp_info_t *thr = __kmp_threads[gtid];
7433   KMP_DEBUG_ASSERT(num_teams >= 0);
7434   KMP_DEBUG_ASSERT(num_threads >= 0);
7435 
7436   if (num_teams == 0) {
7437     if (__kmp_nteams > 0) {
7438       num_teams = __kmp_nteams;
7439     } else {
7440       num_teams = 1; // default number of teams is 1.
7441     }
7442   }
  if (num_teams > __kmp_teams_max_nth) { // too many teams requested?
7444     if (!__kmp_reserve_warn) {
7445       __kmp_reserve_warn = 1;
7446       __kmp_msg(kmp_ms_warning,
7447                 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7448                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7449     }
7450     num_teams = __kmp_teams_max_nth;
7451   }
7452   // Set number of teams (number of threads in the outer "parallel" of the
7453   // teams)
7454   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7455 
7456   // Remember the number of threads for inner parallel regions
7457   if (!TCR_4(__kmp_init_middle))
7458     __kmp_middle_initialize(); // get internal globals calculated
7459   KMP_DEBUG_ASSERT(__kmp_avail_proc);
7460   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
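  // Illustrative example (hypothetical values): with __kmp_avail_proc == 64,
  // num_teams == 4, no thread_limit clause and __kmp_teams_thread_limit == 0,
  // num_threads defaults to 64 / 4 == 16 below, and is then clamped against
  // nthreads-var, the thread-limit-var ICV and __kmp_teams_max_nth.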
7461   if (num_threads == 0) {
7462     if (__kmp_teams_thread_limit > 0) {
7463       num_threads = __kmp_teams_thread_limit;
7464     } else {
7465       num_threads = __kmp_avail_proc / num_teams;
7466     }
7467     // adjust num_threads w/o warning as it is not user setting
7468     // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7469     // no thread_limit clause specified -  do not change thread-limit-var ICV
7470     if (num_threads > __kmp_dflt_team_nth) {
7471       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7472     }
7473     if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7474       num_threads = thr->th.th_current_task->td_icvs.thread_limit;
    } // prevent team size from exceeding thread-limit-var
7476     if (num_teams * num_threads > __kmp_teams_max_nth) {
7477       num_threads = __kmp_teams_max_nth / num_teams;
7478     }
7479   } else {
7480     // This thread will be the master of the league masters
7481     // Store new thread limit; old limit is saved in th_cg_roots list
7482     thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7483     // num_threads = min(num_threads, nthreads-var)
7484     if (num_threads > __kmp_dflt_team_nth) {
7485       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7486     }
7487     if (num_teams * num_threads > __kmp_teams_max_nth) {
7488       int new_threads = __kmp_teams_max_nth / num_teams;
7489       if (!__kmp_reserve_warn) { // user asked for too many threads
7490         __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7491         __kmp_msg(kmp_ms_warning,
7492                   KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7493                   KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7494       }
7495       num_threads = new_threads;
7496     }
7497   }
7498   thr->th.th_teams_size.nth = num_threads;
7499 }
7500 
7501 // Set the proc_bind var to use in the following parallel region.
7502 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7503   kmp_info_t *thr = __kmp_threads[gtid];
7504   thr->th.th_set_proc_bind = proc_bind;
7505 }
7506 
7507 /* Launch the worker threads into the microtask. */
7508 
7509 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7510   kmp_info_t *this_thr = __kmp_threads[gtid];
7511 
7512 #ifdef KMP_DEBUG
7513   int f;
7514 #endif /* KMP_DEBUG */
7515 
7516   KMP_DEBUG_ASSERT(team);
7517   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7518   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7519   KMP_MB(); /* Flush all pending memory write invalidates.  */
7520 
7521   team->t.t_construct = 0; /* no single directives seen yet */
7522   team->t.t_ordered.dt.t_value =
7523       0; /* thread 0 enters the ordered section first */
7524 
7525   /* Reset the identifiers on the dispatch buffer */
7526   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7527   if (team->t.t_max_nproc > 1) {
7528     int i;
7529     for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7530       team->t.t_disp_buffer[i].buffer_index = i;
7531       team->t.t_disp_buffer[i].doacross_buf_idx = i;
7532     }
7533   } else {
7534     team->t.t_disp_buffer[0].buffer_index = 0;
7535     team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7536   }
7537 
7538   KMP_MB(); /* Flush all pending memory write invalidates.  */
7539   KMP_ASSERT(this_thr->th.th_team == team);
7540 
7541 #ifdef KMP_DEBUG
7542   for (f = 0; f < team->t.t_nproc; f++) {
7543     KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7544                      team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7545   }
7546 #endif /* KMP_DEBUG */
7547 
7548   /* release the worker threads so they may begin working */
7549   __kmp_fork_barrier(gtid, 0);
7550 }
7551 
7552 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7553   kmp_info_t *this_thr = __kmp_threads[gtid];
7554 
7555   KMP_DEBUG_ASSERT(team);
7556   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7557   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7558   KMP_MB(); /* Flush all pending memory write invalidates.  */
7559 
7560 /* Join barrier after fork */
7561 
7562 #ifdef KMP_DEBUG
7563   if (__kmp_threads[gtid] &&
7564       __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7565     __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7566                  __kmp_threads[gtid]);
7567     __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7568                  "team->t.t_nproc=%d\n",
7569                  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7570                  team->t.t_nproc);
7571     __kmp_print_structure();
7572   }
7573   KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7574                    __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7575 #endif /* KMP_DEBUG */
7576 
7577   __kmp_join_barrier(gtid); /* wait for everyone */
7578 #if OMPT_SUPPORT
7579   if (ompt_enabled.enabled &&
7580       this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
7581     int ds_tid = this_thr->th.th_info.ds.ds_tid;
7582     ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7583     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
7584 #if OMPT_OPTIONAL
7585     void *codeptr = NULL;
7586     if (KMP_MASTER_TID(ds_tid) &&
7587         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7588          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7589       codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7590 
7591     if (ompt_enabled.ompt_callback_sync_region_wait) {
7592       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7593           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7594           codeptr);
7595     }
7596     if (ompt_enabled.ompt_callback_sync_region) {
7597       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7598           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7599           codeptr);
7600     }
7601 #endif
7602     if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_end, NULL, task_data, 0, ds_tid,
          ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7605     }
7606   }
7607 #endif
7608 
7609   KMP_MB(); /* Flush all pending memory write invalidates.  */
7610   KMP_ASSERT(this_thr->th.th_team == team);
7611 }
7612 
7613 /* ------------------------------------------------------------------------ */
7614 
7615 #ifdef USE_LOAD_BALANCE
7616 
// Return the number of worker threads actively spinning in the hot team if we
// are at the outermost level of parallelism.  Otherwise, return 0.
7619 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7620   int i;
7621   int retval;
7622   kmp_team_t *hot_team;
7623 
7624   if (root->r.r_active) {
7625     return 0;
7626   }
7627   hot_team = root->r.r_hot_team;
7628   if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7629     return hot_team->t.t_nproc - 1; // Don't count master thread
7630   }
7631 
7632   // Skip the master thread - it is accounted for elsewhere.
7633   retval = 0;
7634   for (i = 1; i < hot_team->t.t_nproc; i++) {
7635     if (hot_team->t.t_threads[i]->th.th_active) {
7636       retval++;
7637     }
7638   }
7639   return retval;
7640 }
7641 
7642 // Perform an automatic adjustment to the number of
7643 // threads used by the next parallel region.
7644 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7645   int retval;
7646   int pool_active;
7647   int hot_team_active;
7648   int team_curr_active;
7649   int system_active;
7650 
7651   KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7652                 set_nproc));
7653   KMP_DEBUG_ASSERT(root);
7654   KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7655                        ->th.th_current_task->td_icvs.dynamic == TRUE);
7656   KMP_DEBUG_ASSERT(set_nproc > 1);
7657 
7658   if (set_nproc == 1) {
7659     KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7660     return 1;
7661   }
7662 
7663   // Threads that are active in the thread pool, active in the hot team for this
7664   // particular root (if we are at the outer par level), and the currently
7665   // executing thread (to become the master) are available to add to the new
7666   // team, but are currently contributing to the system load, and must be
7667   // accounted for.
7668   pool_active = __kmp_thread_pool_active_nth;
7669   hot_team_active = __kmp_active_hot_team_nproc(root);
7670   team_curr_active = pool_active + hot_team_active + 1;
7671 
7672   // Check the system load.
7673   system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7674   KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7675                 "hot team active = %d\n",
7676                 system_active, pool_active, hot_team_active));
7677 
7678   if (system_active < 0) {
7679     // There was an error reading the necessary info from /proc, so use the
7680     // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7681     // = dynamic_thread_limit, we shouldn't wind up getting back here.
7682     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7683     KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7684 
7685     // Make this call behave like the thread limit algorithm.
7686     retval = __kmp_avail_proc - __kmp_nth +
7687              (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7688     if (retval > set_nproc) {
7689       retval = set_nproc;
7690     }
7691     if (retval < KMP_MIN_NTH) {
7692       retval = KMP_MIN_NTH;
7693     }
7694 
7695     KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7696                   retval));
7697     return retval;
7698   }
7699 
  // There is a slight delay in the load balance algorithm in detecting new
  // running procs. The real system load at this instant should be at least as
  // large as the number of active OpenMP threads available to add to the team.
7703   if (system_active < team_curr_active) {
7704     system_active = team_curr_active;
7705   }
7706   retval = __kmp_avail_proc - system_active + team_curr_active;
7707   if (retval > set_nproc) {
7708     retval = set_nproc;
7709   }
7710   if (retval < KMP_MIN_NTH) {
7711     retval = KMP_MIN_NTH;
7712   }
7713 
7714   KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7715   return retval;
7716 } // __kmp_load_balance_nproc()
7717 
7718 #endif /* USE_LOAD_BALANCE */
7719 
7720 /* ------------------------------------------------------------------------ */
7721 
7722 /* NOTE: this is called with the __kmp_init_lock held */
7723 void __kmp_cleanup(void) {
7724   int f;
7725 
7726   KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7727 
7728   if (TCR_4(__kmp_init_parallel)) {
7729 #if KMP_HANDLE_SIGNALS
7730     __kmp_remove_signals();
7731 #endif
7732     TCW_4(__kmp_init_parallel, FALSE);
7733   }
7734 
7735   if (TCR_4(__kmp_init_middle)) {
7736 #if KMP_AFFINITY_SUPPORTED
7737     __kmp_affinity_uninitialize();
7738 #endif /* KMP_AFFINITY_SUPPORTED */
7739     __kmp_cleanup_hierarchy();
7740     TCW_4(__kmp_init_middle, FALSE);
7741   }
7742 
7743   KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7744 
7745   if (__kmp_init_serial) {
7746     __kmp_runtime_destroy();
7747     __kmp_init_serial = FALSE;
7748   }
7749 
7750   __kmp_cleanup_threadprivate_caches();
7751 
7752   for (f = 0; f < __kmp_threads_capacity; f++) {
7753     if (__kmp_root[f] != NULL) {
7754       __kmp_free(__kmp_root[f]);
7755       __kmp_root[f] = NULL;
7756     }
7757   }
7758   __kmp_free(__kmp_threads);
  // __kmp_threads and __kmp_root were allocated at once, as a single block, so
  // there is no need to free __kmp_root separately.
7761   __kmp_threads = NULL;
7762   __kmp_root = NULL;
7763   __kmp_threads_capacity = 0;
7764 
7765 #if KMP_USE_DYNAMIC_LOCK
7766   __kmp_cleanup_indirect_user_locks();
7767 #else
7768   __kmp_cleanup_user_locks();
7769 #endif
7770 
7771 #if KMP_AFFINITY_SUPPORTED
7772   KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7773   __kmp_cpuinfo_file = NULL;
7774 #endif /* KMP_AFFINITY_SUPPORTED */
7775 
7776 #if KMP_USE_ADAPTIVE_LOCKS
7777 #if KMP_DEBUG_ADAPTIVE_LOCKS
7778   __kmp_print_speculative_stats();
7779 #endif
7780 #endif
7781   KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7782   __kmp_nested_nth.nth = NULL;
7783   __kmp_nested_nth.size = 0;
7784   __kmp_nested_nth.used = 0;
7785   KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7786   __kmp_nested_proc_bind.bind_types = NULL;
7787   __kmp_nested_proc_bind.size = 0;
7788   __kmp_nested_proc_bind.used = 0;
7789   if (__kmp_affinity_format) {
7790     KMP_INTERNAL_FREE(__kmp_affinity_format);
7791     __kmp_affinity_format = NULL;
7792   }
7793 
7794   __kmp_i18n_catclose();
7795 
7796 #if KMP_USE_HIER_SCHED
7797   __kmp_hier_scheds.deallocate();
7798 #endif
7799 
7800 #if KMP_STATS_ENABLED
7801   __kmp_stats_fini();
7802 #endif
7803 
7804   KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7805 }
7806 
7807 /* ------------------------------------------------------------------------ */
7808 
7809 int __kmp_ignore_mppbeg(void) {
7810   char *env;
7811 
7812   if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7813     if (__kmp_str_match_false(env))
7814       return FALSE;
7815   }
  // By default __kmpc_begin() is a no-op.
7817   return TRUE;
7818 }
7819 
7820 int __kmp_ignore_mppend(void) {
7821   char *env;
7822 
7823   if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7824     if (__kmp_str_match_false(env))
7825       return FALSE;
7826   }
  // By default __kmpc_end() is a no-op.
7828   return TRUE;
7829 }
7830 
7831 void __kmp_internal_begin(void) {
7832   int gtid;
7833   kmp_root_t *root;
7834 
7835   /* this is a very important step as it will register new sibling threads
7836      and assign these new uber threads a new gtid */
7837   gtid = __kmp_entry_gtid();
7838   root = __kmp_threads[gtid]->th.th_root;
7839   KMP_ASSERT(KMP_UBER_GTID(gtid));
7840 
7841   if (root->r.r_begin)
7842     return;
7843   __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7844   if (root->r.r_begin) {
7845     __kmp_release_lock(&root->r.r_begin_lock, gtid);
7846     return;
7847   }
7848 
7849   root->r.r_begin = TRUE;
7850 
7851   __kmp_release_lock(&root->r.r_begin_lock, gtid);
7852 }
7853 
7854 /* ------------------------------------------------------------------------ */
7855 
7856 void __kmp_user_set_library(enum library_type arg) {
7857   int gtid;
7858   kmp_root_t *root;
7859   kmp_info_t *thread;
7860 
7861   /* first, make sure we are initialized so we can get our gtid */
7862 
7863   gtid = __kmp_entry_gtid();
7864   thread = __kmp_threads[gtid];
7865 
7866   root = thread->th.th_root;
7867 
7868   KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7869                 library_serial));
7870   if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7871                                   thread */
7872     KMP_WARNING(SetLibraryIncorrectCall);
7873     return;
7874   }
7875 
7876   switch (arg) {
7877   case library_serial:
7878     thread->th.th_set_nproc = 0;
7879     set__nproc(thread, 1);
7880     break;
7881   case library_turnaround:
7882     thread->th.th_set_nproc = 0;
7883     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7884                                            : __kmp_dflt_team_nth_ub);
7885     break;
7886   case library_throughput:
7887     thread->th.th_set_nproc = 0;
7888     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7889                                            : __kmp_dflt_team_nth_ub);
7890     break;
7891   default:
7892     KMP_FATAL(UnknownLibraryType, arg);
7893   }
7894 
7895   __kmp_aux_set_library(arg);
7896 }
7897 
7898 void __kmp_aux_set_stacksize(size_t arg) {
7899   if (!__kmp_init_serial)
7900     __kmp_serial_initialize();
7901 
7902 #if KMP_OS_DARWIN
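  // On Darwin, round the requested size up to a 4 KiB (0x1000) boundary.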
7903   if (arg & (0x1000 - 1)) {
7904     arg &= ~(0x1000 - 1);
7905     if (arg + 0x1000) /* check for overflow if we round up */
7906       arg += 0x1000;
7907   }
7908 #endif
7909   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7910 
7911   /* only change the default stacksize before the first parallel region */
7912   if (!TCR_4(__kmp_init_parallel)) {
7913     size_t value = arg; /* argument is in bytes */
7914 
7915     if (value < __kmp_sys_min_stksize)
7916       value = __kmp_sys_min_stksize;
7917     else if (value > KMP_MAX_STKSIZE)
7918       value = KMP_MAX_STKSIZE;
7919 
7920     __kmp_stksize = value;
7921 
7922     __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
7923   }
7924 
7925   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7926 }
7927 
7928 /* set the behaviour of the runtime library */
7929 /* TODO this can cause some odd behaviour with sibling parallelism... */
7930 void __kmp_aux_set_library(enum library_type arg) {
7931   __kmp_library = arg;
7932 
7933   switch (__kmp_library) {
7934   case library_serial: {
7935     KMP_INFORM(LibraryIsSerial);
7936   } break;
7937   case library_turnaround:
7938     if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
7939       __kmp_use_yield = 2; // only yield when oversubscribed
7940     break;
7941   case library_throughput:
7942     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
7943       __kmp_dflt_blocktime = 200;
7944     break;
7945   default:
7946     KMP_FATAL(UnknownLibraryType, arg);
7947   }
7948 }
7949 
7950 /* Getting team information common for all team API */
7951 // Returns NULL if not in teams construct
7952 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
7953   kmp_info_t *thr = __kmp_entry_thread();
7954   teams_serialized = 0;
7955   if (thr->th.th_teams_microtask) {
7956     kmp_team_t *team = thr->th.th_team;
7957     int tlevel = thr->th.th_teams_level; // the level of the teams construct
7958     int ii = team->t.t_level;
7959     teams_serialized = team->t.t_serialized;
7960     int level = tlevel + 1;
7961     KMP_DEBUG_ASSERT(ii >= tlevel);
7962     while (ii > level) {
7963       for (teams_serialized = team->t.t_serialized;
7964            (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
7965       }
7966       if (team->t.t_serialized && (!teams_serialized)) {
7967         team = team->t.t_parent;
7968         continue;
7969       }
7970       if (ii > level) {
7971         team = team->t.t_parent;
7972         ii--;
7973       }
7974     }
7975     return team;
7976   }
7977   return NULL;
7978 }
7979 
7980 int __kmp_aux_get_team_num() {
7981   int serialized;
7982   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7983   if (team) {
7984     if (serialized > 1) {
7985       return 0; // teams region is serialized ( 1 team of 1 thread ).
7986     } else {
7987       return team->t.t_master_tid;
7988     }
7989   }
7990   return 0;
7991 }
7992 
7993 int __kmp_aux_get_num_teams() {
7994   int serialized;
7995   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7996   if (team) {
7997     if (serialized > 1) {
7998       return 1;
7999     } else {
8000       return team->t.t_parent->t.t_nproc;
8001     }
8002   }
8003   return 1;
8004 }
8005 
8006 /* ------------------------------------------------------------------------ */
8007 
8008 /*
8009  * Affinity Format Parser
8010  *
8011  * Field is in form of: %[[[0].]size]type
8012  * % and type are required (%% means print a literal '%')
8013  * type is either single char or long name surrounded by {},
8014  * e.g., N or {num_threads}
8015  * 0 => leading zeros
8016  * . => right justified when size is specified
8017  * by default output is left justified
8018  * size is the *minimum* field length
8019  * All other characters are printed as is
8020  *
 * Available field types (these match the table and switch below):
 * t {team_num}          - team number the thread belongs to
 * T {num_teams}         - number of teams
 * L {nesting_level}     - omp_get_level()
 * n {thread_num}        - omp_get_thread_num()
 * N {num_threads}       - omp_get_num_threads()
 * a {ancestor_tnum}     - omp_get_ancestor_thread_num(omp_get_level()-1)
 * H {host}              - name of host machine
 * P {process_id}        - process id (integer)
 * i {native_thread_id}  - native thread identifier (integer)
 * A {thread_affinity}   - comma separated list of integers or integer ranges
 *                         (values of affinity mask)
8031  *
8032  * Implementation-specific field types can be added
8033  * If a type is unknown, print "undefined"
8034 */
8035 
// Structure holding the short name, long name, and corresponding data type
// for snprintf. A table of these represents the entire set of valid keyword
// field types.
8039 typedef struct kmp_affinity_format_field_t {
8040   char short_name; // from spec e.g., L -> thread level
8041   const char *long_name; // from spec thread_level -> thread level
8042   char field_format; // data type for snprintf (typically 'd' or 's'
8043   // for integer or string)
8044 } kmp_affinity_format_field_t;
8045 
8046 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
8047 #if KMP_AFFINITY_SUPPORTED
8048     {'A', "thread_affinity", 's'},
8049 #endif
8050     {'t', "team_num", 'd'},
8051     {'T', "num_teams", 'd'},
8052     {'L', "nesting_level", 'd'},
8053     {'n', "thread_num", 'd'},
8054     {'N', "num_threads", 'd'},
8055     {'a', "ancestor_tnum", 'd'},
8056     {'H', "host", 's'},
8057     {'P', "process_id", 'd'},
8058     {'i', "native_thread_id", 'd'}};
8059 
// Return the number of characters it takes to hold the field
8061 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8062                                             const char **ptr,
8063                                             kmp_str_buf_t *field_buffer) {
8064   int rc, format_index, field_value;
8065   const char *width_left, *width_right;
8066   bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8067   static const int FORMAT_SIZE = 20;
8068   char format[FORMAT_SIZE] = {0};
8069   char absolute_short_name = 0;
8070 
8071   KMP_DEBUG_ASSERT(gtid >= 0);
8072   KMP_DEBUG_ASSERT(th);
8073   KMP_DEBUG_ASSERT(**ptr == '%');
8074   KMP_DEBUG_ASSERT(field_buffer);
8075 
8076   __kmp_str_buf_clear(field_buffer);
8077 
8078   // Skip the initial %
8079   (*ptr)++;
8080 
8081   // Check for %% first
8082   if (**ptr == '%') {
8083     __kmp_str_buf_cat(field_buffer, "%", 1);
8084     (*ptr)++; // skip over the second %
8085     return 1;
8086   }
8087 
8088   // Parse field modifiers if they are present
8089   pad_zeros = false;
8090   if (**ptr == '0') {
8091     pad_zeros = true;
8092     (*ptr)++; // skip over 0
8093   }
8094   right_justify = false;
8095   if (**ptr == '.') {
8096     right_justify = true;
8097     (*ptr)++; // skip over .
8098   }
8099   // Parse width of field: [width_left, width_right)
8100   width_left = width_right = NULL;
8101   if (**ptr >= '0' && **ptr <= '9') {
8102     width_left = *ptr;
8103     SKIP_DIGITS(*ptr);
8104     width_right = *ptr;
8105   }
8106 
8107   // Create the format for KMP_SNPRINTF based on flags parsed above
8108   format_index = 0;
8109   format[format_index++] = '%';
8110   if (!right_justify)
8111     format[format_index++] = '-';
8112   if (pad_zeros)
8113     format[format_index++] = '0';
8114   if (width_left && width_right) {
8115     int i = 0;
    // Only allow 8-digit number widths.
    // This also prevents overflowing the format variable.
8118     while (i < 8 && width_left < width_right) {
8119       format[format_index++] = *width_left;
8120       width_left++;
8121       i++;
8122     }
8123   }
8124 
8125   // Parse a name (long or short)
8126   // Canonicalize the name into absolute_short_name
8127   found_valid_name = false;
8128   parse_long_name = (**ptr == '{');
8129   if (parse_long_name)
8130     (*ptr)++; // skip initial left brace
8131   for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8132                              sizeof(__kmp_affinity_format_table[0]);
8133        ++i) {
8134     char short_name = __kmp_affinity_format_table[i].short_name;
8135     const char *long_name = __kmp_affinity_format_table[i].long_name;
8136     char field_format = __kmp_affinity_format_table[i].field_format;
8137     if (parse_long_name) {
8138       size_t length = KMP_STRLEN(long_name);
8139       if (strncmp(*ptr, long_name, length) == 0) {
8140         found_valid_name = true;
8141         (*ptr) += length; // skip the long name
8142       }
8143     } else if (**ptr == short_name) {
8144       found_valid_name = true;
8145       (*ptr)++; // skip the short name
8146     }
8147     if (found_valid_name) {
8148       format[format_index++] = field_format;
8149       format[format_index++] = '\0';
8150       absolute_short_name = short_name;
8151       break;
8152     }
8153   }
8154   if (parse_long_name) {
8155     if (**ptr != '}') {
8156       absolute_short_name = 0;
8157     } else {
8158       (*ptr)++; // skip over the right brace
8159     }
8160   }
8161 
8162   // Attempt to fill the buffer with the requested
8163   // value using snprintf within __kmp_str_buf_print()
8164   switch (absolute_short_name) {
8165   case 't':
8166     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8167     break;
8168   case 'T':
8169     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8170     break;
8171   case 'L':
8172     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8173     break;
8174   case 'n':
8175     rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8176     break;
8177   case 'H': {
8178     static const int BUFFER_SIZE = 256;
8179     char buf[BUFFER_SIZE];
8180     __kmp_expand_host_name(buf, BUFFER_SIZE);
8181     rc = __kmp_str_buf_print(field_buffer, format, buf);
8182   } break;
8183   case 'P':
8184     rc = __kmp_str_buf_print(field_buffer, format, getpid());
8185     break;
8186   case 'i':
8187     rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8188     break;
8189   case 'N':
8190     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8191     break;
8192   case 'a':
8193     field_value =
8194         __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8195     rc = __kmp_str_buf_print(field_buffer, format, field_value);
8196     break;
8197 #if KMP_AFFINITY_SUPPORTED
8198   case 'A': {
8199     kmp_str_buf_t buf;
8200     __kmp_str_buf_init(&buf);
8201     __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8202     rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8203     __kmp_str_buf_free(&buf);
8204   } break;
8205 #endif
8206   default:
    // According to the spec, if an implementation does not have info for the
    // field type, then "undefined" is printed
8209     rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8210     // Skip the field
8211     if (parse_long_name) {
8212       SKIP_TOKEN(*ptr);
8213       if (**ptr == '}')
8214         (*ptr)++;
8215     } else {
8216       (*ptr)++;
8217     }
8218   }
8219 
8220   KMP_ASSERT(format_index <= FORMAT_SIZE);
8221   return rc;
8222 }
8223 
8224 /*
8225  * Return number of characters needed to hold the affinity string
8226  * (not including null byte character)
8227  * The resultant string is printed to buffer, which the caller can then
8228  * handle afterwards
8229 */
8230 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8231                                   kmp_str_buf_t *buffer) {
8232   const char *parse_ptr;
8233   size_t retval;
8234   const kmp_info_t *th;
8235   kmp_str_buf_t field;
8236 
8237   KMP_DEBUG_ASSERT(buffer);
8238   KMP_DEBUG_ASSERT(gtid >= 0);
8239 
8240   __kmp_str_buf_init(&field);
8241   __kmp_str_buf_clear(buffer);
8242 
8243   th = __kmp_threads[gtid];
8244   retval = 0;
8245 
8246   // If format is NULL or zero-length string, then we use
8247   // affinity-format-var ICV
8248   parse_ptr = format;
8249   if (parse_ptr == NULL || *parse_ptr == '\0') {
8250     parse_ptr = __kmp_affinity_format;
8251   }
8252   KMP_DEBUG_ASSERT(parse_ptr);
8253 
8254   while (*parse_ptr != '\0') {
8255     // Parse a field
8256     if (*parse_ptr == '%') {
8257       // Put field in the buffer
8258       int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8259       __kmp_str_buf_catbuf(buffer, &field);
8260       retval += rc;
8261     } else {
8262       // Put literal character in buffer
8263       __kmp_str_buf_cat(buffer, parse_ptr, 1);
8264       retval++;
8265       parse_ptr++;
8266     }
8267   }
8268   __kmp_str_buf_free(&field);
8269   return retval;
8270 }
8271 
8272 // Displays the affinity string to stdout
8273 void __kmp_aux_display_affinity(int gtid, const char *format) {
8274   kmp_str_buf_t buf;
8275   __kmp_str_buf_init(&buf);
8276   __kmp_aux_capture_affinity(gtid, format, &buf);
8277   __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8278   __kmp_str_buf_free(&buf);
8279 }
8280 
8281 /* ------------------------------------------------------------------------ */
8282 
8283 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8284   int blocktime = arg; /* argument is in milliseconds */
8285 #if KMP_USE_MONITOR
8286   int bt_intervals;
8287 #endif
8288   kmp_int8 bt_set;
8289 
8290   __kmp_save_internal_controls(thread);
8291 
8292   /* Normalize and set blocktime for the teams */
8293   if (blocktime < KMP_MIN_BLOCKTIME)
8294     blocktime = KMP_MIN_BLOCKTIME;
8295   else if (blocktime > KMP_MAX_BLOCKTIME)
8296     blocktime = KMP_MAX_BLOCKTIME;
8297 
8298   set__blocktime_team(thread->th.th_team, tid, blocktime);
8299   set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8300 
8301 #if KMP_USE_MONITOR
8302   /* Calculate and set blocktime intervals for the teams */
8303   bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8304 
8305   set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8306   set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8307 #endif
8308 
  /* Record that blocktime has been explicitly set (the bt_set flag) */
8310   bt_set = TRUE;
8311 
8312   set__bt_set_team(thread->th.th_team, tid, bt_set);
8313   set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8314 #if KMP_USE_MONITOR
8315   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8316                 "bt_intervals=%d, monitor_updates=%d\n",
8317                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8318                 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8319                 __kmp_monitor_wakeups));
8320 #else
8321   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8322                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8323                 thread->th.th_team->t.t_id, tid, blocktime));
8324 #endif
8325 }
8326 
8327 void __kmp_aux_set_defaults(char const *str, size_t len) {
8328   if (!__kmp_init_serial) {
8329     __kmp_serial_initialize();
8330   }
8331   __kmp_env_initialize(str);
8332 
8333   if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8334     __kmp_env_print();
8335   }
8336 } // __kmp_aux_set_defaults

/* ------------------------------------------------------------------------ */
/* internal fast reduction routines */

PACKED_REDUCTION_METHOD_T
__kmp_determine_reduction_method(
    ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
    void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
    kmp_critical_name *lck) {

  // Default reduction method: critical construct (lck != NULL, as in the
  // current PAROPT).
  // If (reduce_data != NULL && reduce_func != NULL): the tree-reduction method
  // can be selected by the RTL.
  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
  // can be selected by the RTL.
  // Ultimately it is up to the OpenMP RTL to decide which of the methods
  // generated by PAROPT to use.

  PACKED_REDUCTION_METHOD_T retval;

  int team_size;

  KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
  KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )

#define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
  ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
#define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))

  retval = critical_reduce_block;

  // An alternative way of getting the team size (with one extra dynamic
  // dereference) is slower.
  team_size = __kmp_get_team_num_threads(global_tid);
  if (team_size == 1) {

    retval = empty_reduce_block;

  } else {

    int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;

#if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 ||                   \
    KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64

#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
    KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD

    int teamsize_cutoff = 4;

#if KMP_MIC_SUPPORTED
    if (__kmp_mic_type != non_mic) {
      teamsize_cutoff = 8;
    }
#endif
    int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
    if (tree_available) {
      if (team_size <= teamsize_cutoff) {
        if (atomic_available) {
          retval = atomic_reduce_block;
        }
      } else {
        retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
      }
    } else if (atomic_available) {
      retval = atomic_reduce_block;
    }
#else
#error "Unknown or unsupported OS"
#endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
       // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD

#elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS

#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD

    // basic tuning

    if (atomic_available) {
      if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
        retval = atomic_reduce_block;
      }
    } // otherwise: use critical section

#elif KMP_OS_DARWIN

    int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
    if (atomic_available && (num_vars <= 3)) {
      retval = atomic_reduce_block;
    } else if (tree_available) {
      if ((reduce_size > (9 * sizeof(kmp_real64))) &&
          (reduce_size < (2000 * sizeof(kmp_real64)))) {
        retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
      }
    } // otherwise: use critical section

#else
#error "Unknown or unsupported OS"
#endif

#else
#error "Unknown or unsupported architecture"
#endif
  }

  // KMP_FORCE_REDUCTION

  // If the team is serialized (team_size == 1), ignore the forced reduction
  // method and stay with the unsynchronized method (empty_reduce_block)
  if (__kmp_force_reduction_method != reduction_method_not_defined &&
      team_size != 1) {

    PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;

    int atomic_available, tree_available;

    switch ((forced_retval = __kmp_force_reduction_method)) {
    case critical_reduce_block:
      KMP_ASSERT(lck); // lck should be != 0
      break;

    case atomic_reduce_block:
      atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
      if (!atomic_available) {
        KMP_WARNING(RedMethodNotSupported, "atomic");
        forced_retval = critical_reduce_block;
      }
      break;

    case tree_reduce_block:
      tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
      if (!tree_available) {
        KMP_WARNING(RedMethodNotSupported, "tree");
        forced_retval = critical_reduce_block;
      } else {
#if KMP_FAST_REDUCTION_BARRIER
        forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
#endif
      }
      break;

    default:
      KMP_ASSERT(0); // "unsupported method specified"
    }

    retval = forced_retval;
  }

  KA_TRACE(10, ("reduction method selected=%08x\n", retval));

#undef FAST_REDUCTION_TREE_METHOD_GENERATED
#undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED

  return (retval);
}
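// Distilled view (illustrative only, kept out of the build) of the selection
// performed above on the 64-bit architecture branch; the real routine also
// folds in the OS/architecture guards and the KMP_FORCE_REDUCTION override
// near the end.
#if 0
static PACKED_REDUCTION_METHOD_T example_pick_method(int team_size,
                                                     int atomic_available,
                                                     int tree_available,
                                                     int teamsize_cutoff) {
  if (team_size == 1)
    return empty_reduce_block; // serialized team needs no synchronization
  if (tree_available) {
    if (team_size <= teamsize_cutoff)
      return atomic_available ? atomic_reduce_block : critical_reduce_block;
    return TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
  }
  return atomic_available ? atomic_reduce_block : critical_reduce_block;
}
#endif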
// This function exists for testing the set/get/determine reduce-method path.
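// Note (assumption based on the packed encoding in kmp.h): the reduction
// method is believed to occupy the upper bits of the packed value, with the
// barrier type in the low byte when KMP_FAST_REDUCTION_BARRIER is enabled,
// which is why the shift by 8 below yields just the method id.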
kmp_int32 __kmp_get_reduce_method(void) {
  return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
}

// Soft pause sets up threads to ignore blocktime and just go to sleep.
// Spin-wait code checks __kmp_pause_status and reacts accordingly.
void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }

// Hard pause shuts down the runtime completely.  Resume happens naturally when
// OpenMP is used subsequently.
void __kmp_hard_pause() {
  __kmp_pause_status = kmp_hard_paused;
  __kmp_internal_end_thread(-1);
}

// Soft resume sets __kmp_pause_status, and wakes up all threads.
void __kmp_resume_if_soft_paused() {
  if (__kmp_pause_status == kmp_soft_paused) {
    __kmp_pause_status = kmp_not_paused;

    for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
      kmp_info_t *thread = __kmp_threads[gtid];
      if (thread) { // Wake it if sleeping
        kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
                         thread);
        if (fl.is_sleeping())
          fl.resume(gtid);
        else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
          __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
        } else { // thread holds the lock and may sleep soon
          do { // until either the thread sleeps, or we can get the lock
            if (fl.is_sleeping()) {
              fl.resume(gtid);
              break;
            } else if (__kmp_try_suspend_mx(thread)) {
              __kmp_unlock_suspend_mx(thread);
              break;
            }
          } while (1);
        }
      }
    }
  }
}

// This function is called via __kmpc_pause_resource. Returns 0 if successful.
// TODO: add warning messages
int __kmp_pause_resource(kmp_pause_status_t level) {
  if (level == kmp_not_paused) { // requesting resume
    if (__kmp_pause_status == kmp_not_paused) {
      // error message about runtime not being paused, so can't resume
      return 1;
    } else {
      KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
                       __kmp_pause_status == kmp_hard_paused);
      __kmp_pause_status = kmp_not_paused;
      return 0;
    }
  } else if (level == kmp_soft_paused) { // requesting soft pause
    if (__kmp_pause_status != kmp_not_paused) {
      // error message about already being paused
      return 1;
    } else {
      __kmp_soft_pause();
      return 0;
    }
  } else if (level == kmp_hard_paused) { // requesting hard pause
    if (__kmp_pause_status != kmp_not_paused) {
      // error message about already being paused
      return 1;
    } else {
      __kmp_hard_pause();
      return 0;
    }
  } else {
    // error message about invalid level
    return 1;
  }
}
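// User-level sketch (illustrative only, kept out of the build): this routine
// sits behind the OpenMP 5.0 omp_pause_resource()/omp_pause_resource_all()
// API. A soft pause puts the worker threads to sleep; a hard pause also tears
// the runtime down until the next OpenMP construct re-initializes it.
#if 0
#include <omp.h>

int main() {
#pragma omp parallel
  { /* work */ }
  // Release host resources; a zero return value indicates success.
  omp_pause_resource(omp_pause_soft, omp_get_initial_device());
#pragma omp parallel
  { /* the runtime resumes transparently on next use */ }
  return 0;
}
#endif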

void __kmp_omp_display_env(int verbose) {
  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
  if (__kmp_init_serial == 0)
    __kmp_do_serial_initialize();
  __kmp_display_env_impl(!verbose, verbose);
  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
}

// Globals and functions for hidden helper tasks
kmp_info_t **__kmp_hidden_helper_threads;
kmp_info_t *__kmp_hidden_helper_main_thread;
kmp_int32 __kmp_hidden_helper_threads_num = 8;
std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
#if KMP_OS_LINUX
kmp_int32 __kmp_enable_hidden_helper = TRUE;
#else
kmp_int32 __kmp_enable_hidden_helper = FALSE;
#endif

namespace {
std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;

void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
  // This is an explicit synchronization of all hidden helper threads, needed
  // because a regular thread may push a hidden helper task to a hidden helper
  // thread that has not been awakened even once since it was released by the
  // main thread after creating the team.
  KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
  while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
         __kmp_hidden_helper_threads_num)
    ;

  // Only the master thread performs the release, wait, and wake-up sequence
  if (__kmpc_master(nullptr, *gtid)) {
    // First, unset the initial state and release the initial thread
    TCW_4(__kmp_init_hidden_helper_threads, FALSE);
    __kmp_hidden_helper_initz_release();
    __kmp_hidden_helper_main_thread_wait();
    // Now wake up all worker threads
    for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
      __kmp_hidden_helper_worker_thread_signal();
    }
  }
}
} // namespace

void __kmp_hidden_helper_threads_initz_routine() {
  // Create a new root for hidden helper team/threads
  const int gtid = __kmp_register_root(TRUE);
  __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
  __kmp_hidden_helper_threads = &__kmp_threads[gtid];
  __kmp_hidden_helper_main_thread->th.th_set_nproc =
      __kmp_hidden_helper_threads_num;

  KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);

  __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);

  // Set the initialization flag to FALSE
  TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);

  __kmp_hidden_helper_threads_deinitz_release();
}

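// Context sketch (illustrative only, kept out of the build): hidden helper
// threads exist to execute deferred tasks such as those generated for "target
// nowait" regions; that mapping is stated here as an assumption about the
// wider runtime rather than something visible in this file.
#if 0
void example(int *a, int n) {
#pragma omp target nowait map(tofrom : a[0:n])
  for (int i = 0; i < n; ++i)
    a[i] += 1;
#pragma omp taskwait // wait for the deferred target task to complete
}
#endif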