1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #if KMP_USE_HIER_SCHED
28 #include "kmp_dispatch_hier.h"
29 #endif
30 
31 #if OMPT_SUPPORT
32 #include "ompt-specific.h"
33 #endif
34 
35 /* these are temporary issues to be dealt with */
36 #define KMP_USE_PRCTL 0
37 
38 #if KMP_OS_WINDOWS
39 #include <process.h>
40 #endif
41 
42 #include "tsan_annotations.h"
43 
44 #if KMP_OS_WINDOWS
45 // Windows does not need these include files; it does not use shared memory.
46 #else
47 #include <sys/mman.h>
48 #include <sys/stat.h>
49 #include <fcntl.h>
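// SHM_SIZE presumably sizes the small shared-memory segment used elsewhere in
// the runtime for library registration on Unix-like systems.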
50 #define SHM_SIZE 1024
51 #endif
52 
53 #if defined(KMP_GOMP_COMPAT)
54 char const __kmp_version_alt_comp[] =
55     KMP_VERSION_PREFIX "alternative compiler support: yes";
56 #endif /* defined(KMP_GOMP_COMPAT) */
57 
58 char const __kmp_version_omp_api[] =
59     KMP_VERSION_PREFIX "API version: 5.0 (201611)";
60 
61 #ifdef KMP_DEBUG
62 char const __kmp_version_lock[] =
63     KMP_VERSION_PREFIX "lock type: run time selectable";
64 #endif /* KMP_DEBUG */
65 
66 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
67 
68 /* ------------------------------------------------------------------------ */
69 
70 #if KMP_USE_MONITOR
71 kmp_info_t __kmp_monitor;
72 #endif
73 
74 /* Forward declarations */
75 
76 void __kmp_cleanup(void);
77 
78 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
79                                   int gtid);
80 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
81                                   kmp_internal_control_t *new_icvs,
82                                   ident_t *loc);
83 #if KMP_AFFINITY_SUPPORTED
84 static void __kmp_partition_places(kmp_team_t *team,
85                                    int update_master_only = 0);
86 #endif
87 static void __kmp_do_serial_initialize(void);
88 void __kmp_fork_barrier(int gtid, int tid);
89 void __kmp_join_barrier(int gtid);
90 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
91                           kmp_internal_control_t *new_icvs, ident_t *loc);
92 
93 #ifdef USE_LOAD_BALANCE
94 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
95 #endif
96 
97 static int __kmp_expand_threads(int nNeed);
98 #if KMP_OS_WINDOWS
99 static int __kmp_unregister_root_other_thread(int gtid);
100 #endif
101 static void __kmp_unregister_library(void); // called by __kmp_internal_end()
102 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
103 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
104 
105 /* Calculate the identifier of the current thread */
106 /* fast (and somewhat portable) way to get the unique identifier of the
107    executing thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
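/* Lookup strategy, in order of preference:
   1. __kmp_gtid_mode >= 3: read the gtid from native thread-local storage
      (KMP_TDATA_GTID).
   2. __kmp_gtid_mode >= 2: read the gtid from keyed TLS
      (__kmp_gtid_get_specific()).
   3. Otherwise: scan the registered threads' recorded stack extents for the
      one containing the address of a local variable, falling back to keyed
      TLS if the scan fails. */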
108 int __kmp_get_global_thread_id() {
109   int i;
110   kmp_info_t **other_threads;
111   size_t stack_data;
112   char *stack_addr;
113   size_t stack_size;
114   char *stack_base;
115 
116   KA_TRACE(
117       1000,
118       ("*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
119        __kmp_nth, __kmp_all_nth));
120 
121   /* JPH - To handle the case where __kmpc_end(0) is called immediately prior
122      to a parallel region, this was made to return KMP_GTID_DNE to force
123      serial_initialize by the caller. KMP_GTID_DNE then had to be handled at
124      all call sites, or __kmp_init_gtid guaranteed, for this to work. */
125 
126   if (!TCR_4(__kmp_init_gtid))
127     return KMP_GTID_DNE;
128 
129 #ifdef KMP_TDATA_GTID
130   if (TCR_4(__kmp_gtid_mode) >= 3) {
131     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
132     return __kmp_gtid;
133   }
134 #endif
135   if (TCR_4(__kmp_gtid_mode) >= 2) {
136     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
137     return __kmp_gtid_get_specific();
138   }
139   KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
140 
141   stack_addr = (char *)&stack_data;
142   other_threads = __kmp_threads;
143 
144   /* ATT: The code below is a source of potential bugs due to unsynchronized
145      access to __kmp_threads array. For example:
146      1. Current thread loads other_threads[i] to thr and checks it, it is
147         non-NULL.
148      2. Current thread is suspended by OS.
149      3. Another thread unregisters and finishes (debug versions of free()
150         may fill memory with something like 0xEF).
151      4. Current thread is resumed.
152      5. Current thread reads junk from *thr.
153      TODO: Fix it.  --ln  */
154 
155   for (i = 0; i < __kmp_threads_capacity; i++) {
156 
157     kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
158     if (!thr)
159       continue;
160 
161     stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
162     stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
163 
164     /* stack grows down -- search through all of the active threads */
165 
166     if (stack_addr <= stack_base) {
167       size_t stack_diff = stack_base - stack_addr;
168 
169       if (stack_diff <= stack_size) {
170         /* The only way we can be closer than the allocated */
171         /* stack size is if we are running on this thread. */
172         KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
173         return i;
174       }
175     }
176   }
177 
178   /* fall back to __kmp_gtid_get_specific() to try to determine our gtid */
179   KA_TRACE(1000,
180            ("*** __kmp_get_global_thread_id: internal alg. failed to find "
181             "thread, using TLS\n"));
182   i = __kmp_gtid_get_specific();
183 
184   /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */
185 
186   /* if we haven't been assigned a gtid, return the (negative) code */
187   if (i < 0)
188     return i;
189 
190   /* dynamically updated stack window for uber threads to avoid get_specific
191      call */
192   if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
193     KMP_FATAL(StackOverflow, i);
194   }
195 
196   stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
197   if (stack_addr > stack_base) {
198     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
199     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
200             other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
201                 stack_base);
202   } else {
203     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
204             stack_base - stack_addr);
205   }
206 
207   /* Reprint stack bounds for ubermaster since they have been refined */
208   if (__kmp_storage_map) {
209     char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
210     char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
211     __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
212                                  other_threads[i]->th.th_info.ds.ds_stacksize,
213                                  "th_%d stack (refinement)", i);
214   }
215   return i;
216 }
217 
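/* Like __kmp_get_global_thread_id(), but if the calling thread has not been
   assigned a gtid yet (a new root / uber-master sibling), it performs serial
   initialization if needed and registers the thread as a new root, so it
   always returns a valid gtid. */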
218 int __kmp_get_global_thread_id_reg() {
219   int gtid;
220 
221   if (!__kmp_init_serial) {
222     gtid = KMP_GTID_DNE;
223   } else
224 #ifdef KMP_TDATA_GTID
225       if (TCR_4(__kmp_gtid_mode) >= 3) {
226     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
227     gtid = __kmp_gtid;
228   } else
229 #endif
230       if (TCR_4(__kmp_gtid_mode) >= 2) {
231     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
232     gtid = __kmp_gtid_get_specific();
233   } else {
234     KA_TRACE(1000,
235              ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
236     gtid = __kmp_get_global_thread_id();
237   }
238 
239   /* we must be a new uber master sibling thread */
240   if (gtid == KMP_GTID_DNE) {
241     KA_TRACE(10,
242              ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
243               "Registering a new gtid.\n"));
244     __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
245     if (!__kmp_init_serial) {
246       __kmp_do_serial_initialize();
247       gtid = __kmp_gtid_get_specific();
248     } else {
249       gtid = __kmp_register_root(FALSE);
250     }
251     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
252     /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
253   }
254 
255   KMP_DEBUG_ASSERT(gtid >= 0);
256 
257   return gtid;
258 }
259 
260 /* caller must hold forkjoin_lock */
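/* Check whether th's stack overlaps the stack of any other registered thread.
   The extensive check only runs when __kmp_env_checks is enabled and th is not
   an uber thread; on overlap the offending ranges are printed and the runtime
   aborts with a fatal StackOverlap error. */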
261 void __kmp_check_stack_overlap(kmp_info_t *th) {
262   int f;
263   char *stack_beg = NULL;
264   char *stack_end = NULL;
265   int gtid;
266 
267   KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
268   if (__kmp_storage_map) {
269     stack_end = (char *)th->th.th_info.ds.ds_stackbase;
270     stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
271 
272     gtid = __kmp_gtid_from_thread(th);
273 
274     if (gtid == KMP_GTID_MONITOR) {
275       __kmp_print_storage_map_gtid(
276           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
277           "th_%s stack (%s)", "mon",
278           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
279     } else {
280       __kmp_print_storage_map_gtid(
281           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
282           "th_%d stack (%s)", gtid,
283           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
284     }
285   }
286 
287   /* No point in checking ubermaster threads since they use refinement and
288    * cannot overlap */
289   gtid = __kmp_gtid_from_thread(th);
290   if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
291     KA_TRACE(10,
292              ("__kmp_check_stack_overlap: performing extensive checking\n"));
293     if (stack_beg == NULL) {
294       stack_end = (char *)th->th.th_info.ds.ds_stackbase;
295       stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
296     }
297 
298     for (f = 0; f < __kmp_threads_capacity; f++) {
299       kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
300 
301       if (f_th && f_th != th) {
302         char *other_stack_end =
303             (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
304         char *other_stack_beg =
305             other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
306         if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
307             (stack_end > other_stack_beg && stack_end < other_stack_end)) {
308 
309           /* Print the other stack values before the abort */
310           if (__kmp_storage_map)
311             __kmp_print_storage_map_gtid(
312                 -1, other_stack_beg, other_stack_end,
313                 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
314                 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
315 
316           __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
317                       __kmp_msg_null);
318         }
319       }
320     }
321   }
322   KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
323 }
324 
325 /* ------------------------------------------------------------------------ */
326 
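// Yield forever ('done' is never set); used on the abort paths below to park
// threads that must not proceed any further.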
327 void __kmp_infinite_loop(void) {
328   static int done = FALSE;
329 
330   while (!done) {
331     KMP_YIELD(TRUE);
332   }
333 }
334 
335 #define MAX_MESSAGE 512
336 
337 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
338                                   char const *format, ...) {
339   char buffer[MAX_MESSAGE];
340   va_list ap;
341 
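  // Embed the caller's format string into a standard "OMP storage map:" line;
  // the caller's varargs are then formatted by __kmp_vprintf, which writes to
  // kmp_err while the stdio bootstrap lock is held.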
342   va_start(ap, format);
343   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
344                p2, (unsigned long)size, format);
345   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
346   __kmp_vprintf(kmp_err, buffer, ap);
347 #if KMP_PRINT_DATA_PLACEMENT
348   int node;
349   if (gtid >= 0) {
350     if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
351       if (__kmp_storage_map_verbose) {
352         node = __kmp_get_host_node(p1);
353         if (node < 0) /* doesn't work, so don't try this next time */
354           __kmp_storage_map_verbose = FALSE;
355         else {
356           char *last;
357           int lastNode;
358           int localProc = __kmp_get_cpu_from_gtid(gtid);
359 
360           const int page_size = KMP_GET_PAGE_SIZE();
361 
362           p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
363           p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
364           if (localProc >= 0)
365             __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
366                                  localProc >> 1);
367           else
368             __kmp_printf_no_lock("  GTID %d\n", gtid);
369 #if KMP_USE_PRCTL
370           /* The more elaborate format is disabled for now because of the prctl
371            * hanging bug. */
372           do {
373             last = p1;
374             lastNode = node;
375             /* This loop collates adjacent pages with the same host node. */
376             do {
377               (char *)p1 += page_size;
378             } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
379             __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
380                                  lastNode);
381           } while (p1 <= p2);
382 #else
383           __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
384                                (char *)p1 + (page_size - 1),
385                                __kmp_get_host_node(p1));
386           if (p1 < p2) {
387             __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
388                                  (char *)p2 + (page_size - 1),
389                                  __kmp_get_host_node(p2));
390           }
391 #endif
392         }
393       }
394     } else
395       __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
396   }
397 #endif /* KMP_PRINT_DATA_PLACEMENT */
398   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
399 }
400 
401 void __kmp_warn(char const *format, ...) {
402   char buffer[MAX_MESSAGE];
403   va_list ap;
404 
405   if (__kmp_generate_warnings == kmp_warnings_off) {
406     return;
407   }
408 
409   va_start(ap, format);
410 
411   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
412   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
413   __kmp_vprintf(kmp_err, buffer, ap);
414   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
415 
416   va_end(ap);
417 }
418 
419 void __kmp_abort_process() {
420   // Later threads may stall here, but that's ok because abort() will kill them.
421   __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
422 
423   if (__kmp_debug_buf) {
424     __kmp_dump_debug_buffer();
425   }
426 
427   if (KMP_OS_WINDOWS) {
428     // Let other threads know of abnormal termination and prevent deadlock
429     // if abort happened during library initialization or shutdown
430     __kmp_global.g.g_abort = SIGABRT;
431 
432     /* On Windows* OS by default abort() causes pop-up error box, which stalls
433        nightly testing. Unfortunately, we cannot reliably suppress pop-up error
434        boxes. _set_abort_behavior() works well, but this function is not
435        available in VS7 (this is not a problem for the DLL, but it is a problem for
436        static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not
437        help, at least in some versions of MS C RTL.
438 
439        It seems the following sequence is the only way to simulate abort() and
440        avoid pop-up error box. */
441     raise(SIGABRT);
442     _exit(3); // Just in case, if signal ignored, exit anyway.
443   } else {
444     abort();
445   }
446 
447   __kmp_infinite_loop();
448   __kmp_release_bootstrap_lock(&__kmp_exit_lock);
449 
450 } // __kmp_abort_process
451 
452 void __kmp_abort_thread(void) {
453   // TODO: Eliminate g_abort global variable and this function.
454   // In case of abort, just call abort(); it will kill all the threads.
455   __kmp_infinite_loop();
456 } // __kmp_abort_thread
457 
458 /* Print out the storage map for the major kmp_info_t thread data structures
459    that are allocated together. */
460 
461 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
462   __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
463                                gtid);
464 
465   __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
466                                sizeof(kmp_desc_t), "th_%d.th_info", gtid);
467 
468   __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
469                                sizeof(kmp_local_t), "th_%d.th_local", gtid);
470 
471   __kmp_print_storage_map_gtid(
472       gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
473       sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
474 
475   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
476                                &thr->th.th_bar[bs_plain_barrier + 1],
477                                sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
478                                gtid);
479 
480   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
481                                &thr->th.th_bar[bs_forkjoin_barrier + 1],
482                                sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
483                                gtid);
484 
485 #if KMP_FAST_REDUCTION_BARRIER
486   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
487                                &thr->th.th_bar[bs_reduction_barrier + 1],
488                                sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
489                                gtid);
490 #endif // KMP_FAST_REDUCTION_BARRIER
491 }
492 
493 /* Print out the storage map for the major kmp_team_t team data structures
494    that are allocated together. */
495 
496 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
497                                          int team_id, int num_thr) {
498   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
499   __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
500                                header, team_id);
501 
502   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
503                                &team->t.t_bar[bs_last_barrier],
504                                sizeof(kmp_balign_team_t) * bs_last_barrier,
505                                "%s_%d.t_bar", header, team_id);
506 
507   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
508                                &team->t.t_bar[bs_plain_barrier + 1],
509                                sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
510                                header, team_id);
511 
512   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
513                                &team->t.t_bar[bs_forkjoin_barrier + 1],
514                                sizeof(kmp_balign_team_t),
515                                "%s_%d.t_bar[forkjoin]", header, team_id);
516 
517 #if KMP_FAST_REDUCTION_BARRIER
518   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
519                                &team->t.t_bar[bs_reduction_barrier + 1],
520                                sizeof(kmp_balign_team_t),
521                                "%s_%d.t_bar[reduction]", header, team_id);
522 #endif // KMP_FAST_REDUCTION_BARRIER
523 
524   __kmp_print_storage_map_gtid(
525       -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
526       sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
527 
528   __kmp_print_storage_map_gtid(
529       -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
530       sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
531 
532   __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
533                                &team->t.t_disp_buffer[num_disp_buff],
534                                sizeof(dispatch_shared_info_t) * num_disp_buff,
535                                "%s_%d.t_disp_buffer", header, team_id);
536 }
537 
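// Thin wrappers so the rest of the runtime does not need to know that the
// allocator support is currently backed by the memkind library.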
538 static void __kmp_init_allocator() { __kmp_init_memkind(); }
539 static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
540 
541 /* ------------------------------------------------------------------------ */
542 
543 #if KMP_DYNAMIC_LIB
544 #if KMP_OS_WINDOWS
545 
546 static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
547   // TODO: Change to __kmp_break_bootstrap_lock().
548   __kmp_init_bootstrap_lock(lck); // put the lock into the released state
549 }
550 
551 static void __kmp_reset_locks_on_process_detach(int gtid_req) {
552   int i;
553   int thread_count;
554 
555   // PROCESS_DETACH is expected to be called by a thread that executes
556   // ProcessExit() or FreeLibrary(). The OS terminates the other threads (except
557   // the one calling ProcessExit or FreeLibrary), so it might seem safe to access
558   // __kmp_threads[] without taking the forkjoin_lock. In fact, however, some
559   // threads can still be alive here, although they are about to be terminated;
560   // the entries in the array with ds_thread==0 are the most suspicious. So it
561   // may not actually be safe to access __kmp_threads[].
562 
563   // TODO: does it make sense to check __kmp_roots[] ?
564 
565   // Let's check that there are no other alive threads registered with the OMP
566   // lib.
567   while (1) {
568     thread_count = 0;
569     for (i = 0; i < __kmp_threads_capacity; ++i) {
570       if (!__kmp_threads)
571         continue;
572       kmp_info_t *th = __kmp_threads[i];
573       if (th == NULL)
574         continue;
575       int gtid = th->th.th_info.ds.ds_gtid;
576       if (gtid == gtid_req)
577         continue;
578       if (gtid < 0)
579         continue;
580       DWORD exit_val;
581       int alive = __kmp_is_thread_alive(th, &exit_val);
582       if (alive) {
583         ++thread_count;
584       }
585     }
586     if (thread_count == 0)
587       break; // success
588   }
589 
590   // Assume that I'm alone. Now it might be safe to check and reset locks.
591   // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
592   __kmp_reset_lock(&__kmp_forkjoin_lock);
593 #ifdef KMP_DEBUG
594   __kmp_reset_lock(&__kmp_stdio_lock);
595 #endif // KMP_DEBUG
596 }
597 
598 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
599   //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
600 
601   switch (fdwReason) {
602 
603   case DLL_PROCESS_ATTACH:
604     KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
605 
606     return TRUE;
607 
608   case DLL_PROCESS_DETACH:
609     KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
610 
611     if (lpReserved != NULL) {
612       // lpReserved is used for telling the difference:
613       //   lpReserved == NULL when FreeLibrary() was called,
614       //   lpReserved != NULL when the process terminates.
615       // When FreeLibrary() is called, worker threads remain alive, so they will
616       // release the forkjoin lock by themselves. When the process terminates,
617       // worker threads disappear, triggering the problem of an unreleased
618       // forkjoin lock as described below.
619 
620       // A worker thread can take the forkjoin lock. The problem comes up if
621       // that worker thread dies before it releases the forkjoin lock. The
622       // forkjoin lock remains taken, so the thread executing
623       // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try
624       // to take the forkjoin lock and will always fail, and the application
625       // will never finish [normally]. This scenario is possible if
626       // __kmpc_end() has not been executed. That is not just a corner case;
627       // it happens in common situations:
628       // - the main function was compiled by an alternative compiler;
629       // - the main function was compiled by icl but without /Qopenmp
630       //   (application with plugins);
631       // - the application terminates by calling C exit(), Fortran CALL EXIT(),
632       //   or Fortran STOP;
633       // - an alive foreign thread prevented __kmpc_end from doing cleanup.
634       //
635       // This is a hack to work around the problem.
636       // TODO: !!! figure out something better.
637       __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
638     }
639 
640     __kmp_internal_end_library(__kmp_gtid_get_specific());
641 
642     return TRUE;
643 
644   case DLL_THREAD_ATTACH:
645     KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
646 
647     /* if we wanted to register new sibling threads every time, we would call
648      * __kmp_get_gtid() here */
649     return TRUE;
650 
651   case DLL_THREAD_DETACH:
652     KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
653 
654     __kmp_internal_end_thread(__kmp_gtid_get_specific());
655     return TRUE;
656   }
657 
658   return TRUE;
659 }
660 
661 #endif /* KMP_OS_WINDOWS */
662 #endif /* KMP_DYNAMIC_LIB */
663 
664 /* __kmp_parallel_deo -- Wait until it's our turn. */
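/* Together with __kmp_parallel_dxo below, this implements ordered sections as
   a turn-taking protocol (when BUILD_PARALLEL_ORDERED is defined):
   team->t.t_ordered.dt.t_value holds the tid whose turn it is; deo waits until
   it equals the calling thread's tid, and dxo advances it to the next tid. */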
665 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
666   int gtid = *gtid_ref;
667 #ifdef BUILD_PARALLEL_ORDERED
668   kmp_team_t *team = __kmp_team_from_gtid(gtid);
669 #endif /* BUILD_PARALLEL_ORDERED */
670 
671   if (__kmp_env_consistency_check) {
672     if (__kmp_threads[gtid]->th.th_root->r.r_active)
673 #if KMP_USE_DYNAMIC_LOCK
674       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
675 #else
676       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
677 #endif
678   }
679 #ifdef BUILD_PARALLEL_ORDERED
680   if (!team->t.t_serialized) {
681     KMP_MB();
682     KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
683              NULL);
684     KMP_MB();
685   }
686 #endif /* BUILD_PARALLEL_ORDERED */
687 }
688 
689 /* __kmp_parallel_dxo -- Signal the next task. */
690 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
691   int gtid = *gtid_ref;
692 #ifdef BUILD_PARALLEL_ORDERED
693   int tid = __kmp_tid_from_gtid(gtid);
694   kmp_team_t *team = __kmp_team_from_gtid(gtid);
695 #endif /* BUILD_PARALLEL_ORDERED */
696 
697   if (__kmp_env_consistency_check) {
698     if (__kmp_threads[gtid]->th.th_root->r.r_active)
699       __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
700   }
701 #ifdef BUILD_PARALLEL_ORDERED
702   if (!team->t.t_serialized) {
703     KMP_MB(); /* Flush all pending memory write invalidates.  */
704 
705     /* use the tid of the next thread in this team */
706     /* TODO replace with general release procedure */
707     team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
708 
709     KMP_MB(); /* Flush all pending memory write invalidates.  */
710   }
711 #endif /* BUILD_PARALLEL_ORDERED */
712 }
713 
714 /* ------------------------------------------------------------------------ */
715 /* The BARRIER for a SINGLE process section is always explicit   */
716 
717 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
718   int status;
719   kmp_info_t *th;
720   kmp_team_t *team;
721 
722   if (!TCR_4(__kmp_init_parallel))
723     __kmp_parallel_initialize();
724   __kmp_resume_if_soft_paused();
725 
726   th = __kmp_threads[gtid];
727   team = th->th.th_team;
728   status = 0;
729 
730   th->th.th_ident = id_ref;
731 
732   if (team->t.t_serialized) {
733     status = 1;
734   } else {
735     kmp_int32 old_this = th->th.th_local.this_construct;
736 
737     ++th->th.th_local.this_construct;
738     /* try to set team count to thread count--success means thread got the
739        single block */
740     /* TODO: Should this be acquire or release? */
741     if (team->t.t_construct == old_this) {
742       status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
743                                               th->th.th_local.this_construct);
744     }
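    // At most one thread's compare-and-swap on t_construct succeeds for a
    // given single construct; that thread sees status == 1 and executes the
    // single block, while the others see status == 0.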
745 #if USE_ITT_BUILD
746     if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
747         KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
748         team->t.t_active_level ==
749             1) { // Only report metadata by master of active team at level 1
750       __kmp_itt_metadata_single(id_ref);
751     }
752 #endif /* USE_ITT_BUILD */
753   }
754 
755   if (__kmp_env_consistency_check) {
756     if (status && push_ws) {
757       __kmp_push_workshare(gtid, ct_psingle, id_ref);
758     } else {
759       __kmp_check_workshare(gtid, ct_psingle, id_ref);
760     }
761   }
762 #if USE_ITT_BUILD
763   if (status) {
764     __kmp_itt_single_start(gtid);
765   }
766 #endif /* USE_ITT_BUILD */
767   return status;
768 }
769 
770 void __kmp_exit_single(int gtid) {
771 #if USE_ITT_BUILD
772   __kmp_itt_single_end(gtid);
773 #endif /* USE_ITT_BUILD */
774   if (__kmp_env_consistency_check)
775     __kmp_pop_workshare(gtid, ct_psingle, NULL);
776 }
777 
778 /* determine if we can go parallel or must use a serialized parallel region,
779  * and how many threads we can use
780  * set_nthreads is the number of threads requested for the team
781  * returns 1 if we should serialize or only use one thread,
782  * otherwise the number of threads to use
783  * The forkjoin lock is held by the caller. */
784 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
785                                  int master_tid, int set_nthreads,
786                                  int enter_teams) {
787   int capacity;
788   int new_nthreads;
789   KMP_DEBUG_ASSERT(__kmp_init_serial);
790   KMP_DEBUG_ASSERT(root && parent_team);
791   kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
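  // Overview of the clamps applied below: (1) if dyn-var is set, shrink the
  // request according to dynamic_mode (load balance, thread limit, or random);
  // (2) cap by KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT (__kmp_max_nth); (3) cap
  // by OMP_THREAD_LIMIT (the contention group limit); (4) cap by the capacity
  // of the __kmp_threads array, expanding it if possible.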
792 
793   // If dyn-var is set, dynamically adjust the number of desired threads,
794   // according to the method specified by dynamic_mode.
795   new_nthreads = set_nthreads;
796   if (!get__dynamic_2(parent_team, master_tid)) {
797     ;
798   }
799 #ifdef USE_LOAD_BALANCE
800   else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
801     new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
802     if (new_nthreads == 1) {
803       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
804                     "reservation to 1 thread\n",
805                     master_tid));
806       return 1;
807     }
808     if (new_nthreads < set_nthreads) {
809       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
810                     "reservation to %d threads\n",
811                     master_tid, new_nthreads));
812     }
813   }
814 #endif /* USE_LOAD_BALANCE */
815   else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
816     new_nthreads = __kmp_avail_proc - __kmp_nth +
817                    (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
818     if (new_nthreads <= 1) {
819       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
820                     "reservation to 1 thread\n",
821                     master_tid));
822       return 1;
823     }
824     if (new_nthreads < set_nthreads) {
825       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
826                     "reservation to %d threads\n",
827                     master_tid, new_nthreads));
828     } else {
829       new_nthreads = set_nthreads;
830     }
831   } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
832     if (set_nthreads > 2) {
833       new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
834       new_nthreads = (new_nthreads % set_nthreads) + 1;
835       if (new_nthreads == 1) {
836         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
837                       "reservation to 1 thread\n",
838                       master_tid));
839         return 1;
840       }
841       if (new_nthreads < set_nthreads) {
842         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
843                       "reservation to %d threads\n",
844                       master_tid, new_nthreads));
845       }
846     }
847   } else {
848     KMP_ASSERT(0);
849   }
850 
851   // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
852   if (__kmp_nth + new_nthreads -
853           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
854       __kmp_max_nth) {
855     int tl_nthreads = __kmp_max_nth - __kmp_nth +
856                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
857     if (tl_nthreads <= 0) {
858       tl_nthreads = 1;
859     }
860 
861     // If dyn-var is false, emit a 1-time warning.
862     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
863       __kmp_reserve_warn = 1;
864       __kmp_msg(kmp_ms_warning,
865                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
866                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
867     }
868     if (tl_nthreads == 1) {
869       KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
870                     "reduced reservation to 1 thread\n",
871                     master_tid));
872       return 1;
873     }
874     KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
875                   "reservation to %d threads\n",
876                   master_tid, tl_nthreads));
877     new_nthreads = tl_nthreads;
878   }
879 
880   // Respect OMP_THREAD_LIMIT
881   int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
882   int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
883   if (cg_nthreads + new_nthreads -
884           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
885       max_cg_threads) {
886     int tl_nthreads = max_cg_threads - cg_nthreads +
887                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
888     if (tl_nthreads <= 0) {
889       tl_nthreads = 1;
890     }
891 
892     // If dyn-var is false, emit a 1-time warning.
893     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
894       __kmp_reserve_warn = 1;
895       __kmp_msg(kmp_ms_warning,
896                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
897                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
898     }
899     if (tl_nthreads == 1) {
900       KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
901                     "reduced reservation to 1 thread\n",
902                     master_tid));
903       return 1;
904     }
905     KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
906                   "reservation to %d threads\n",
907                   master_tid, tl_nthreads));
908     new_nthreads = tl_nthreads;
909   }
910 
911   // Check if the threads array is large enough, or needs expanding.
912   // See comment in __kmp_register_root() about the adjustment if
913   // __kmp_threads[0] == NULL.
914   capacity = __kmp_threads_capacity;
915   if (TCR_PTR(__kmp_threads[0]) == NULL) {
916     --capacity;
917   }
918   if (__kmp_nth + new_nthreads -
919           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
920       capacity) {
921     // Expand the threads array.
922     int slotsRequired = __kmp_nth + new_nthreads -
923                         (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
924                         capacity;
925     int slotsAdded = __kmp_expand_threads(slotsRequired);
926     if (slotsAdded < slotsRequired) {
927       // The threads array was not expanded enough.
928       new_nthreads -= (slotsRequired - slotsAdded);
929       KMP_ASSERT(new_nthreads >= 1);
930 
931       // If dyn-var is false, emit a 1-time warning.
932       if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
933         __kmp_reserve_warn = 1;
934         if (__kmp_tp_cached) {
935           __kmp_msg(kmp_ms_warning,
936                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
937                     KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
938                     KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
939         } else {
940           __kmp_msg(kmp_ms_warning,
941                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
942                     KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
943         }
944       }
945     }
946   }
947 
948 #ifdef KMP_DEBUG
949   if (new_nthreads == 1) {
950     KC_TRACE(10,
951              ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
952               "dead roots and rechecking; requested %d threads\n",
953               __kmp_get_gtid(), set_nthreads));
954   } else {
955     KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
956                   " %d threads\n",
957                   __kmp_get_gtid(), new_nthreads, set_nthreads));
958   }
959 #endif // KMP_DEBUG
960   return new_nthreads;
961 }
962 
963 /* Allocate threads from the thread pool and assign them to the new team. We are
964    assured that there are enough threads available, because we checked that
965    earlier while holding the forkjoin critical section. */
966 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
967                                     kmp_info_t *master_th, int master_gtid) {
968   int i;
969   int use_hot_team;
970 
971   KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
972   KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
973   KMP_MB();
974 
975   /* first, let's setup the master thread */
976   master_th->th.th_info.ds.ds_tid = 0;
977   master_th->th.th_team = team;
978   master_th->th.th_team_nproc = team->t.t_nproc;
979   master_th->th.th_team_master = master_th;
980   master_th->th.th_team_serialized = FALSE;
981   master_th->th.th_dispatch = &team->t.t_dispatch[0];
982 
983 /* make sure we are not the optimized hot team */
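/* (A hot team keeps its threads attached between consecutive parallel regions
   so they can be reused without going back through the thread pool.) */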
984 #if KMP_NESTED_HOT_TEAMS
985   use_hot_team = 0;
986   kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
987   if (hot_teams) { // hot teams array is not allocated if
988     // KMP_HOT_TEAMS_MAX_LEVEL=0
989     int level = team->t.t_active_level - 1; // index in array of hot teams
990     if (master_th->th.th_teams_microtask) { // are we inside the teams?
991       if (master_th->th.th_teams_size.nteams > 1) {
992         ++level; // level was not increased in teams construct for
993         // team_of_masters
994       }
995       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
996           master_th->th.th_teams_level == team->t.t_level) {
997         ++level; // level was not increased in teams construct for
998         // team_of_workers before the parallel
999       } // team->t.t_level will be increased inside parallel
1000     }
1001     if (level < __kmp_hot_teams_max_level) {
1002       if (hot_teams[level].hot_team) {
1003         // hot team has already been allocated for given level
1004         KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
1005         use_hot_team = 1; // the team is ready to use
1006       } else {
1007         use_hot_team = 0; // AC: threads are not allocated yet
1008         hot_teams[level].hot_team = team; // remember new hot team
1009         hot_teams[level].hot_team_nth = team->t.t_nproc;
1010       }
1011     } else {
1012       use_hot_team = 0;
1013     }
1014   }
1015 #else
1016   use_hot_team = team == root->r.r_hot_team;
1017 #endif
1018   if (!use_hot_team) {
1019 
1020     /* install the master thread */
1021     team->t.t_threads[0] = master_th;
1022     __kmp_initialize_info(master_th, team, 0, master_gtid);
1023 
1024     /* now, install the worker threads */
1025     for (i = 1; i < team->t.t_nproc; i++) {
1026 
1027       /* fork or reallocate a new thread and install it in team */
1028       kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1029       team->t.t_threads[i] = thr;
1030       KMP_DEBUG_ASSERT(thr);
1031       KMP_DEBUG_ASSERT(thr->th.th_team == team);
1032       /* align team and thread arrived states */
1033       KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1034                     "T#%d(%d:%d) join =%llu, plain=%llu\n",
1035                     __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1036                     __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1037                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1038                     team->t.t_bar[bs_plain_barrier].b_arrived));
1039       thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1040       thr->th.th_teams_level = master_th->th.th_teams_level;
1041       thr->th.th_teams_size = master_th->th.th_teams_size;
1042       { // Initialize threads' barrier data.
1043         int b;
1044         kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1045         for (b = 0; b < bs_last_barrier; ++b) {
1046           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1047           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1048 #if USE_DEBUGGER
1049           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1050 #endif
1051         }
1052       }
1053     }
1054 
1055 #if KMP_AFFINITY_SUPPORTED
1056     __kmp_partition_places(team);
1057 #endif
1058   }
1059 
1060   if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1061     for (i = 0; i < team->t.t_nproc; i++) {
1062       kmp_info_t *thr = team->t.t_threads[i];
1063       if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1064           thr->th.th_prev_level != team->t.t_level) {
1065         team->t.t_display_affinity = 1;
1066         break;
1067       }
1068     }
1069   }
1070 
1071   KMP_MB();
1072 }
1073 
1074 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1075 // Propagate any changes to the floating point control registers out to the team.
1076 // We try to avoid unnecessary writes to the relevant cache line in the team
1077 // structure, so we don't make changes unless they are needed.
1078 inline static void propagateFPControl(kmp_team_t *team) {
1079   if (__kmp_inherit_fp_control) {
1080     kmp_int16 x87_fpu_control_word;
1081     kmp_uint32 mxcsr;
1082 
1083     // Get master values of FPU control flags (both X87 and vector)
1084     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1085     __kmp_store_mxcsr(&mxcsr);
1086     mxcsr &= KMP_X86_MXCSR_MASK;
1087 
1088     // There is no point looking at t_fp_control_saved here.
1089     // If it is TRUE, we still have to update the values if they are different
1090     // from those we now have. If it is FALSE we didn't save anything yet, but
1091     // our objective is the same. We have to ensure that the values in the team
1092     // are the same as those we have.
1093     // So, this code achieves what we need whether or not t_fp_control_saved is
1094     // true. By checking whether the value needs updating we avoid unnecessary
1095     // writes that would put the cache-line into a written state, causing all
1096     // threads in the team to have to read it again.
1097     KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1098     KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1099     // Although we don't use this value, other code in the runtime wants to know
1100     // whether it should restore them. So we must ensure it is correct.
1101     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1102   } else {
1103     // Similarly here. Don't write to this cache-line in the team structure
1104     // unless we have to.
1105     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1106   }
1107 }
1108 
1109 // Do the opposite, setting the hardware registers to the updated values from
1110 // the team.
1111 inline static void updateHWFPControl(kmp_team_t *team) {
1112   if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1113     // Only reset the fp control regs if they have been changed in the team
1114     // during the parallel region that we are exiting.
1115     kmp_int16 x87_fpu_control_word;
1116     kmp_uint32 mxcsr;
1117     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1118     __kmp_store_mxcsr(&mxcsr);
1119     mxcsr &= KMP_X86_MXCSR_MASK;
1120 
1121     if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1122       __kmp_clear_x87_fpu_status_word();
1123       __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1124     }
1125 
1126     if (team->t.t_mxcsr != mxcsr) {
1127       __kmp_load_mxcsr(&team->t.t_mxcsr);
1128     }
1129   }
1130 }
1131 #else
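// No x87/MXCSR control registers to propagate or restore on other
// architectures, so these collapse to no-ops.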
1132 #define propagateFPControl(x) ((void)0)
1133 #define updateHWFPControl(x) ((void)0)
1134 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1135 
1136 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1137                                      int realloc); // forward declaration
1138 
1139 /* Run a parallel region that has been serialized, so it runs only in a team of
1140    the single master thread. */
1141 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1142   kmp_info_t *this_thr;
1143   kmp_team_t *serial_team;
1144 
1145   KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1146 
1147   /* Skip all this code for autopar serialized loops since it results in
1148      unacceptable overhead */
1149   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1150     return;
1151 
1152   if (!TCR_4(__kmp_init_parallel))
1153     __kmp_parallel_initialize();
1154   __kmp_resume_if_soft_paused();
1155 
1156   this_thr = __kmp_threads[global_tid];
1157   serial_team = this_thr->th.th_serial_team;
1158 
1159   /* utilize the serialized team held by this thread */
1160   KMP_DEBUG_ASSERT(serial_team);
1161   KMP_MB();
1162 
1163   if (__kmp_tasking_mode != tskm_immediate_exec) {
1164     KMP_DEBUG_ASSERT(
1165         this_thr->th.th_task_team ==
1166         this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1167     KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1168                      NULL);
1169     KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1170                   "team %p, new task_team = NULL\n",
1171                   global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1172     this_thr->th.th_task_team = NULL;
1173   }
1174 
1175   kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1176   if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1177     proc_bind = proc_bind_false;
1178   } else if (proc_bind == proc_bind_default) {
1179     // No proc_bind clause was specified, so use the current value
1180     // of proc-bind-var for this parallel region.
1181     proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1182   }
1183   // Reset for next parallel region
1184   this_thr->th.th_set_proc_bind = proc_bind_default;
1185 
1186 #if OMPT_SUPPORT
1187   ompt_data_t ompt_parallel_data = ompt_data_none;
1188   ompt_data_t *implicit_task_data;
1189   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1190   if (ompt_enabled.enabled &&
1191       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1192 
1193     ompt_task_info_t *parent_task_info;
1194     parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1195 
1196     parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1197     if (ompt_enabled.ompt_callback_parallel_begin) {
1198       int team_size = 1;
1199 
1200       ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1201           &(parent_task_info->task_data), &(parent_task_info->frame),
1202           &ompt_parallel_data, team_size,
1203           ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1204     }
1205   }
1206 #endif // OMPT_SUPPORT
1207 
1208   if (this_thr->th.th_team != serial_team) {
1209     // Nested level will be an index in the nested nthreads array
1210     int level = this_thr->th.th_team->t.t_level;
1211 
1212     if (serial_team->t.t_serialized) {
1213       /* this serial team was already used
1214          TODO: increase performance by making these locks more specific */
1215       kmp_team_t *new_team;
1216 
1217       __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1218 
1219       new_team =
1220           __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1221 #if OMPT_SUPPORT
1222                               ompt_parallel_data,
1223 #endif
1224                               proc_bind, &this_thr->th.th_current_task->td_icvs,
1225                               0 USE_NESTED_HOT_ARG(NULL));
1226       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1227       KMP_ASSERT(new_team);
1228 
1229       /* setup new serialized team and install it */
1230       new_team->t.t_threads[0] = this_thr;
1231       new_team->t.t_parent = this_thr->th.th_team;
1232       serial_team = new_team;
1233       this_thr->th.th_serial_team = serial_team;
1234 
1235       KF_TRACE(
1236           10,
1237           ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1238            global_tid, serial_team));
1239 
1240       /* TODO: the above breaks the requirement that serialized teams still
1241          work even if we run out of resources, since here we may need to
1242          allocate a new one */
1243     } else {
1244       KF_TRACE(
1245           10,
1246           ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1247            global_tid, serial_team));
1248     }
1249 
1250     /* we have to initialize this serial team */
1251     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1252     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1253     KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1254     serial_team->t.t_ident = loc;
1255     serial_team->t.t_serialized = 1;
1256     serial_team->t.t_nproc = 1;
1257     serial_team->t.t_parent = this_thr->th.th_team;
1258     serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1259     this_thr->th.th_team = serial_team;
1260     serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1261 
1262     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1263                   this_thr->th.th_current_task));
1264     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1265     this_thr->th.th_current_task->td_flags.executing = 0;
1266 
1267     __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1268 
1269     /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1270        implicit task for each serialized task represented by
1271        team->t.t_serialized? */
1272     copy_icvs(&this_thr->th.th_current_task->td_icvs,
1273               &this_thr->th.th_current_task->td_parent->td_icvs);
1274 
1275     // Thread value exists in the nested nthreads array for the next nested
1276     // level
1277     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1278       this_thr->th.th_current_task->td_icvs.nproc =
1279           __kmp_nested_nth.nth[level + 1];
1280     }
1281 
1282     if (__kmp_nested_proc_bind.used &&
1283         (level + 1 < __kmp_nested_proc_bind.used)) {
1284       this_thr->th.th_current_task->td_icvs.proc_bind =
1285           __kmp_nested_proc_bind.bind_types[level + 1];
1286     }
1287 
1288 #if USE_DEBUGGER
1289     serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1290 #endif
1291     this_thr->th.th_info.ds.ds_tid = 0;
1292 
1293     /* set thread cache values */
1294     this_thr->th.th_team_nproc = 1;
1295     this_thr->th.th_team_master = this_thr;
1296     this_thr->th.th_team_serialized = 1;
1297 
1298     serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1299     serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1300     serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1301 
1302     propagateFPControl(serial_team);
1303 
1304     /* check if we need to allocate dispatch buffers stack */
1305     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1306     if (!serial_team->t.t_dispatch->th_disp_buffer) {
1307       serial_team->t.t_dispatch->th_disp_buffer =
1308           (dispatch_private_info_t *)__kmp_allocate(
1309               sizeof(dispatch_private_info_t));
1310     }
1311     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1312 
1313     KMP_MB();
1314 
1315   } else {
1316     /* this serialized team is already being used,
1317      * that's fine, just add another nested level */
1318     KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1319     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1320     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1321     ++serial_team->t.t_serialized;
1322     this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1323 
1324     // Nested level will be an index in the nested nthreads array
1325     int level = this_thr->th.th_team->t.t_level;
1326     // Thread value exists in the nested nthreads array for the next nested
1327     // level
1328     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1329       this_thr->th.th_current_task->td_icvs.nproc =
1330           __kmp_nested_nth.nth[level + 1];
1331     }
1332     serial_team->t.t_level++;
1333     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1334                   "of serial team %p to %d\n",
1335                   global_tid, serial_team, serial_team->t.t_level));
1336 
1337     /* allocate/push dispatch buffers stack */
1338     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1339     {
1340       dispatch_private_info_t *disp_buffer =
1341           (dispatch_private_info_t *)__kmp_allocate(
1342               sizeof(dispatch_private_info_t));
1343       disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1344       serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1345     }
1346     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1347 
1348     KMP_MB();
1349   }
1350   KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1351 
1352   // Perform the display affinity functionality for
1353   // serialized parallel regions
1354   if (__kmp_display_affinity) {
1355     if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1356         this_thr->th.th_prev_num_threads != 1) {
1357       // NULL means use the affinity-format-var ICV
1358       __kmp_aux_display_affinity(global_tid, NULL);
1359       this_thr->th.th_prev_level = serial_team->t.t_level;
1360       this_thr->th.th_prev_num_threads = 1;
1361     }
1362   }
1363 
1364   if (__kmp_env_consistency_check)
1365     __kmp_push_parallel(global_tid, NULL);
1366 #if OMPT_SUPPORT
1367   serial_team->t.ompt_team_info.master_return_address = codeptr;
1368   if (ompt_enabled.enabled &&
1369       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1370     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1371 
1372     ompt_lw_taskteam_t lw_taskteam;
1373     __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1374                             &ompt_parallel_data, codeptr);
1375 
1376     __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1377     // don't use lw_taskteam after linking; its content was swapped
1378 
1379     /* OMPT implicit task begin */
1380     implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
1381     if (ompt_enabled.ompt_callback_implicit_task) {
1382       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1383           ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1384           OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1385       OMPT_CUR_TASK_INFO(this_thr)
1386           ->thread_num = __kmp_tid_from_gtid(global_tid);
1387     }
1388 
1389     /* OMPT state */
1390     this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1391     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1392   }
1393 #endif
1394 }
1395 
1396 /* most of the work for a fork */
1397 /* return true if we really went parallel, false if serialized */
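/* In outline, the code handles three cases:
   1) a parallel region closely nested inside a teams construct, which re-uses
      the hot parent team;
   2) a serialized parallel region (nthreads == 1), in which the master thread
      invokes the microtask directly; and
   3) a regular fork, in which a new team is allocated and forked and, unless
      called from the GNU interface, the master then invokes its share of the
      microtask. */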
1398 int __kmp_fork_call(ident_t *loc, int gtid,
1399                     enum fork_context_e call_context, // Intel, GNU, ...
1400                     kmp_int32 argc, microtask_t microtask, launch_t invoker,
1401                     kmp_va_list ap) {
1402   void **argv;
1403   int i;
1404   int master_tid;
1405   int master_this_cons;
1406   kmp_team_t *team;
1407   kmp_team_t *parent_team;
1408   kmp_info_t *master_th;
1409   kmp_root_t *root;
1410   int nthreads;
1411   int master_active;
1412   int master_set_numthreads;
1413   int level;
1414   int active_level;
1415   int teams_level;
1416 #if KMP_NESTED_HOT_TEAMS
1417   kmp_hot_team_ptr_t **p_hot_teams;
1418 #endif
1419   { // KMP_TIME_BLOCK
1420     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1421     KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1422 
1423     KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1424     if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
      /* Some systems prefer the stack for the root thread(s) to start with
         some gap from the parent stack to prevent false sharing. */
1427       void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1428       /* These 2 lines below are so this does not get optimized out */
1429       if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1430         __kmp_stkpadding += (short)((kmp_int64)dummy);
1431     }
1432 
1433     /* initialize if needed */
1434     KMP_DEBUG_ASSERT(
1435         __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1436     if (!TCR_4(__kmp_init_parallel))
1437       __kmp_parallel_initialize();
1438     __kmp_resume_if_soft_paused();
1439 
1440     /* setup current data */
1441     master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1442     // shutdown
1443     parent_team = master_th->th.th_team;
1444     master_tid = master_th->th.th_info.ds.ds_tid;
1445     master_this_cons = master_th->th.th_local.this_construct;
1446     root = master_th->th.th_root;
1447     master_active = root->r.r_active;
1448     master_set_numthreads = master_th->th.th_set_nproc;
1449 
1450 #if OMPT_SUPPORT
1451     ompt_data_t ompt_parallel_data = ompt_data_none;
1452     ompt_data_t *parent_task_data;
1453     ompt_frame_t *ompt_frame;
1454     ompt_data_t *implicit_task_data;
1455     void *return_address = NULL;
1456 
1457     if (ompt_enabled.enabled) {
1458       __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1459                                     NULL, NULL);
1460       return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1461     }
1462 #endif
1463 
1464     // Nested level will be an index in the nested nthreads array
1465     level = parent_team->t.t_level;
    // used to launch non-serial teams even if nesting is not allowed
1467     active_level = parent_team->t.t_active_level;
1468     // needed to check nesting inside the teams
1469     teams_level = master_th->th.th_teams_level;
1470 #if KMP_NESTED_HOT_TEAMS
1471     p_hot_teams = &master_th->th.th_hot_teams;
1472     if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1473       *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1474           sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1475       (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
      // it is either the actual hot team, or not needed when active_level > 0
1477       (*p_hot_teams)[0].hot_team_nth = 1;
1478     }
1479 #endif
1480 
1481 #if OMPT_SUPPORT
1482     if (ompt_enabled.enabled) {
1483       if (ompt_enabled.ompt_callback_parallel_begin) {
1484         int team_size = master_set_numthreads
1485                             ? master_set_numthreads
1486                             : get__nproc_2(parent_team, master_tid);
1487         int flags = OMPT_INVOKER(call_context) |
1488                     ((microtask == (microtask_t)__kmp_teams_master)
1489                          ? ompt_parallel_league
1490                          : ompt_parallel_team);
1491         ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1492             parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1493             return_address);
1494       }
1495       master_th->th.ompt_thread_info.state = ompt_state_overhead;
1496     }
1497 #endif
1498 
1499     master_th->th.th_ident = loc;
1500 
1501     if (master_th->th.th_teams_microtask && ap &&
1502         microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
      // AC: This is the start of a parallel region nested inside a teams
      // construct. The team is actual (hot); all workers are ready at the
      // fork barrier. No lock is needed to initialize the team a bit, then
      // release the workers.
1506       parent_team->t.t_ident = loc;
1507       __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1508       parent_team->t.t_argc = argc;
1509       argv = (void **)parent_team->t.t_argv;
1510       for (i = argc - 1; i >= 0; --i)
1511         *argv++ = va_arg(kmp_va_deref(ap), void *);
      // Increment our nested depth level, but do not increase the
      // serialization count
1513       if (parent_team == master_th->th.th_serial_team) {
1514         // AC: we are in serialized parallel
1515         __kmpc_serialized_parallel(loc, gtid);
1516         KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1517 
1518         if (call_context == fork_context_gnu) {
1519           // AC: need to decrement t_serialized for enquiry functions to work
1520           // correctly, will restore at join time
1521           parent_team->t.t_serialized--;
1522           return TRUE;
1523         }
1524 
1525 #if OMPT_SUPPORT
1526         void *dummy;
1527         void **exit_frame_p;
1528 
1529         ompt_lw_taskteam_t lw_taskteam;
1530 
1531         if (ompt_enabled.enabled) {
1532           __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1533                                   &ompt_parallel_data, return_address);
1534           exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1535 
1536           __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
          // don't use lw_taskteam after linking; its content was swapped
1538 
1539           /* OMPT implicit task begin */
1540           implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1541           if (ompt_enabled.ompt_callback_implicit_task) {
1542             OMPT_CUR_TASK_INFO(master_th)
1543                 ->thread_num = __kmp_tid_from_gtid(gtid);
1544             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1545                 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1546                 implicit_task_data, 1,
1547                 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1548           }
1549 
1550           /* OMPT state */
1551           master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1552         } else {
1553           exit_frame_p = &dummy;
1554         }
1555 #endif
1556         // AC: need to decrement t_serialized for enquiry functions to work
1557         // correctly, will restore at join time
1558         parent_team->t.t_serialized--;
1559 
1560         {
1561           KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1562           KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1563           __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1564 #if OMPT_SUPPORT
1565                                  ,
1566                                  exit_frame_p
1567 #endif
1568                                  );
1569         }
1570 
1571 #if OMPT_SUPPORT
1572         if (ompt_enabled.enabled) {
1573           *exit_frame_p = NULL;
1574           OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1575           if (ompt_enabled.ompt_callback_implicit_task) {
1576             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1577                 ompt_scope_end, NULL, implicit_task_data, 1,
1578                 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1579           }
1580           ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1581           __ompt_lw_taskteam_unlink(master_th);
1582           if (ompt_enabled.ompt_callback_parallel_end) {
1583             ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1584                 &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1585                 OMPT_INVOKER(call_context) | ompt_parallel_team,
1586                 return_address);
1587           }
1588           master_th->th.ompt_thread_info.state = ompt_state_overhead;
1589         }
1590 #endif
1591         return TRUE;
1592       }
1593 
1594       parent_team->t.t_pkfn = microtask;
1595       parent_team->t.t_invoke = invoker;
1596       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1597       parent_team->t.t_active_level++;
1598       parent_team->t.t_level++;
1599       parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1600 
1601 #if OMPT_SUPPORT
1602       if (ompt_enabled.enabled) {
1603         ompt_lw_taskteam_t lw_taskteam;
1604         __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1605                                 &ompt_parallel_data, return_address);
1606         __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1607       }
1608 #endif
1609 
1610       /* Change number of threads in the team if requested */
1611       if (master_set_numthreads) { // The parallel has num_threads clause
1612         if (master_set_numthreads < master_th->th.th_teams_size.nth) {
          // AC: we can only reduce the number of threads dynamically; we
          // cannot increase it
1614           kmp_info_t **other_threads = parent_team->t.t_threads;
1615           parent_team->t.t_nproc = master_set_numthreads;
1616           for (i = 0; i < master_set_numthreads; ++i) {
1617             other_threads[i]->th.th_team_nproc = master_set_numthreads;
1618           }
1619           // Keep extra threads hot in the team for possible next parallels
1620         }
1621         master_th->th.th_set_nproc = 0;
1622       }
1623 
1624 #if USE_DEBUGGER
1625       if (__kmp_debugging) { // Let debugger override number of threads.
1626         int nth = __kmp_omp_num_threads(loc);
1627         if (nth > 0) { // 0 means debugger doesn't want to change num threads
1628           master_set_numthreads = nth;
1629         }
1630       }
1631 #endif
1632 
1633 #if USE_ITT_BUILD
1634       if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1635            KMP_ITT_DEBUG) &&
1636           __kmp_forkjoin_frames_mode == 3 &&
1637           parent_team->t.t_active_level == 1 // only report frames at level 1
1638           && master_th->th.th_teams_size.nteams == 1) {
1639         kmp_uint64 tmp_time = __itt_get_timestamp();
1640         master_th->th.th_frame_time = tmp_time;
1641         parent_team->t.t_region_time = tmp_time;
1642       }
1643       if (__itt_stack_caller_create_ptr) {
1644         // create new stack stitching id before entering fork barrier
1645         parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1646       }
1647 #endif /* USE_ITT_BUILD */
1648 
1649       KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1650                     "master_th=%p, gtid=%d\n",
1651                     root, parent_team, master_th, gtid));
1652       __kmp_internal_fork(loc, gtid, parent_team);
1653       KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1654                     "master_th=%p, gtid=%d\n",
1655                     root, parent_team, master_th, gtid));
1656 
1657       if (call_context == fork_context_gnu)
1658         return TRUE;
1659 
1660       /* Invoke microtask for MASTER thread */
1661       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1662                     parent_team->t.t_id, parent_team->t.t_pkfn));
1663 
1664       if (!parent_team->t.t_invoke(gtid)) {
1665         KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
1666       }
1667       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1668                     parent_team->t.t_id, parent_team->t.t_pkfn));
1669       KMP_MB(); /* Flush all pending memory write invalidates.  */
1670 
1671       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1672 
1673       return TRUE;
1674     } // Parallel closely nested in teams construct
1675 
1676 #if KMP_DEBUG
1677     if (__kmp_tasking_mode != tskm_immediate_exec) {
1678       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1679                        parent_team->t.t_task_team[master_th->th.th_task_state]);
1680     }
1681 #endif
1682 
1683     if (parent_team->t.t_active_level >=
1684         master_th->th.th_current_task->td_icvs.max_active_levels) {
1685       nthreads = 1;
1686     } else {
1687       int enter_teams = ((ap == NULL && active_level == 0) ||
1688                          (ap && teams_level > 0 && teams_level == level));
1689       nthreads =
1690           master_set_numthreads
1691               ? master_set_numthreads
1692               : get__nproc_2(
1693                     parent_team,
1694                     master_tid); // TODO: get nproc directly from current task
1695 
      // Check if we need to take the fork/join lock (there is no need for a
      // serialized parallel outside of a teams construct). This code was moved
      // here from __kmp_reserve_threads() to speed up nested serialized
      // parallels.
1699       if (nthreads > 1) {
1700         if ((get__max_active_levels(master_th) == 1 &&
1701              (root->r.r_in_parallel && !enter_teams)) ||
1702             (__kmp_library == library_serial)) {
1703           KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1704                         " threads\n",
1705                         gtid, nthreads));
1706           nthreads = 1;
1707         }
1708       }
1709       if (nthreads > 1) {
1710         /* determine how many new threads we can use */
1711         __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
        /* AC: If we execute teams from a parallel region (on the host), then
           the teams should be created, but each can have only 1 thread if
           nesting is disabled. If teams is called from a serial region, then
           the teams and their threads should be created regardless of the
           nesting setting. */
1716         nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
1717                                          nthreads, enter_teams);
1718         if (nthreads == 1) {
          // Release the lock for single-thread execution here; for
          // multi-thread execution it will be released later, after the team
          // of threads has been created and initialized.
1722           __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1723         }
1724       }
1725     }
1726     KMP_DEBUG_ASSERT(nthreads > 0);
1727 
1728     // If we temporarily changed the set number of threads then restore it now
1729     master_th->th.th_set_nproc = 0;
1730 
1731     /* create a serialized parallel region? */
1732     if (nthreads == 1) {
1733 /* josh todo: hypothetical question: what do we do for OS X*? */
1734 #if KMP_OS_LINUX &&                                                            \
1735     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1736       void *args[argc];
1737 #else
1738       void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1739 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1740           KMP_ARCH_AARCH64) */
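      /* The args buffer below is allocated on the stack: as a variable-length
         array on the Linux targets listed above, with KMP_ALLOCA everywhere
         else. It collects the microtask arguments when they are passed in via
         varargs. */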
1741 
1742       KA_TRACE(20,
1743                ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1744 
1745       __kmpc_serialized_parallel(loc, gtid);
1746 
1747       if (call_context == fork_context_intel) {
1748         /* TODO this sucks, use the compiler itself to pass args! :) */
1749         master_th->th.th_serial_team->t.t_ident = loc;
1750         if (!ap) {
1751           // revert change made in __kmpc_serialized_parallel()
1752           master_th->th.th_serial_team->t.t_level--;
1753 // Get args from parent team for teams construct
1754 
1755 #if OMPT_SUPPORT
1756           void *dummy;
1757           void **exit_frame_p;
1758           ompt_task_info_t *task_info;
1759 
1760           ompt_lw_taskteam_t lw_taskteam;
1761 
1762           if (ompt_enabled.enabled) {
1763             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1764                                     &ompt_parallel_data, return_address);
1765 
1766             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
            // don't use lw_taskteam after linking; its content was swapped
1768 
1769             task_info = OMPT_CUR_TASK_INFO(master_th);
1770             exit_frame_p = &(task_info->frame.exit_frame.ptr);
1771             if (ompt_enabled.ompt_callback_implicit_task) {
1772               OMPT_CUR_TASK_INFO(master_th)
1773                   ->thread_num = __kmp_tid_from_gtid(gtid);
1774               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1775                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1776                   &(task_info->task_data), 1,
1777                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1778                   ompt_task_implicit);
1779             }
1780 
1781             /* OMPT state */
1782             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1783           } else {
1784             exit_frame_p = &dummy;
1785           }
1786 #endif
1787 
1788           {
1789             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1790             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1791             __kmp_invoke_microtask(microtask, gtid, 0, argc,
1792                                    parent_team->t.t_argv
1793 #if OMPT_SUPPORT
1794                                    ,
1795                                    exit_frame_p
1796 #endif
1797                                    );
1798           }
1799 
1800 #if OMPT_SUPPORT
1801           if (ompt_enabled.enabled) {
1802             *exit_frame_p = NULL;
1803             if (ompt_enabled.ompt_callback_implicit_task) {
1804               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1805                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1806                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1807                   ompt_task_implicit);
1808             }
1809             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1810             __ompt_lw_taskteam_unlink(master_th);
1811             if (ompt_enabled.ompt_callback_parallel_end) {
1812               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1813                   &ompt_parallel_data, parent_task_data,
1814                   OMPT_INVOKER(call_context) | ompt_parallel_team,
1815                   return_address);
1816             }
1817             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1818           }
1819 #endif
1820         } else if (microtask == (microtask_t)__kmp_teams_master) {
1821           KMP_DEBUG_ASSERT(master_th->th.th_team ==
1822                            master_th->th.th_serial_team);
1823           team = master_th->th.th_team;
1824           // team->t.t_pkfn = microtask;
1825           team->t.t_invoke = invoker;
1826           __kmp_alloc_argv_entries(argc, team, TRUE);
1827           team->t.t_argc = argc;
1828           argv = (void **)team->t.t_argv;
1829           if (ap) {
1830             for (i = argc - 1; i >= 0; --i)
1831               *argv++ = va_arg(kmp_va_deref(ap), void *);
1832           } else {
1833             for (i = 0; i < argc; ++i)
1834               // Get args from parent team for teams construct
1835               argv[i] = parent_team->t.t_argv[i];
1836           }
1837           // AC: revert change made in __kmpc_serialized_parallel()
1838           //     because initial code in teams should have level=0
1839           team->t.t_level--;
1840           // AC: call special invoker for outer "parallel" of teams construct
1841           invoker(gtid);
1842 #if OMPT_SUPPORT
1843           if (ompt_enabled.enabled) {
1844             ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1845             if (ompt_enabled.ompt_callback_implicit_task) {
1846               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1847                   ompt_scope_end, NULL, &(task_info->task_data), 0,
1848                   OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1849             }
1850             if (ompt_enabled.ompt_callback_parallel_end) {
1851               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1852                   &ompt_parallel_data, parent_task_data,
1853                   OMPT_INVOKER(call_context) | ompt_parallel_league,
1854                   return_address);
1855             }
1856             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1857           }
1858 #endif
1859         } else {
1860           argv = args;
1861           for (i = argc - 1; i >= 0; --i)
1862             *argv++ = va_arg(kmp_va_deref(ap), void *);
1863           KMP_MB();
1864 
1865 #if OMPT_SUPPORT
1866           void *dummy;
1867           void **exit_frame_p;
1868           ompt_task_info_t *task_info;
1869 
1870           ompt_lw_taskteam_t lw_taskteam;
1871 
1872           if (ompt_enabled.enabled) {
1873             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1874                                     &ompt_parallel_data, return_address);
1875             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
            // don't use lw_taskteam after linking; its content was swapped
1877             task_info = OMPT_CUR_TASK_INFO(master_th);
1878             exit_frame_p = &(task_info->frame.exit_frame.ptr);
1879 
1880             /* OMPT implicit task begin */
1881             implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1882             if (ompt_enabled.ompt_callback_implicit_task) {
1883               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1884                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1885                   implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1886                   ompt_task_implicit);
1887               OMPT_CUR_TASK_INFO(master_th)
1888                   ->thread_num = __kmp_tid_from_gtid(gtid);
1889             }
1890 
1891             /* OMPT state */
1892             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1893           } else {
1894             exit_frame_p = &dummy;
1895           }
1896 #endif
1897 
1898           {
1899             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1900             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1901             __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1902 #if OMPT_SUPPORT
1903                                    ,
1904                                    exit_frame_p
1905 #endif
1906                                    );
1907           }
1908 
1909 #if OMPT_SUPPORT
1910           if (ompt_enabled.enabled) {
1911             *exit_frame_p = NULL;
1912             if (ompt_enabled.ompt_callback_implicit_task) {
1913               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1914                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1915                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1916                   ompt_task_implicit);
1917             }
1918 
1919             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1920             __ompt_lw_taskteam_unlink(master_th);
1921             if (ompt_enabled.ompt_callback_parallel_end) {
1922               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1923                   &ompt_parallel_data, parent_task_data,
1924                   OMPT_INVOKER(call_context) | ompt_parallel_team,
1925                   return_address);
1926             }
1927             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1928           }
1929 #endif
1930         }
1931       } else if (call_context == fork_context_gnu) {
1932 #if OMPT_SUPPORT
1933         ompt_lw_taskteam_t lwt;
1934         __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1935                                 return_address);
1936 
1937         lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1938         __ompt_lw_taskteam_link(&lwt, master_th, 1);
// don't use lw_taskteam after linking; its content was swapped
1940 #endif
1941 
1942         // we were called from GNU native code
1943         KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1944         return FALSE;
1945       } else {
1946         KMP_ASSERT2(call_context < fork_context_last,
1947                     "__kmp_fork_call: unknown fork_context parameter");
1948       }
1949 
1950       KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1951       KMP_MB();
1952       return FALSE;
1953     } // if (nthreads == 1)
1954 
    // GEH: only modify the executing flag when not serialized;
    //      the serialized case is handled in __kmpc_serialized_parallel
1957     KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1958                   "curtask=%p, curtask_max_aclevel=%d\n",
1959                   parent_team->t.t_active_level, master_th,
1960                   master_th->th.th_current_task,
1961                   master_th->th.th_current_task->td_icvs.max_active_levels));
1962     // TODO: GEH - cannot do this assertion because root thread not set up as
1963     // executing
1964     // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1965     master_th->th.th_current_task->td_flags.executing = 0;
1966 
1967     if (!master_th->th.th_teams_microtask || level > teams_level) {
1968       /* Increment our nested depth level */
1969       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1970     }
1971 
1972     // See if we need to make a copy of the ICVs.
1973     int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1974     if ((level + 1 < __kmp_nested_nth.used) &&
1975         (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1976       nthreads_icv = __kmp_nested_nth.nth[level + 1];
1977     } else {
1978       nthreads_icv = 0; // don't update
1979     }
1980 
1981     // Figure out the proc_bind_policy for the new team.
1982     kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1983     kmp_proc_bind_t proc_bind_icv =
1984         proc_bind_default; // proc_bind_default means don't update
1985     if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1986       proc_bind = proc_bind_false;
1987     } else {
1988       if (proc_bind == proc_bind_default) {
1989         // No proc_bind clause specified; use current proc-bind-var for this
1990         // parallel region
1991         proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1992       }
1993       /* else: The proc_bind policy was specified explicitly on parallel clause.
1994          This overrides proc-bind-var for this parallel region, but does not
1995          change proc-bind-var. */
1996       // Figure the value of proc-bind-var for the child threads.
1997       if ((level + 1 < __kmp_nested_proc_bind.used) &&
1998           (__kmp_nested_proc_bind.bind_types[level + 1] !=
1999            master_th->th.th_current_task->td_icvs.proc_bind)) {
2000         proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2001       }
2002     }
2003 
2004     // Reset for next parallel region
2005     master_th->th.th_set_proc_bind = proc_bind_default;
2006 
2007     if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
2008       kmp_internal_control_t new_icvs;
2009       copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2010       new_icvs.next = NULL;
2011       if (nthreads_icv > 0) {
2012         new_icvs.nproc = nthreads_icv;
2013       }
2014       if (proc_bind_icv != proc_bind_default) {
2015         new_icvs.proc_bind = proc_bind_icv;
2016       }
2017 
2018       /* allocate a new parallel team */
2019       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2020       team = __kmp_allocate_team(root, nthreads, nthreads,
2021 #if OMPT_SUPPORT
2022                                  ompt_parallel_data,
2023 #endif
2024                                  proc_bind, &new_icvs,
2025                                  argc USE_NESTED_HOT_ARG(master_th));
2026     } else {
2027       /* allocate a new parallel team */
2028       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2029       team = __kmp_allocate_team(root, nthreads, nthreads,
2030 #if OMPT_SUPPORT
2031                                  ompt_parallel_data,
2032 #endif
2033                                  proc_bind,
2034                                  &master_th->th.th_current_task->td_icvs,
2035                                  argc USE_NESTED_HOT_ARG(master_th));
2036     }
2037     KF_TRACE(
2038         10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2039 
2040     /* setup the new team */
2041     KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2042     KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2043     KMP_CHECK_UPDATE(team->t.t_ident, loc);
2044     KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2045     KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2046 #if OMPT_SUPPORT
2047     KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2048                           return_address);
2049 #endif
2050     KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2051     // TODO: parent_team->t.t_level == INT_MAX ???
2052     if (!master_th->th.th_teams_microtask || level > teams_level) {
2053       int new_level = parent_team->t.t_level + 1;
2054       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2055       new_level = parent_team->t.t_active_level + 1;
2056       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2057     } else {
2058       // AC: Do not increase parallel level at start of the teams construct
2059       int new_level = parent_team->t.t_level;
2060       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2061       new_level = parent_team->t.t_active_level;
2062       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2063     }
2064     kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2065     // set master's schedule as new run-time schedule
2066     KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2067 
2068     KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2069     KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2070 
2071     // Update the floating point rounding in the team if required.
2072     propagateFPControl(team);
2073 
2074     if (__kmp_tasking_mode != tskm_immediate_exec) {
      // Set the master's task team to the team's task team. Unless this is a
      // hot team, it should be NULL.
2077       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2078                        parent_team->t.t_task_team[master_th->th.th_task_state]);
2079       KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
2080                     "%p, new task_team %p / team %p\n",
2081                     __kmp_gtid_from_thread(master_th),
2082                     master_th->th.th_task_team, parent_team,
2083                     team->t.t_task_team[master_th->th.th_task_state], team));
2084 
2085       if (active_level || master_th->th.th_task_team) {
2086         // Take a memo of master's task_state
2087         KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
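        // If the memo stack is full, grow it geometrically: allocate a new
        // stack of twice the size, copy the saved states over, and zero-fill
        // the remainder.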
2088         if (master_th->th.th_task_state_top >=
2089             master_th->th.th_task_state_stack_sz) { // increase size
2090           kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2091           kmp_uint8 *old_stack, *new_stack;
2092           kmp_uint32 i;
2093           new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2094           for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2095             new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2096           }
2097           for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2098                ++i) { // zero-init rest of stack
2099             new_stack[i] = 0;
2100           }
2101           old_stack = master_th->th.th_task_state_memo_stack;
2102           master_th->th.th_task_state_memo_stack = new_stack;
2103           master_th->th.th_task_state_stack_sz = new_size;
2104           __kmp_free(old_stack);
2105         }
2106         // Store master's task_state on stack
2107         master_th->th
2108             .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2109             master_th->th.th_task_state;
2110         master_th->th.th_task_state_top++;
2111 #if KMP_NESTED_HOT_TEAMS
2112         if (master_th->th.th_hot_teams &&
2113             active_level < __kmp_hot_teams_max_level &&
2114             team == master_th->th.th_hot_teams[active_level].hot_team) {
2115           // Restore master's nested state if nested hot team
2116           master_th->th.th_task_state =
2117               master_th->th
2118                   .th_task_state_memo_stack[master_th->th.th_task_state_top];
2119         } else {
2120 #endif
2121           master_th->th.th_task_state = 0;
2122 #if KMP_NESTED_HOT_TEAMS
2123         }
2124 #endif
2125       }
2126 #if !KMP_NESTED_HOT_TEAMS
2127       KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2128                        (team == root->r.r_hot_team));
2129 #endif
2130     }
2131 
2132     KA_TRACE(
2133         20,
2134         ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2135          gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2136          team->t.t_nproc));
2137     KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2138                      (team->t.t_master_tid == 0 &&
2139                       (team->t.t_parent == root->r.r_root_team ||
2140                        team->t.t_parent->t.t_serialized)));
2141     KMP_MB();
2142 
2143     /* now, setup the arguments */
2144     argv = (void **)team->t.t_argv;
2145     if (ap) {
2146       for (i = argc - 1; i >= 0; --i) {
2147         void *new_argv = va_arg(kmp_va_deref(ap), void *);
2148         KMP_CHECK_UPDATE(*argv, new_argv);
2149         argv++;
2150       }
2151     } else {
2152       for (i = 0; i < argc; ++i) {
2153         // Get args from parent team for teams construct
2154         KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2155       }
2156     }
2157 
2158     /* now actually fork the threads */
2159     KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2160     if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2161       root->r.r_active = TRUE;
2162 
2163     __kmp_fork_team_threads(root, team, master_th, gtid);
2164     __kmp_setup_icv_copy(team, nthreads,
2165                          &master_th->th.th_current_task->td_icvs, loc);
2166 
2167 #if OMPT_SUPPORT
2168     master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2169 #endif
2170 
2171     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2172 
2173 #if USE_ITT_BUILD
2174     if (team->t.t_active_level == 1 // only report frames at level 1
2175         && !master_th->th.th_teams_microtask) { // not in teams construct
2176 #if USE_ITT_NOTIFY
2177       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2178           (__kmp_forkjoin_frames_mode == 3 ||
2179            __kmp_forkjoin_frames_mode == 1)) {
2180         kmp_uint64 tmp_time = 0;
2181         if (__itt_get_timestamp_ptr)
2182           tmp_time = __itt_get_timestamp();
2183         // Internal fork - report frame begin
2184         master_th->th.th_frame_time = tmp_time;
2185         if (__kmp_forkjoin_frames_mode == 3)
2186           team->t.t_region_time = tmp_time;
2187       } else
2188 // only one notification scheme (either "submit" or "forking/joined", not both)
2189 #endif /* USE_ITT_NOTIFY */
2190           if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2191               __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2192         // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2193         __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2194       }
2195     }
2196 #endif /* USE_ITT_BUILD */
2197 
2198     /* now go on and do the work */
2199     KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2200     KMP_MB();
2201     KF_TRACE(10,
2202              ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2203               root, team, master_th, gtid));
2204 
2205 #if USE_ITT_BUILD
2206     if (__itt_stack_caller_create_ptr) {
2207       team->t.t_stack_id =
2208           __kmp_itt_stack_caller_create(); // create new stack stitching id
2209       // before entering fork barrier
2210     }
2211 #endif /* USE_ITT_BUILD */
2212 
    // AC: skip __kmp_internal_fork at the teams construct; let only the
    // master threads execute
2215     if (ap) {
2216       __kmp_internal_fork(loc, gtid, team);
2217       KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2218                     "master_th=%p, gtid=%d\n",
2219                     root, team, master_th, gtid));
2220     }
2221 
2222     if (call_context == fork_context_gnu) {
2223       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2224       return TRUE;
2225     }
2226 
2227     /* Invoke microtask for MASTER thread */
2228     KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2229                   team->t.t_id, team->t.t_pkfn));
2230   } // END of timer KMP_fork_call block
2231 
2232 #if KMP_STATS_ENABLED
2233   // If beginning a teams construct, then change thread state
2234   stats_state_e previous_state = KMP_GET_THREAD_STATE();
2235   if (!ap) {
2236     KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2237   }
2238 #endif
2239 
2240   if (!team->t.t_invoke(gtid)) {
2241     KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
2242   }
2243 
2244 #if KMP_STATS_ENABLED
  // If this was the beginning of a teams construct, reset the thread state
2246   if (!ap) {
2247     KMP_SET_THREAD_STATE(previous_state);
2248   }
2249 #endif
2250 
2251   KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2252                 team->t.t_id, team->t.t_pkfn));
2253   KMP_MB(); /* Flush all pending memory write invalidates.  */
2254 
2255   KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2256 
2257 #if OMPT_SUPPORT
2258   if (ompt_enabled.enabled) {
2259     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2260   }
2261 #endif
2262 
2263   return TRUE;
2264 }
2265 
2266 #if OMPT_SUPPORT
2267 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2268                                             kmp_team_t *team) {
2269   // restore state outside the region
2270   thread->th.ompt_thread_info.state =
2271       ((team->t.t_serialized) ? ompt_state_work_serial
2272                               : ompt_state_work_parallel);
2273 }
2274 
2275 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2276                                    kmp_team_t *team, ompt_data_t *parallel_data,
2277                                    int flags, void *codeptr) {
2278   ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2279   if (ompt_enabled.ompt_callback_parallel_end) {
2280     ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2281         parallel_data, &(task_info->task_data), flags, codeptr);
2282   }
2283 
2284   task_info->frame.enter_frame = ompt_data_none;
2285   __kmp_join_restore_state(thread, team);
2286 }
2287 #endif
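
/* Join at the end of a parallel region: handle the serialized and
   teams-construct cases, otherwise wait for the workers at the join barrier,
   report OMPT/ITT events, free the team, and restore the master thread's
   state from the parent team. */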
2288 
2289 void __kmp_join_call(ident_t *loc, int gtid
2290 #if OMPT_SUPPORT
2291                      ,
2292                      enum fork_context_e fork_context
2293 #endif
2294                      ,
2295                      int exit_teams) {
2296   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2297   kmp_team_t *team;
2298   kmp_team_t *parent_team;
2299   kmp_info_t *master_th;
2300   kmp_root_t *root;
2301   int master_active;
2302 
2303   KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2304 
2305   /* setup current data */
2306   master_th = __kmp_threads[gtid];
2307   root = master_th->th.th_root;
2308   team = master_th->th.th_team;
2309   parent_team = team->t.t_parent;
2310 
2311   master_th->th.th_ident = loc;
2312 
2313 #if OMPT_SUPPORT
2314   void *team_microtask = (void *)team->t.t_pkfn;
  // For the GOMP interface with a serialized parallel region, we need
  // __kmpc_end_serialized_parallel to call the hooks for the OMPT
  // end-implicit-task and end-parallel events.
2318   if (ompt_enabled.enabled &&
2319       !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2320     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2321   }
2322 #endif
2323 
2324 #if KMP_DEBUG
2325   if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2326     KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2327                   "th_task_team = %p\n",
2328                   __kmp_gtid_from_thread(master_th), team,
2329                   team->t.t_task_team[master_th->th.th_task_state],
2330                   master_th->th.th_task_team));
2331     KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2332                      team->t.t_task_team[master_th->th.th_task_state]);
2333   }
2334 #endif
2335 
2336   if (team->t.t_serialized) {
2337     if (master_th->th.th_teams_microtask) {
2338       // We are in teams construct
2339       int level = team->t.t_level;
2340       int tlevel = master_th->th.th_teams_level;
2341       if (level == tlevel) {
2342         // AC: we haven't incremented it earlier at start of teams construct,
2343         //     so do it here - at the end of teams construct
2344         team->t.t_level++;
2345       } else if (level == tlevel + 1) {
2346         // AC: we are exiting parallel inside teams, need to increment
2347         // serialization in order to restore it in the next call to
2348         // __kmpc_end_serialized_parallel
2349         team->t.t_serialized++;
2350       }
2351     }
2352     __kmpc_end_serialized_parallel(loc, gtid);
2353 
2354 #if OMPT_SUPPORT
2355     if (ompt_enabled.enabled) {
2356       __kmp_join_restore_state(master_th, parent_team);
2357     }
2358 #endif
2359 
2360     return;
2361   }
2362 
2363   master_active = team->t.t_master_active;
2364 
2365   if (!exit_teams) {
2366     // AC: No barrier for internal teams at exit from teams construct.
2367     //     But there is barrier for external team (league).
2368     __kmp_internal_join(loc, gtid, team);
2369   } else {
2370     master_th->th.th_task_state =
2371         0; // AC: no tasking in teams (out of any parallel)
2372   }
2373 
2374   KMP_MB();
2375 
2376 #if OMPT_SUPPORT
2377   ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2378   void *codeptr = team->t.ompt_team_info.master_return_address;
2379 #endif
2380 
2381 #if USE_ITT_BUILD
2382   if (__itt_stack_caller_create_ptr) {
2383     // destroy the stack stitching id after join barrier
2384     __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2385   }
2386   // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2387   if (team->t.t_active_level == 1 &&
2388       (!master_th->th.th_teams_microtask || /* not in teams construct */
2389        master_th->th.th_teams_size.nteams == 1)) {
2390     master_th->th.th_ident = loc;
2391     // only one notification scheme (either "submit" or "forking/joined", not
2392     // both)
2393     if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2394         __kmp_forkjoin_frames_mode == 3)
2395       __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2396                              master_th->th.th_frame_time, 0, loc,
2397                              master_th->th.th_team_nproc, 1);
2398     else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2399              !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2400       __kmp_itt_region_joined(gtid);
2401   } // active_level == 1
2402 #endif /* USE_ITT_BUILD */
2403 
2404   if (master_th->th.th_teams_microtask && !exit_teams &&
2405       team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2406       team->t.t_level == master_th->th.th_teams_level + 1) {
// AC: We need to leave the team structure intact at the end of a parallel
// region inside the teams construct, so that the same (hot) team works at the
// next parallel region; only adjust the nesting levels.
2410 #if OMPT_SUPPORT
2411     ompt_data_t ompt_parallel_data = ompt_data_none;
2412     if (ompt_enabled.enabled) {
2413       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2414       if (ompt_enabled.ompt_callback_implicit_task) {
2415         int ompt_team_size = team->t.t_nproc;
2416         ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2417             ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2418             OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2419       }
2420       task_info->frame.exit_frame = ompt_data_none;
2421       task_info->task_data = ompt_data_none;
2422       ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2423       __ompt_lw_taskteam_unlink(master_th);
2424     }
2425 #endif
2426     /* Decrement our nested depth level */
2427     team->t.t_level--;
2428     team->t.t_active_level--;
2429     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2430 
2431     // Restore number of threads in the team if needed. This code relies on
2432     // the proper adjustment of th_teams_size.nth after the fork in
2433     // __kmp_teams_master on each teams master in the case that
2434     // __kmp_reserve_threads reduced it.
2435     if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2436       int old_num = master_th->th.th_team_nproc;
2437       int new_num = master_th->th.th_teams_size.nth;
2438       kmp_info_t **other_threads = team->t.t_threads;
2439       team->t.t_nproc = new_num;
2440       for (int i = 0; i < old_num; ++i) {
2441         other_threads[i]->th.th_team_nproc = new_num;
2442       }
2443       // Adjust states of non-used threads of the team
2444       for (int i = old_num; i < new_num; ++i) {
2445         // Re-initialize thread's barrier data.
2446         KMP_DEBUG_ASSERT(other_threads[i]);
2447         kmp_balign_t *balign = other_threads[i]->th.th_bar;
2448         for (int b = 0; b < bs_last_barrier; ++b) {
2449           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2450           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2451 #if USE_DEBUGGER
2452           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2453 #endif
2454         }
2455         if (__kmp_tasking_mode != tskm_immediate_exec) {
2456           // Synchronize thread's task state
2457           other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2458         }
2459       }
2460     }
2461 
2462 #if OMPT_SUPPORT
2463     if (ompt_enabled.enabled) {
2464       __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2465                       OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2466     }
2467 #endif
2468 
2469     return;
2470   }
2471 
2472   /* do cleanup and restore the parent team */
2473   master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2474   master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2475 
2476   master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2477 
2478   /* jc: The following lock has instructions with REL and ACQ semantics,
2479      separating the parallel user code called in this parallel region
2480      from the serial user code called after this function returns. */
2481   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2482 
2483   if (!master_th->th.th_teams_microtask ||
2484       team->t.t_level > master_th->th.th_teams_level) {
2485     /* Decrement our nested depth level */
2486     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2487   }
2488   KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2489 
2490 #if OMPT_SUPPORT
2491   if (ompt_enabled.enabled) {
2492     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2493     if (ompt_enabled.ompt_callback_implicit_task) {
2494       int flags = (team_microtask == (void *)__kmp_teams_master)
2495                       ? ompt_task_initial
2496                       : ompt_task_implicit;
2497       int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2498       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2499           ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2500           OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2501     }
2502     task_info->frame.exit_frame = ompt_data_none;
2503     task_info->task_data = ompt_data_none;
2504   }
2505 #endif
2506 
2507   KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2508                 master_th, team));
2509   __kmp_pop_current_task_from_thread(master_th);
2510 
2511 #if KMP_AFFINITY_SUPPORTED
2512   // Restore master thread's partition.
2513   master_th->th.th_first_place = team->t.t_first_place;
2514   master_th->th.th_last_place = team->t.t_last_place;
2515 #endif // KMP_AFFINITY_SUPPORTED
2516   master_th->th.th_def_allocator = team->t.t_def_allocator;
2517 
2518   updateHWFPControl(team);
2519 
2520   if (root->r.r_active != master_active)
2521     root->r.r_active = master_active;
2522 
2523   __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2524                             master_th)); // this will free worker threads
2525 
  /* This race was fun to find. Make sure the following assignments stay
     inside the critical region; otherwise assertions may fail occasionally,
     since the old team may be reallocated and the hierarchy would appear
     inconsistent. It is actually safe to run outside the critical region and
     won't cause any bugs, but it will cause those assertion failures. It's
     only one deref & assign, so we might as well put this in the critical
     region. */
2531   master_th->th.th_team = parent_team;
2532   master_th->th.th_team_nproc = parent_team->t.t_nproc;
2533   master_th->th.th_team_master = parent_team->t.t_threads[0];
2534   master_th->th.th_team_serialized = parent_team->t.t_serialized;
2535 
2536   /* restore serialized team, if need be */
2537   if (parent_team->t.t_serialized &&
2538       parent_team != master_th->th.th_serial_team &&
2539       parent_team != root->r.r_root_team) {
2540     __kmp_free_team(root,
2541                     master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2542     master_th->th.th_serial_team = parent_team;
2543   }
2544 
2545   if (__kmp_tasking_mode != tskm_immediate_exec) {
2546     if (master_th->th.th_task_state_top >
2547         0) { // Restore task state from memo stack
2548       KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2549       // Remember master's state if we re-use this nested hot team
2550       master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2551           master_th->th.th_task_state;
2552       --master_th->th.th_task_state_top; // pop
2553       // Now restore state at this level
2554       master_th->th.th_task_state =
2555           master_th->th
2556               .th_task_state_memo_stack[master_th->th.th_task_state_top];
2557     }
2558     // Copy the task team from the parent team to the master thread
2559     master_th->th.th_task_team =
2560         parent_team->t.t_task_team[master_th->th.th_task_state];
2561     KA_TRACE(20,
2562              ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2563               __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2564               parent_team));
2565   }
2566 
2567   // TODO: GEH - cannot do this assertion because root thread not set up as
2568   // executing
2569   // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2570   master_th->th.th_current_task->td_flags.executing = 1;
2571 
2572   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2573 
2574 #if OMPT_SUPPORT
2575   int flags =
2576       OMPT_INVOKER(fork_context) |
2577       ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2578                                                       : ompt_parallel_team);
2579   if (ompt_enabled.enabled) {
2580     __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2581                     codeptr);
2582   }
2583 #endif
2584 
2585   KMP_MB();
2586   KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2587 }
2588 
2589 /* Check whether we should push an internal control record onto the
2590    serial team stack.  If so, do it.  */
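/* At most one record is kept per serialization level; callers such as
   __kmp_set_num_threads() below call this before modifying an ICV so that the
   previous values can be restored when the serialized region ends. */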
2591 void __kmp_save_internal_controls(kmp_info_t *thread) {
2592 
2593   if (thread->th.th_team != thread->th.th_serial_team) {
2594     return;
2595   }
2596   if (thread->th.th_team->t.t_serialized > 1) {
2597     int push = 0;
2598 
2599     if (thread->th.th_team->t.t_control_stack_top == NULL) {
2600       push = 1;
2601     } else {
2602       if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2603           thread->th.th_team->t.t_serialized) {
2604         push = 1;
2605       }
2606     }
2607     if (push) { /* push a record on the serial team's stack */
2608       kmp_internal_control_t *control =
2609           (kmp_internal_control_t *)__kmp_allocate(
2610               sizeof(kmp_internal_control_t));
2611 
2612       copy_icvs(control, &thread->th.th_current_task->td_icvs);
2613 
2614       control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2615 
2616       control->next = thread->th.th_team->t.t_control_stack_top;
2617       thread->th.th_team->t.t_control_stack_top = control;
2618     }
2619   }
2620 }
2621 
2622 /* Changes set_nproc */
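/* Hedged usage sketch (names assumed for illustration): the user-facing
   omp_set_num_threads() entry point is expected to forward here roughly as
     void omp_set_num_threads(int num_threads) {
       __kmp_set_num_threads(num_threads, __kmp_entry_gtid());
     }
   with gtid identifying the calling thread. */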
2623 void __kmp_set_num_threads(int new_nth, int gtid) {
2624   kmp_info_t *thread;
2625   kmp_root_t *root;
2626 
2627   KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2628   KMP_DEBUG_ASSERT(__kmp_init_serial);
2629 
2630   if (new_nth < 1)
2631     new_nth = 1;
2632   else if (new_nth > __kmp_max_nth)
2633     new_nth = __kmp_max_nth;
2634 
2635   KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2636   thread = __kmp_threads[gtid];
2637   if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2638     return; // nothing to do
2639 
2640   __kmp_save_internal_controls(thread);
2641 
2642   set__nproc(thread, new_nth);
2643 
2644   // If this omp_set_num_threads() call will cause the hot team size to be
2645   // reduced (in the absence of a num_threads clause), then reduce it now,
2646   // rather than waiting for the next parallel region.
2647   root = thread->th.th_root;
2648   if (__kmp_init_parallel && (!root->r.r_active) &&
2649       (root->r.r_hot_team->t.t_nproc > new_nth)
2650 #if KMP_NESTED_HOT_TEAMS
2651       && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2652 #endif
2653       ) {
2654     kmp_team_t *hot_team = root->r.r_hot_team;
2655     int f;
2656 
2657     __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2658 
2659     // Release the extra threads we don't need any more.
2660     for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2661       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2662       if (__kmp_tasking_mode != tskm_immediate_exec) {
2663         // When decreasing team size, threads no longer in the team should unref
2664         // task team.
2665         hot_team->t.t_threads[f]->th.th_task_team = NULL;
2666       }
2667       __kmp_free_thread(hot_team->t.t_threads[f]);
2668       hot_team->t.t_threads[f] = NULL;
2669     }
2670     hot_team->t.t_nproc = new_nth;
2671 #if KMP_NESTED_HOT_TEAMS
2672     if (thread->th.th_hot_teams) {
2673       KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2674       thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2675     }
2676 #endif
2677 
2678     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2679 
2680     // Update the t_nproc field in the threads that are still active.
2681     for (f = 0; f < new_nth; f++) {
2682       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2683       hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2684     }
    // Special flag set to record that the hot team size was changed by an
    // omp_set_num_threads() call.
2686     hot_team->t.t_size_changed = -1;
2687   }
2688 }
2689 
2690 /* Changes max_active_levels */
2691 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2692   kmp_info_t *thread;
2693 
2694   KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2695                 "%d = (%d)\n",
2696                 gtid, max_active_levels));
2697   KMP_DEBUG_ASSERT(__kmp_init_serial);
2698 
2699   // validate max_active_levels
2700   if (max_active_levels < 0) {
2701     KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2702     // We ignore this call if the user has specified a negative value.
2703     // The current setting won't be changed. The last valid setting will be
2704     // used. A warning will be issued (if warnings are allowed as controlled by
2705     // the KMP_WARNINGS env var).
2706     KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2707                   "max_active_levels for thread %d = (%d)\n",
2708                   gtid, max_active_levels));
2709     return;
2710   }
2711   if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
    // OK: max_active_levels is within the valid range
    // [0; KMP_MAX_ACTIVE_LEVELS_LIMIT].
    // A zero value is allowed (implementation-defined behavior).
2715   } else {
2716     KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2717                 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2718     max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
    // The current upper limit is MAX_INT (implementation-defined behavior).
    // If the input exceeds the upper limit, it is clipped to that limit
    // (implementation-defined behavior). In practice this branch should never
    // be reached while the upper limit is MAX_INT.
2723   }
2724   KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2725                 "max_active_levels for thread %d = (%d)\n",
2726                 gtid, max_active_levels));
2727 
2728   thread = __kmp_threads[gtid];
2729 
2730   __kmp_save_internal_controls(thread);
2731 
2732   set__max_active_levels(thread, max_active_levels);
2733 }
2734 
2735 /* Gets max_active_levels */
2736 int __kmp_get_max_active_levels(int gtid) {
2737   kmp_info_t *thread;
2738 
2739   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2740   KMP_DEBUG_ASSERT(__kmp_init_serial);
2741 
2742   thread = __kmp_threads[gtid];
2743   KMP_DEBUG_ASSERT(thread->th.th_current_task);
2744   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2745                 "curtask_maxaclevel=%d\n",
2746                 gtid, thread->th.th_current_task,
2747                 thread->th.th_current_task->td_icvs.max_active_levels));
2748   return thread->th.th_current_task->td_icvs.max_active_levels;
2749 }
2750 
2751 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2752 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2753 
2754 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2755 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2756   kmp_info_t *thread;
2757   kmp_sched_t orig_kind;
2758   //    kmp_team_t *team;
2759 
2760   KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2761                 gtid, (int)kind, chunk));
2762   KMP_DEBUG_ASSERT(__kmp_init_serial);
2763 
2764   // Check if the kind parameter is valid, correct if needed.
2765   // Valid parameters should fit in one of two intervals - standard or extended:
2766   //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2767   // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
2768   orig_kind = kind;
2769   kind = __kmp_sched_without_mods(kind);
2770 
2771   if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2772       (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2773     // TODO: Hint needs attention in case we change the default schedule.
2774     __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2775               KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2776               __kmp_msg_null);
2777     kind = kmp_sched_default;
2778     chunk = 0; // ignore chunk value in case of bad kind
2779   }
2780 
2781   thread = __kmp_threads[gtid];
2782 
2783   __kmp_save_internal_controls(thread);
2784 
2785   if (kind < kmp_sched_upper_std) {
2786     if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
      // Differentiate static chunked vs. unchunked: the chunk should be
      // invalid to indicate an unchunked schedule (which is the default).
2789       thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2790     } else {
2791       thread->th.th_current_task->td_icvs.sched.r_sched_type =
2792           __kmp_sch_map[kind - kmp_sched_lower - 1];
2793     }
2794   } else {
2795     //    __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2796     //    kmp_sched_lower - 2 ];
2797     thread->th.th_current_task->td_icvs.sched.r_sched_type =
2798         __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2799                       kmp_sched_lower - 2];
2800   }
2801   __kmp_sched_apply_mods_intkind(
2802       orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2803   if (kind == kmp_sched_auto || chunk < 1) {
2804     // ignore parameter chunk for schedule auto
2805     thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2806   } else {
2807     thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2808   }
2809 }
2810 
2811 /* Gets def_sched_var ICV values */
2812 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2813   kmp_info_t *thread;
2814   enum sched_type th_type;
2815 
2816   KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2817   KMP_DEBUG_ASSERT(__kmp_init_serial);
2818 
2819   thread = __kmp_threads[gtid];
2820 
2821   th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2822   switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2823   case kmp_sch_static:
2824   case kmp_sch_static_greedy:
2825   case kmp_sch_static_balanced:
2826     *kind = kmp_sched_static;
2827     __kmp_sched_apply_mods_stdkind(kind, th_type);
    *chunk = 0; // chunk was not set; indicate this with a zero value
2829     return;
2830   case kmp_sch_static_chunked:
2831     *kind = kmp_sched_static;
2832     break;
2833   case kmp_sch_dynamic_chunked:
2834     *kind = kmp_sched_dynamic;
2835     break;
2836   case kmp_sch_guided_chunked:
2837   case kmp_sch_guided_iterative_chunked:
2838   case kmp_sch_guided_analytical_chunked:
2839     *kind = kmp_sched_guided;
2840     break;
2841   case kmp_sch_auto:
2842     *kind = kmp_sched_auto;
2843     break;
2844   case kmp_sch_trapezoidal:
2845     *kind = kmp_sched_trapezoidal;
2846     break;
2847 #if KMP_STATIC_STEAL_ENABLED
2848   case kmp_sch_static_steal:
2849     *kind = kmp_sched_static_steal;
2850     break;
2851 #endif
2852   default:
2853     KMP_FATAL(UnknownSchedulingType, th_type);
2854   }
2855 
2856   __kmp_sched_apply_mods_stdkind(kind, th_type);
2857   *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2858 }
2859 
2860 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2861 
2862   int ii, dd;
2863   kmp_team_t *team;
2864   kmp_info_t *thr;
2865 
2866   KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2867   KMP_DEBUG_ASSERT(__kmp_init_serial);
2868 
2869   // validate level
2870   if (level == 0)
2871     return 0;
2872   if (level < 0)
2873     return -1;
2874   thr = __kmp_threads[gtid];
2875   team = thr->th.th_team;
2876   ii = team->t.t_level;
2877   if (level > ii)
2878     return -1;
2879 
2880   if (thr->th.th_teams_microtask) {
    // AC: we are in a teams region where multiple nested teams have the same
    // level
    int tlevel = thr->th.th_teams_level; // the level of the teams construct
    // If level > tlevel, the usual algorithm below works and does not touch
    // the teams.
    if (level <= tlevel) {
      KMP_DEBUG_ASSERT(ii >= tlevel);
      // AC: As we need to pass by the teams league, we need to artificially
      // increase ii
      if (ii == tlevel) {
        ii += 2; // three teams have the same level
      } else {
        ii++; // two teams have the same level
      }
2893     }
2894   }
2895 
2896   if (ii == level)
2897     return __kmp_tid_from_gtid(gtid);
2898 
2899   dd = team->t.t_serialized;
2900   level++;
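  // Walk up the team tree toward the requested level, first consuming any
  // serialized levels of the current team, then moving to its parent.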
2901   while (ii > level) {
2902     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2903     }
2904     if ((team->t.t_serialized) && (!dd)) {
2905       team = team->t.t_parent;
2906       continue;
2907     }
2908     if (ii > level) {
2909       team = team->t.t_parent;
2910       dd = team->t.t_serialized;
2911       ii--;
2912     }
2913   }
2914 
2915   return (dd > 1) ? (0) : (team->t.t_master_tid);
2916 }
2917 
2918 int __kmp_get_team_size(int gtid, int level) {
2919 
2920   int ii, dd;
2921   kmp_team_t *team;
2922   kmp_info_t *thr;
2923 
2924   KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2925   KMP_DEBUG_ASSERT(__kmp_init_serial);
2926 
2927   // validate level
2928   if (level == 0)
2929     return 1;
2930   if (level < 0)
2931     return -1;
2932   thr = __kmp_threads[gtid];
2933   team = thr->th.th_team;
2934   ii = team->t.t_level;
2935   if (level > ii)
2936     return -1;
2937 
2938   if (thr->th.th_teams_microtask) {
    // AC: we are in a teams region where multiple nested teams have the same
    // level
    int tlevel = thr->th.th_teams_level; // the level of the teams construct
    // If level > tlevel, the usual algorithm below works and does not touch
    // the teams.
    if (level <= tlevel) {
      KMP_DEBUG_ASSERT(ii >= tlevel);
      // AC: As we need to pass by the teams league, we need to artificially
      // increase ii
      if (ii == tlevel) {
        ii += 2; // three teams have the same level
      } else {
        ii++; // two teams have the same level
      }
2951     }
2952   }
2953 
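  // Walk up the team tree toward the requested level, consuming serialized
  // levels of each team before moving to its parent.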
2954   while (ii > level) {
2955     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2956     }
2957     if (team->t.t_serialized && (!dd)) {
2958       team = team->t.t_parent;
2959       continue;
2960     }
2961     if (ii > level) {
2962       team = team->t.t_parent;
2963       ii--;
2964     }
2965   }
2966 
2967   return team->t.t_nproc;
2968 }
2969 
2970 kmp_r_sched_t __kmp_get_schedule_global() {
  // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
  // independently, so the updated schedule can be obtained here.
2974 
2975   kmp_r_sched_t r_sched;
2976 
2977   // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
2978   // __kmp_guided. __kmp_sched should keep original value, so that user can set
2979   // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
2980   // different roots (even in OMP 2.5)
2981   enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
2982   enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
2983   if (s == kmp_sch_static) {
2984     // replace STATIC with more detailed schedule (balanced or greedy)
2985     r_sched.r_sched_type = __kmp_static;
2986   } else if (s == kmp_sch_guided_chunked) {
2987     // replace GUIDED with more detailed schedule (iterative or analytical)
2988     r_sched.r_sched_type = __kmp_guided;
2989   } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2990     r_sched.r_sched_type = __kmp_sched;
2991   }
2992   SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
2993 
2994   if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
2995     // __kmp_chunk may be wrong here (if it was not ever set)
2996     r_sched.chunk = KMP_DEFAULT_CHUNK;
2997   } else {
2998     r_sched.chunk = __kmp_chunk;
2999   }
3000 
3001   return r_sched;
3002 }
3003 
/* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
   at least argc *t_argv entries for the requested team. */
3006 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3007 
3008   KMP_DEBUG_ASSERT(team);
3009   if (!realloc || argc > team->t.t_max_argc) {
3010 
3011     KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3012                    "current entries=%d\n",
3013                    team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3014     /* if previously allocated heap space for args, free them */
3015     if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3016       __kmp_free((void *)team->t.t_argv);
3017 
3018     if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3019       /* use unused space in the cache line for arguments */
3020       team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3021       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3022                      "argv entries\n",
3023                      team->t.t_id, team->t.t_max_argc));
3024       team->t.t_argv = &team->t.t_inline_argv[0];
3025       if (__kmp_storage_map) {
3026         __kmp_print_storage_map_gtid(
3027             -1, &team->t.t_inline_argv[0],
3028             &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3029             (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3030             team->t.t_id);
3031       }
3032     } else {
3033       /* allocate space for arguments in the heap */
3034       team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3035                                ? KMP_MIN_MALLOC_ARGV_ENTRIES
3036                                : 2 * argc;
3037       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3038                      "argv entries\n",
3039                      team->t.t_id, team->t.t_max_argc));
3040       team->t.t_argv =
3041           (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3042       if (__kmp_storage_map) {
3043         __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3044                                      &team->t.t_argv[team->t.t_max_argc],
3045                                      sizeof(void *) * team->t.t_max_argc,
3046                                      "team_%d.t_argv", team->t.t_id);
3047       }
3048     }
3049   }
3050 }
3051 
3052 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3053   int i;
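  // A team that can hold at most one thread needs only 2 dispatch buffers;
  // otherwise allocate the configured number of buffers.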
3054   int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3055   team->t.t_threads =
3056       (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3057   team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3058       sizeof(dispatch_shared_info_t) * num_disp_buff);
3059   team->t.t_dispatch =
3060       (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3061   team->t.t_implicit_task_taskdata =
3062       (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3063   team->t.t_max_nproc = max_nth;
3064 
3065   /* setup dispatch buffers */
3066   for (i = 0; i < num_disp_buff; ++i) {
3067     team->t.t_disp_buffer[i].buffer_index = i;
3068     team->t.t_disp_buffer[i].doacross_buf_idx = i;
3069   }
3070 }
3071 
3072 static void __kmp_free_team_arrays(kmp_team_t *team) {
3073   /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3074   int i;
3075   for (i = 0; i < team->t.t_max_nproc; ++i) {
3076     if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3077       __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3078       team->t.t_dispatch[i].th_disp_buffer = NULL;
3079     }
3080   }
3081 #if KMP_USE_HIER_SCHED
3082   __kmp_dispatch_free_hierarchies(team);
3083 #endif
3084   __kmp_free(team->t.t_threads);
3085   __kmp_free(team->t.t_disp_buffer);
3086   __kmp_free(team->t.t_dispatch);
3087   __kmp_free(team->t.t_implicit_task_taskdata);
3088   team->t.t_threads = NULL;
3089   team->t.t_disp_buffer = NULL;
3090   team->t.t_dispatch = NULL;
  team->t.t_implicit_task_taskdata = NULL;
3092 }
3093 
3094 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3095   kmp_info_t **oldThreads = team->t.t_threads;
3096 
3097   __kmp_free(team->t.t_disp_buffer);
3098   __kmp_free(team->t.t_dispatch);
3099   __kmp_free(team->t.t_implicit_task_taskdata);
3100   __kmp_allocate_team_arrays(team, max_nth);
3101 
3102   KMP_MEMCPY(team->t.t_threads, oldThreads,
3103              team->t.t_nproc * sizeof(kmp_info_t *));
3104 
3105   __kmp_free(oldThreads);
3106 }
3107 
3108 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3109 
3110   kmp_r_sched_t r_sched =
3111       __kmp_get_schedule_global(); // get current state of scheduling globals
3112 
3113   KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3114 
3115   kmp_internal_control_t g_icvs = {
3116     0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3117     (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3118     // adjustment of threads (per thread)
3119     (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3120     // whether blocktime is explicitly set
3121     __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3122 #if KMP_USE_MONITOR
3123     __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3124 // intervals
3125 #endif
3126     __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3127     // next parallel region (per thread)
3128     // (use a max ub on value if __kmp_parallel_initialize not called yet)
3129     __kmp_cg_max_nth, // int thread_limit;
3130     __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3131     // for max_active_levels
3132     r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3133     // {sched,chunk} pair
3134     __kmp_nested_proc_bind.bind_types[0],
3135     __kmp_default_device,
3136     NULL // struct kmp_internal_control *next;
3137   };
3138 
3139   return g_icvs;
3140 }
3141 
3142 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3143 
3144   kmp_internal_control_t gx_icvs;
  // Note: probably = team->t.t_serialized, as in __kmp_save_internal_controls.
  gx_icvs.serial_nesting_level = 0;
3147   copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3148   gx_icvs.next = NULL;
3149 
3150   return gx_icvs;
3151 }
3152 
3153 static void __kmp_initialize_root(kmp_root_t *root) {
3154   int f;
3155   kmp_team_t *root_team;
3156   kmp_team_t *hot_team;
3157   int hot_team_max_nth;
3158   kmp_r_sched_t r_sched =
3159       __kmp_get_schedule_global(); // get current state of scheduling globals
3160   kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3161   KMP_DEBUG_ASSERT(root);
3162   KMP_ASSERT(!root->r.r_begin);
3163 
3164   /* setup the root state structure */
3165   __kmp_init_lock(&root->r.r_begin_lock);
3166   root->r.r_begin = FALSE;
3167   root->r.r_active = FALSE;
3168   root->r.r_in_parallel = 0;
3169   root->r.r_blocktime = __kmp_dflt_blocktime;
3170 
3171   /* setup the root team for this task */
3172   /* allocate the root team structure */
3173   KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3174 
3175   root_team =
3176       __kmp_allocate_team(root,
3177                           1, // new_nproc
3178                           1, // max_nproc
3179 #if OMPT_SUPPORT
3180                           ompt_data_none, // root parallel id
3181 #endif
3182                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3183                           0 // argc
3184                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3185                           );
3186 #if USE_DEBUGGER
3187   // Non-NULL value should be assigned to make the debugger display the root
3188   // team.
3189   TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3190 #endif
3191 
3192   KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3193 
3194   root->r.r_root_team = root_team;
3195   root_team->t.t_control_stack_top = NULL;
3196 
3197   /* initialize root team */
3198   root_team->t.t_threads[0] = NULL;
3199   root_team->t.t_nproc = 1;
3200   root_team->t.t_serialized = 1;
3201   // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3202   root_team->t.t_sched.sched = r_sched.sched;
3203   KA_TRACE(
3204       20,
3205       ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3206        root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3207 
3208   /* setup the  hot team for this task */
3209   /* allocate the hot team structure */
3210   KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3211 
3212   hot_team =
3213       __kmp_allocate_team(root,
3214                           1, // new_nproc
3215                           __kmp_dflt_team_nth_ub * 2, // max_nproc
3216 #if OMPT_SUPPORT
3217                           ompt_data_none, // root parallel id
3218 #endif
3219                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3220                           0 // argc
3221                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3222                           );
3223   KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3224 
3225   root->r.r_hot_team = hot_team;
3226   root_team->t.t_control_stack_top = NULL;
3227 
3228   /* first-time initialization */
3229   hot_team->t.t_parent = root_team;
3230 
3231   /* initialize hot team */
3232   hot_team_max_nth = hot_team->t.t_max_nproc;
3233   for (f = 0; f < hot_team_max_nth; ++f) {
3234     hot_team->t.t_threads[f] = NULL;
3235   }
3236   hot_team->t.t_nproc = 1;
3237   // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3238   hot_team->t.t_sched.sched = r_sched.sched;
3239   hot_team->t.t_size_changed = 0;
3240 }
3241 
3242 #ifdef KMP_DEBUG
3243 
3244 typedef struct kmp_team_list_item {
3245   kmp_team_p const *entry;
3246   struct kmp_team_list_item *next;
3247 } kmp_team_list_item_t;
3248 typedef kmp_team_list_item_t *kmp_team_list_t;
3249 
3250 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3251     kmp_team_list_t list, // List of teams.
3252     kmp_team_p const *team // Team to add.
3253     ) {
3254 
3255   // List must terminate with item where both entry and next are NULL.
3256   // Team is added to the list only once.
3257   // List is sorted in ascending order by team id.
3258   // Team id is *not* a key.
3259 
3260   kmp_team_list_t l;
3261 
3262   KMP_DEBUG_ASSERT(list != NULL);
3263   if (team == NULL) {
3264     return;
3265   }
3266 
3267   __kmp_print_structure_team_accum(list, team->t.t_parent);
3268   __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3269 
3270   // Search list for the team.
3271   l = list;
3272   while (l->next != NULL && l->entry != team) {
3273     l = l->next;
3274   }
3275   if (l->next != NULL) {
3276     return; // Team has been added before, exit.
3277   }
3278 
3279   // Team is not found. Search list again for insertion point.
3280   l = list;
3281   while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3282     l = l->next;
3283   }
3284 
3285   // Insert team.
3286   {
3287     kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3288         sizeof(kmp_team_list_item_t));
3289     *item = *l;
3290     l->entry = team;
3291     l->next = item;
3292   }
3293 }
3294 
static void __kmp_print_structure_team(char const *title,
                                       kmp_team_p const *team) {
3298   __kmp_printf("%s", title);
3299   if (team != NULL) {
3300     __kmp_printf("%2x %p\n", team->t.t_id, team);
3301   } else {
3302     __kmp_printf(" - (nil)\n");
3303   }
3304 }
3305 
3306 static void __kmp_print_structure_thread(char const *title,
3307                                          kmp_info_p const *thread) {
3308   __kmp_printf("%s", title);
3309   if (thread != NULL) {
3310     __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3311   } else {
3312     __kmp_printf(" - (nil)\n");
3313   }
3314 }
3315 
3316 void __kmp_print_structure(void) {
3317 
3318   kmp_team_list_t list;
3319 
3320   // Initialize list of teams.
3321   list =
3322       (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3323   list->entry = NULL;
3324   list->next = NULL;
3325 
3326   __kmp_printf("\n------------------------------\nGlobal Thread "
3327                "Table\n------------------------------\n");
3328   {
3329     int gtid;
3330     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3331       __kmp_printf("%2d", gtid);
3332       if (__kmp_threads != NULL) {
3333         __kmp_printf(" %p", __kmp_threads[gtid]);
3334       }
3335       if (__kmp_root != NULL) {
3336         __kmp_printf(" %p", __kmp_root[gtid]);
3337       }
3338       __kmp_printf("\n");
3339     }
3340   }
3341 
3342   // Print out __kmp_threads array.
3343   __kmp_printf("\n------------------------------\nThreads\n--------------------"
3344                "----------\n");
3345   if (__kmp_threads != NULL) {
3346     int gtid;
3347     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3348       kmp_info_t const *thread = __kmp_threads[gtid];
3349       if (thread != NULL) {
3350         __kmp_printf("GTID %2d %p:\n", gtid, thread);
3351         __kmp_printf("    Our Root:        %p\n", thread->th.th_root);
3352         __kmp_print_structure_team("    Our Team:     ", thread->th.th_team);
3353         __kmp_print_structure_team("    Serial Team:  ",
3354                                    thread->th.th_serial_team);
3355         __kmp_printf("    Threads:      %2d\n", thread->th.th_team_nproc);
3356         __kmp_print_structure_thread("    Master:       ",
3357                                      thread->th.th_team_master);
3358         __kmp_printf("    Serialized?:  %2d\n", thread->th.th_team_serialized);
3359         __kmp_printf("    Set NProc:    %2d\n", thread->th.th_set_nproc);
3360         __kmp_printf("    Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3361         __kmp_print_structure_thread("    Next in pool: ",
3362                                      thread->th.th_next_pool);
3363         __kmp_printf("\n");
3364         __kmp_print_structure_team_accum(list, thread->th.th_team);
3365         __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3366       }
3367     }
3368   } else {
3369     __kmp_printf("Threads array is not allocated.\n");
3370   }
3371 
3372   // Print out __kmp_root array.
3373   __kmp_printf("\n------------------------------\nUbers\n----------------------"
3374                "--------\n");
3375   if (__kmp_root != NULL) {
3376     int gtid;
3377     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3378       kmp_root_t const *root = __kmp_root[gtid];
3379       if (root != NULL) {
3380         __kmp_printf("GTID %2d %p:\n", gtid, root);
3381         __kmp_print_structure_team("    Root Team:    ", root->r.r_root_team);
3382         __kmp_print_structure_team("    Hot Team:     ", root->r.r_hot_team);
3383         __kmp_print_structure_thread("    Uber Thread:  ",
3384                                      root->r.r_uber_thread);
3385         __kmp_printf("    Active?:      %2d\n", root->r.r_active);
3386         __kmp_printf("    In Parallel:  %2d\n",
3387                      KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3388         __kmp_printf("\n");
3389         __kmp_print_structure_team_accum(list, root->r.r_root_team);
3390         __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3391       }
3392     }
3393   } else {
3394     __kmp_printf("Ubers array is not allocated.\n");
3395   }
3396 
3397   __kmp_printf("\n------------------------------\nTeams\n----------------------"
3398                "--------\n");
3399   while (list->next != NULL) {
3400     kmp_team_p const *team = list->entry;
3401     int i;
3402     __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3403     __kmp_print_structure_team("    Parent Team:      ", team->t.t_parent);
3404     __kmp_printf("    Master TID:       %2d\n", team->t.t_master_tid);
3405     __kmp_printf("    Max threads:      %2d\n", team->t.t_max_nproc);
3406     __kmp_printf("    Levels of serial: %2d\n", team->t.t_serialized);
3407     __kmp_printf("    Number threads:   %2d\n", team->t.t_nproc);
3408     for (i = 0; i < team->t.t_nproc; ++i) {
3409       __kmp_printf("    Thread %2d:      ", i);
3410       __kmp_print_structure_thread("", team->t.t_threads[i]);
3411     }
3412     __kmp_print_structure_team("    Next in pool:     ", team->t.t_next_pool);
3413     __kmp_printf("\n");
3414     list = list->next;
3415   }
3416 
3417   // Print out __kmp_thread_pool and __kmp_team_pool.
3418   __kmp_printf("\n------------------------------\nPools\n----------------------"
3419                "--------\n");
3420   __kmp_print_structure_thread("Thread pool:          ",
3421                                CCAST(kmp_info_t *, __kmp_thread_pool));
3422   __kmp_print_structure_team("Team pool:            ",
3423                              CCAST(kmp_team_t *, __kmp_team_pool));
3424   __kmp_printf("\n");
3425 
3426   // Free team list.
3427   while (list != NULL) {
3428     kmp_team_list_item_t *item = list;
3429     list = list->next;
3430     KMP_INTERNAL_FREE(item);
3431   }
3432 }
3433 
3434 #endif
3435 
3436 //---------------------------------------------------------------------------
3437 //  Stuff for per-thread fast random number generator
3438 //  Table of primes
3439 static const unsigned __kmp_primes[] = {
3440     0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3441     0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3442     0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3443     0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3444     0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3445     0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3446     0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3447     0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3448     0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3449     0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3450     0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3451 
3452 //---------------------------------------------------------------------------
3453 //  __kmp_get_random: Get a random number using a linear congruential method.
3454 unsigned short __kmp_get_random(kmp_info_t *thread) {
3455   unsigned x = thread->th.th_x;
3456   unsigned short r = x >> 16;
3457 
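  // Advance the per-thread linear congruential generator: x' = a * x + 1
  // (with unsigned wraparound); the high 16 bits of the previous state form
  // the result.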
3458   thread->th.th_x = x * thread->th.th_a + 1;
3459 
3460   KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3461                 thread->th.th_info.ds.ds_tid, r));
3462 
3463   return r;
3464 }
3465 //--------------------------------------------------------
3466 // __kmp_init_random: Initialize a random number generator
3467 void __kmp_init_random(kmp_info_t *thread) {
3468   unsigned seed = thread->th.th_info.ds.ds_tid;
3469 
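  // Pick a per-thread multiplier from the table of primes, keyed by the
  // thread id, so that different threads get different sequences.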
3470   thread->th.th_a =
3471       __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3472   thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3473   KA_TRACE(30,
3474            ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3475 }
3476 
3477 #if KMP_OS_WINDOWS
3478 /* reclaim array entries for root threads that are already dead, returns number
3479  * reclaimed */
3480 static int __kmp_reclaim_dead_roots(void) {
3481   int i, r = 0;
3482 
3483   for (i = 0; i < __kmp_threads_capacity; ++i) {
3484     if (KMP_UBER_GTID(i) &&
3485         !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3486         !__kmp_root[i]
3487              ->r.r_active) { // AC: reclaim only roots died in non-active state
3488       r += __kmp_unregister_root_other_thread(i);
3489     }
3490   }
3491   return r;
3492 }
3493 #endif
3494 
3495 /* This function attempts to create free entries in __kmp_threads and
3496    __kmp_root, and returns the number of free entries generated.
3497 
3498    For Windows* OS static library, the first mechanism used is to reclaim array
3499    entries for root threads that are already dead.
3500 
   On all platforms, expansion is attempted on the arrays __kmp_threads and
3502    __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3503    capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3504    threadprivate cache array has been created. Synchronization with
3505    __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3506 
3507    After any dead root reclamation, if the clipping value allows array expansion
3508    to result in the generation of a total of nNeed free slots, the function does
3509    that expansion. If not, nothing is done beyond the possible initial root
3510    thread reclamation.
3511 
3512    If any argument is negative, the behavior is undefined. */
3513 static int __kmp_expand_threads(int nNeed) {
3514   int added = 0;
3515   int minimumRequiredCapacity;
3516   int newCapacity;
3517   kmp_info_t **newThreads;
3518   kmp_root_t **newRoot;
3519 
3520 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3521 // resizing __kmp_threads does not need additional protection if foreign
3522 // threads are present
3523 
3524 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3525   /* only for Windows static library */
3526   /* reclaim array entries for root threads that are already dead */
3527   added = __kmp_reclaim_dead_roots();
3528 
3529   if (nNeed) {
3530     nNeed -= added;
3531     if (nNeed < 0)
3532       nNeed = 0;
3533   }
3534 #endif
3535   if (nNeed <= 0)
3536     return added;
3537 
3538   // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3539   // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3540   // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3541   // > __kmp_max_nth in one of two ways:
3542   //
3543   // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
3544   //    may not be reused by another thread, so we may need to increase
3545   //    __kmp_threads_capacity to __kmp_max_nth + 1.
3546   //
3547   // 2) New foreign root(s) are encountered.  We always register new foreign
3548   //    roots. This may cause a smaller # of threads to be allocated at
3549   //    subsequent parallel regions, but the worker threads hang around (and
3550   //    eventually go to sleep) and need slots in the __kmp_threads[] array.
3551   //
3552   // Anyway, that is the reason for moving the check to see if
3553   // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3554   // instead of having it performed here. -BB
3555 
3556   KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3557 
3558   /* compute expansion headroom to check if we can expand */
3559   if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3560     /* possible expansion too small -- give up */
3561     return added;
3562   }
3563   minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3564 
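  // Double the capacity (clipping at __kmp_sys_max_nth) until it meets the
  // required minimum.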
3565   newCapacity = __kmp_threads_capacity;
3566   do {
3567     newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3568                                                           : __kmp_sys_max_nth;
3569   } while (newCapacity < minimumRequiredCapacity);
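  // __kmp_threads and __kmp_root are carved out of a single allocation;
  // newRoot points just past the new threads array.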
3570   newThreads = (kmp_info_t **)__kmp_allocate(
3571       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3572   newRoot =
3573       (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3574   KMP_MEMCPY(newThreads, __kmp_threads,
3575              __kmp_threads_capacity * sizeof(kmp_info_t *));
3576   KMP_MEMCPY(newRoot, __kmp_root,
3577              __kmp_threads_capacity * sizeof(kmp_root_t *));
3578 
3579   kmp_info_t **temp_threads = __kmp_threads;
3580   *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3581   *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3582   __kmp_free(temp_threads);
3583   added += newCapacity - __kmp_threads_capacity;
3584   *(volatile int *)&__kmp_threads_capacity = newCapacity;
3585 
3586   if (newCapacity > __kmp_tp_capacity) {
3587     __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3588     if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3589       __kmp_threadprivate_resize_cache(newCapacity);
3590     } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3591       *(volatile int *)&__kmp_tp_capacity = newCapacity;
3592     }
3593     __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3594   }
3595 
3596   return added;
3597 }
3598 
3599 /* Register the current thread as a root thread and obtain our gtid. We must
   have the __kmp_initz_lock held at this point. Argument is TRUE only if we
   are the thread that calls from __kmp_do_serial_initialize(). */
3602 int __kmp_register_root(int initial_thread) {
3603   kmp_info_t *root_thread;
3604   kmp_root_t *root;
3605   int gtid;
3606   int capacity;
3607   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3608   KA_TRACE(20, ("__kmp_register_root: entered\n"));
3609   KMP_MB();
3610 
3611   /* 2007-03-02:
     If the initial thread did not invoke the OpenMP RTL yet, and this thread
     is not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity"
     condition does not work as expected -- it may return false (meaning there
     is at least one empty slot in the __kmp_threads array), but it is possible
     that the only free slot is #0, which is reserved for the initial thread
     and so cannot be used for this one. The following code works around this
     bug.

     However, the right solution seems to be not reserving slot #0 for the
     initial thread, because:
     (1) there is no magic in slot #0,
     (2) we cannot detect the initial thread reliably (the first thread which
         does serial initialization may not be a real initial thread).
3624   */
3625   capacity = __kmp_threads_capacity;
3626   if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3627     --capacity;
3628   }
3629 
3630   /* see if there are too many threads */
3631   if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3632     if (__kmp_tp_cached) {
3633       __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3634                   KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3635                   KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3636     } else {
3637       __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3638                   __kmp_msg_null);
3639     }
3640   }
3641 
3642   /* find an available thread slot */
  /* Don't reassign the zero slot since we need it to be used only by the
     initial thread */
3645   for (gtid = (initial_thread ? 0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL;
3646        gtid++)
3647     ;
3648   KA_TRACE(1,
3649            ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3650   KMP_ASSERT(gtid < __kmp_threads_capacity);
3651 
3652   /* update global accounting */
3653   __kmp_all_nth++;
3654   TCW_4(__kmp_nth, __kmp_nth + 1);
3655 
3656   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3657   // numbers of procs, and method #2 (keyed API call) for higher numbers.
3658   if (__kmp_adjust_gtid_mode) {
3659     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3660       if (TCR_4(__kmp_gtid_mode) != 2) {
3661         TCW_4(__kmp_gtid_mode, 2);
3662       }
3663     } else {
3664       if (TCR_4(__kmp_gtid_mode) != 1) {
3665         TCW_4(__kmp_gtid_mode, 1);
3666       }
3667     }
3668   }
3669 
3670 #ifdef KMP_ADJUST_BLOCKTIME
3671   /* Adjust blocktime to zero if necessary            */
3672   /* Middle initialization might not have occurred yet */
3673   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3674     if (__kmp_nth > __kmp_avail_proc) {
3675       __kmp_zero_bt = TRUE;
3676     }
3677   }
3678 #endif /* KMP_ADJUST_BLOCKTIME */
3679 
3680   /* setup this new hierarchy */
3681   if (!(root = __kmp_root[gtid])) {
3682     root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3683     KMP_DEBUG_ASSERT(!root->r.r_root_team);
3684   }
3685 
3686 #if KMP_STATS_ENABLED
3687   // Initialize stats as soon as possible (right after gtid assignment).
3688   __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3689   __kmp_stats_thread_ptr->startLife();
3690   KMP_SET_THREAD_STATE(SERIAL_REGION);
3691   KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3692 #endif
3693   __kmp_initialize_root(root);
3694 
3695   /* setup new root thread structure */
3696   if (root->r.r_uber_thread) {
3697     root_thread = root->r.r_uber_thread;
3698   } else {
3699     root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3700     if (__kmp_storage_map) {
3701       __kmp_print_thread_storage_map(root_thread, gtid);
3702     }
3703     root_thread->th.th_info.ds.ds_gtid = gtid;
3704 #if OMPT_SUPPORT
3705     root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3706 #endif
3707     root_thread->th.th_root = root;
3708     if (__kmp_env_consistency_check) {
3709       root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3710     }
3711 #if USE_FAST_MEMORY
3712     __kmp_initialize_fast_memory(root_thread);
3713 #endif /* USE_FAST_MEMORY */
3714 
3715 #if KMP_USE_BGET
3716     KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3717     __kmp_initialize_bget(root_thread);
3718 #endif
3719     __kmp_init_random(root_thread); // Initialize random number generator
3720   }
3721 
3722   /* setup the serial team held in reserve by the root thread */
3723   if (!root_thread->th.th_serial_team) {
3724     kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3725     KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3726     root_thread->th.th_serial_team = __kmp_allocate_team(
3727         root, 1, 1,
3728 #if OMPT_SUPPORT
3729         ompt_data_none, // root parallel id
3730 #endif
3731         proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3732   }
3733   KMP_ASSERT(root_thread->th.th_serial_team);
3734   KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3735                 root_thread->th.th_serial_team));
3736 
3737   /* drop root_thread into place */
3738   TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3739 
3740   root->r.r_root_team->t.t_threads[0] = root_thread;
3741   root->r.r_hot_team->t.t_threads[0] = root_thread;
3742   root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
  // AC: the team is created in reserve, not for execution (it is unused for
  // now).
3744   root_thread->th.th_serial_team->t.t_serialized = 0;
3745   root->r.r_uber_thread = root_thread;
3746 
3747   /* initialize the thread, get it ready to go */
3748   __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3749   TCW_4(__kmp_init_gtid, TRUE);
3750 
3751   /* prepare the master thread for get_gtid() */
3752   __kmp_gtid_set_specific(gtid);
3753 
3754 #if USE_ITT_BUILD
3755   __kmp_itt_thread_name(gtid);
3756 #endif /* USE_ITT_BUILD */
3757 
3758 #ifdef KMP_TDATA_GTID
3759   __kmp_gtid = gtid;
3760 #endif
3761   __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3762   KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3763 
3764   KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3765                 "plain=%u\n",
3766                 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3767                 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3768                 KMP_INIT_BARRIER_STATE));
3769   { // Initialize barrier data.
3770     int b;
3771     for (b = 0; b < bs_last_barrier; ++b) {
3772       root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3773 #if USE_DEBUGGER
3774       root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3775 #endif
3776     }
3777   }
3778   KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3779                    KMP_INIT_BARRIER_STATE);
3780 
3781 #if KMP_AFFINITY_SUPPORTED
3782   root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3783   root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3784   root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3785   root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3786   if (TCR_4(__kmp_init_middle)) {
3787     __kmp_affinity_set_init_mask(gtid, TRUE);
3788   }
3789 #endif /* KMP_AFFINITY_SUPPORTED */
3790   root_thread->th.th_def_allocator = __kmp_def_allocator;
3791   root_thread->th.th_prev_level = 0;
3792   root_thread->th.th_prev_num_threads = 1;
3793 
3794   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3795   tmp->cg_root = root_thread;
3796   tmp->cg_thread_limit = __kmp_cg_max_nth;
3797   tmp->cg_nthreads = 1;
3798   KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3799                  " cg_nthreads init to 1\n",
3800                  root_thread, tmp));
3801   tmp->up = NULL;
3802   root_thread->th.th_cg_roots = tmp;
3803 
3804   __kmp_root_counter++;
3805 
3806 #if OMPT_SUPPORT
3807   if (!initial_thread && ompt_enabled.enabled) {
3808 
3809     kmp_info_t *root_thread = ompt_get_thread();
3810 
3811     ompt_set_thread_state(root_thread, ompt_state_overhead);
3812 
3813     if (ompt_enabled.ompt_callback_thread_begin) {
3814       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3815           ompt_thread_initial, __ompt_get_thread_data_internal());
3816     }
3817     ompt_data_t *task_data;
3818     ompt_data_t *parallel_data;
    __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
                                  NULL);
3820     if (ompt_enabled.ompt_callback_implicit_task) {
3821       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3822           ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
3823     }
3824 
3825     ompt_set_thread_state(root_thread, ompt_state_work_serial);
3826   }
3827 #endif
3828 
3829   KMP_MB();
3830   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3831 
3832   return gtid;
3833 }
3834 
3835 #if KMP_NESTED_HOT_TEAMS
3836 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3837                                 const int max_level) {
3838   int i, n, nth;
3839   kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3840   if (!hot_teams || !hot_teams[level].hot_team) {
3841     return 0;
3842   }
3843   KMP_DEBUG_ASSERT(level < max_level);
3844   kmp_team_t *team = hot_teams[level].hot_team;
3845   nth = hot_teams[level].hot_team_nth;
3846   n = nth - 1; // master is not freed
3847   if (level < max_level - 1) {
3848     for (i = 0; i < nth; ++i) {
3849       kmp_info_t *th = team->t.t_threads[i];
3850       n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3851       if (i > 0 && th->th.th_hot_teams) {
3852         __kmp_free(th->th.th_hot_teams);
3853         th->th.th_hot_teams = NULL;
3854       }
3855     }
3856   }
3857   __kmp_free_team(root, team, NULL);
3858   return n;
3859 }
3860 #endif
3861 
// Resets a root thread and clears its root and hot teams.
3863 // Returns the number of __kmp_threads entries directly and indirectly freed.
3864 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3865   kmp_team_t *root_team = root->r.r_root_team;
3866   kmp_team_t *hot_team = root->r.r_hot_team;
3867   int n = hot_team->t.t_nproc;
3868   int i;
3869 
3870   KMP_DEBUG_ASSERT(!root->r.r_active);
3871 
3872   root->r.r_root_team = NULL;
3873   root->r.r_hot_team = NULL;
3874   // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3875   // before call to __kmp_free_team().
3876   __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3877 #if KMP_NESTED_HOT_TEAMS
3878   if (__kmp_hot_teams_max_level >
3879       0) { // need to free nested hot teams and their threads if any
3880     for (i = 0; i < hot_team->t.t_nproc; ++i) {
3881       kmp_info_t *th = hot_team->t.t_threads[i];
3882       if (__kmp_hot_teams_max_level > 1) {
3883         n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3884       }
3885       if (th->th.th_hot_teams) {
3886         __kmp_free(th->th.th_hot_teams);
3887         th->th.th_hot_teams = NULL;
3888       }
3889     }
3890   }
3891 #endif
3892   __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3893 
3894   // Before we can reap the thread, we need to make certain that all other
3895   // threads in the teams that had this root as ancestor have stopped trying to
3896   // steal tasks.
3897   if (__kmp_tasking_mode != tskm_immediate_exec) {
3898     __kmp_wait_to_unref_task_teams();
3899   }
3900 
3901 #if KMP_OS_WINDOWS
3902   /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3903   KA_TRACE(
3904       10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3905            "\n",
3906            (LPVOID) & (root->r.r_uber_thread->th),
3907            root->r.r_uber_thread->th.th_info.ds.ds_thread));
3908   __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3909 #endif /* KMP_OS_WINDOWS */
3910 
3911 #if OMPT_SUPPORT
3912   ompt_data_t *task_data;
3913   ompt_data_t *parallel_data;
  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
                                NULL);
3915   if (ompt_enabled.ompt_callback_implicit_task) {
3916     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3917         ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
3918   }
3919   if (ompt_enabled.ompt_callback_thread_end) {
3920     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3921         &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3922   }
3923 #endif
3924 
3925   TCW_4(__kmp_nth,
3926         __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
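  // The post-decrement below means i holds the value before the decrement, so
  // i == 1 indicates this root was the last thread in its contention group.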
3927   i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
3928   KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
3929                  " to %d\n",
3930                  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
3931                  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
3932   if (i == 1) {
3933     // need to free contention group structure
3934     KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
3935                      root->r.r_uber_thread->th.th_cg_roots->cg_root);
3936     KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
3937     __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
3938     root->r.r_uber_thread->th.th_cg_roots = NULL;
3939   }
3940   __kmp_reap_thread(root->r.r_uber_thread, 1);
3941 
  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
  // it instead of freeing it.
3944   root->r.r_uber_thread = NULL;
3945   /* mark root as no longer in use */
3946   root->r.r_begin = FALSE;
3947 
3948   return n;
3949 }
3950 
3951 void __kmp_unregister_root_current_thread(int gtid) {
3952   KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
  /* This lock should be OK, since unregister_root_current_thread is never
     called during an abort, only during a normal close. Furthermore, if you
     hold the forkjoin lock, you should never try to get the initz lock. */
3956   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3957   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
3958     KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
3959                   "exiting T#%d\n",
3960                   gtid));
3961     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3962     return;
3963   }
3964   kmp_root_t *root = __kmp_root[gtid];
3965 
3966   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3967   KMP_ASSERT(KMP_UBER_GTID(gtid));
3968   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3969   KMP_ASSERT(root->r.r_active == FALSE);
3970 
3971   KMP_MB();
3972 
3973   kmp_info_t *thread = __kmp_threads[gtid];
3974   kmp_team_t *team = thread->th.th_team;
3975   kmp_task_team_t *task_team = thread->th.th_task_team;
3976 
3977   // we need to wait for the proxy tasks before finishing the thread
3978   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
3979 #if OMPT_SUPPORT
3980     // the runtime is shutting down so we won't report any events
3981     thread->th.ompt_thread_info.state = ompt_state_undefined;
3982 #endif
3983     __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
3984   }
3985 
3986   __kmp_reset_root(gtid, root);
3987 
3988   /* free up this thread slot */
3989   __kmp_gtid_set_specific(KMP_GTID_DNE);
3990 #ifdef KMP_TDATA_GTID
3991   __kmp_gtid = KMP_GTID_DNE;
3992 #endif
3993 
3994   KMP_MB();
3995   KC_TRACE(10,
3996            ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
3997 
3998   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3999 }
4000 
4001 #if KMP_OS_WINDOWS
4002 /* __kmp_forkjoin_lock must be already held
4003    Unregisters a root thread that is not the current thread.  Returns the number
4004    of __kmp_threads entries freed as a result. */
4005 static int __kmp_unregister_root_other_thread(int gtid) {
4006   kmp_root_t *root = __kmp_root[gtid];
4007   int r;
4008 
4009   KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4010   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4011   KMP_ASSERT(KMP_UBER_GTID(gtid));
4012   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4013   KMP_ASSERT(root->r.r_active == FALSE);
4014 
4015   r = __kmp_reset_root(gtid, root);
4016   KC_TRACE(10,
4017            ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4018   return r;
4019 }
4020 #endif
4021 
4022 #if KMP_DEBUG
4023 void __kmp_task_info() {
4024 
4025   kmp_int32 gtid = __kmp_entry_gtid();
4026   kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4027   kmp_info_t *this_thr = __kmp_threads[gtid];
4028   kmp_team_t *steam = this_thr->th.th_serial_team;
4029   kmp_team_t *team = this_thr->th.th_team;
4030 
4031   __kmp_printf(
4032       "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4033       "ptask=%p\n",
4034       gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4035       team->t.t_implicit_task_taskdata[tid].td_parent);
4036 }
4037 #endif // KMP_DEBUG
4038 
4039 /* TODO optimize with one big memclr, take out what isn't needed, split
4040    responsibility to workers as much as possible, and delay initialization of
4041    features as much as possible  */
4042 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4043                                   int tid, int gtid) {
4044   /* this_thr->th.th_info.ds.ds_gtid is setup in
4045      kmp_allocate_thread/create_worker.
4046      this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4047   kmp_info_t *master = team->t.t_threads[0];
4048   KMP_DEBUG_ASSERT(this_thr != NULL);
4049   KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4050   KMP_DEBUG_ASSERT(team);
4051   KMP_DEBUG_ASSERT(team->t.t_threads);
4052   KMP_DEBUG_ASSERT(team->t.t_dispatch);
4053   KMP_DEBUG_ASSERT(master);
4054   KMP_DEBUG_ASSERT(master->th.th_root);
4055 
4056   KMP_MB();
4057 
4058   TCW_SYNC_PTR(this_thr->th.th_team, team);
4059 
4060   this_thr->th.th_info.ds.ds_tid = tid;
4061   this_thr->th.th_set_nproc = 0;
4062   if (__kmp_tasking_mode != tskm_immediate_exec)
4063     // When tasking is possible, threads are not safe to reap until they are
4064     // done tasking; this will be set when tasking code is exited in wait
4065     this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4066   else // no tasking --> always safe to reap
4067     this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4068   this_thr->th.th_set_proc_bind = proc_bind_default;
4069 #if KMP_AFFINITY_SUPPORTED
4070   this_thr->th.th_new_place = this_thr->th.th_current_place;
4071 #endif
4072   this_thr->th.th_root = master->th.th_root;
4073 
4074   /* setup the thread's cache of the team structure */
4075   this_thr->th.th_team_nproc = team->t.t_nproc;
4076   this_thr->th.th_team_master = master;
4077   this_thr->th.th_team_serialized = team->t.t_serialized;
4078   TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4079 
4080   KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4081 
4082   KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4083                 tid, gtid, this_thr, this_thr->th.th_current_task));
4084 
4085   __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4086                            team, tid, TRUE);
4087 
4088   KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4089                 tid, gtid, this_thr, this_thr->th.th_current_task));
4090   // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4091   // __kmp_initialize_team()?
4092 
4093   /* TODO no worksharing in speculative threads */
4094   this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4095 
4096   this_thr->th.th_local.this_construct = 0;
4097 
4098   if (!this_thr->th.th_pri_common) {
4099     this_thr->th.th_pri_common =
4100         (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4101     if (__kmp_storage_map) {
4102       __kmp_print_storage_map_gtid(
4103           gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4104           sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4105     }
4106     this_thr->th.th_pri_head = NULL;
4107   }
4108 
4109   if (this_thr != master && // Master's CG root is initialized elsewhere
4110       this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4111     // Make new thread's CG root same as master's
4112     KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4113     kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4114     if (tmp) {
4115       // worker changes CG, need to check if old CG should be freed
4116       int i = tmp->cg_nthreads--;
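      // Note: i holds the pre-decrement count, so i == 1 means this thread
      // was the last member of the old contention group.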
4117       KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4118                      " on node %p of thread %p to %d\n",
4119                      this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4120       if (i == 1) {
4121         __kmp_free(tmp); // last thread left CG --> free it
4122       }
4123     }
4124     this_thr->th.th_cg_roots = master->th.th_cg_roots;
4125     // Increment new thread's CG root's counter to add the new thread
4126     this_thr->th.th_cg_roots->cg_nthreads++;
4127     KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4128                    " node %p of thread %p to %d\n",
4129                    this_thr, this_thr->th.th_cg_roots,
4130                    this_thr->th.th_cg_roots->cg_root,
4131                    this_thr->th.th_cg_roots->cg_nthreads));
4132     this_thr->th.th_current_task->td_icvs.thread_limit =
4133         this_thr->th.th_cg_roots->cg_thread_limit;
4134   }
4135 
4136   /* Initialize dynamic dispatch */
4137   {
4138     volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4139     // Use team max_nproc since this will never change for the team.
4140     size_t disp_size =
4141         sizeof(dispatch_private_info_t) *
4142         (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
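    // A serial team (t_max_nproc == 1) needs only one dispatch buffer, while
    // a parallel team keeps __kmp_dispatch_num_buffers of them so that
    // consecutive worksharing constructs can be dispatched without waiting
    // for an earlier buffer to be recycled.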
4143     KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4144                   team->t.t_max_nproc));
4145     KMP_ASSERT(dispatch);
4146     KMP_DEBUG_ASSERT(team->t.t_dispatch);
4147     KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4148 
4149     dispatch->th_disp_index = 0;
4150     dispatch->th_doacross_buf_idx = 0;
4151     if (!dispatch->th_disp_buffer) {
4152       dispatch->th_disp_buffer =
4153           (dispatch_private_info_t *)__kmp_allocate(disp_size);
4154 
4155       if (__kmp_storage_map) {
4156         __kmp_print_storage_map_gtid(
4157             gtid, &dispatch->th_disp_buffer[0],
4158             &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4159                                           ? 1
4160                                           : __kmp_dispatch_num_buffers],
4161             disp_size, "th_%d.th_dispatch.th_disp_buffer "
4162                        "(team_%d.t_dispatch[%d].th_disp_buffer)",
4163             gtid, team->t.t_id, gtid);
4164       }
4165     } else {
4166       memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4167     }
4168 
4169     dispatch->th_dispatch_pr_current = 0;
4170     dispatch->th_dispatch_sh_current = 0;
4171 
4172     dispatch->th_deo_fcn = 0; /* ORDERED     */
4173     dispatch->th_dxo_fcn = 0; /* END ORDERED */
4174   }
4175 
4176   this_thr->th.th_next_pool = NULL;
4177 
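  // Lazily allocate the task-state memo stack, which is used to save and
  // restore th_task_state across nested parallel regions; it starts with 4
  // zeroed entries.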
4178   if (!this_thr->th.th_task_state_memo_stack) {
4179     size_t i;
4180     this_thr->th.th_task_state_memo_stack =
4181         (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4182     this_thr->th.th_task_state_top = 0;
4183     this_thr->th.th_task_state_stack_sz = 4;
4184     for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4185          ++i) // zero init the stack
4186       this_thr->th.th_task_state_memo_stack[i] = 0;
4187   }
4188 
4189   KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4190   KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4191 
4192   KMP_MB();
4193 }
4194 
/* Allocate a new thread for the requesting team. This is only called from
   within a fork/join critical section. We first try to get an available thread
   from the thread pool; if none is available, we fork a new one, assuming we
   are able to create one. This should be assured, as the caller should have
   checked on this first. */
4200 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4201                                   int new_tid) {
4202   kmp_team_t *serial_team;
4203   kmp_info_t *new_thr;
4204   int new_gtid;
4205 
4206   KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4207   KMP_DEBUG_ASSERT(root && team);
4208 #if !KMP_NESTED_HOT_TEAMS
4209   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4210 #endif
4211   KMP_MB();
4212 
4213   /* first, try to get one from the thread pool */
4214   if (__kmp_thread_pool) {
4215     new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4216     __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4217     if (new_thr == __kmp_thread_pool_insert_pt) {
4218       __kmp_thread_pool_insert_pt = NULL;
4219     }
4220     TCW_4(new_thr->th.th_in_pool, FALSE);
4221     __kmp_suspend_initialize_thread(new_thr);
4222     __kmp_lock_suspend_mx(new_thr);
4223     if (new_thr->th.th_active_in_pool == TRUE) {
4224       KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4225       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4226       new_thr->th.th_active_in_pool = FALSE;
4227     }
4228     __kmp_unlock_suspend_mx(new_thr);
4229 
4230     KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4231                   __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4232     KMP_ASSERT(!new_thr->th.th_team);
4233     KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4234 
4235     /* setup the thread structure */
4236     __kmp_initialize_info(new_thr, team, new_tid,
4237                           new_thr->th.th_info.ds.ds_gtid);
4238     KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4239 
4240     TCW_4(__kmp_nth, __kmp_nth + 1);
4241 
4242     new_thr->th.th_task_state = 0;
4243     new_thr->th.th_task_state_top = 0;
4244     new_thr->th.th_task_state_stack_sz = 4;
4245 
4246 #ifdef KMP_ADJUST_BLOCKTIME
4247     /* Adjust blocktime back to zero if necessary */
4248     /* Middle initialization might not have occurred yet */
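    /* If the user did not specify a blocktime and we are now oversubscribed
       (more threads than available processors), force blocktime to zero so
       idle threads sleep immediately instead of spin-waiting. */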
4249     if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4250       if (__kmp_nth > __kmp_avail_proc) {
4251         __kmp_zero_bt = TRUE;
4252       }
4253     }
4254 #endif /* KMP_ADJUST_BLOCKTIME */
4255 
4256 #if KMP_DEBUG
    // If the thread entered the pool via __kmp_free_thread, wait_flag should
    // not equal KMP_BARRIER_PARENT_FLAG.
4259     int b;
4260     kmp_balign_t *balign = new_thr->th.th_bar;
4261     for (b = 0; b < bs_last_barrier; ++b)
4262       KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4263 #endif
4264 
4265     KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4266                   __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4267 
4268     KMP_MB();
4269     return new_thr;
4270   }
4271 
  /* no, we'll fork a new one */
4273   KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4274   KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4275 
4276 #if KMP_USE_MONITOR
4277   // If this is the first worker thread the RTL is creating, then also
4278   // launch the monitor thread.  We try to do this as early as possible.
4279   if (!TCR_4(__kmp_init_monitor)) {
4280     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4281     if (!TCR_4(__kmp_init_monitor)) {
4282       KF_TRACE(10, ("before __kmp_create_monitor\n"));
4283       TCW_4(__kmp_init_monitor, 1);
4284       __kmp_create_monitor(&__kmp_monitor);
4285       KF_TRACE(10, ("after __kmp_create_monitor\n"));
4286 #if KMP_OS_WINDOWS
      // AC: wait until the monitor has started. This is a fix for CQ232808.
      // The reason is that if the library is loaded/unloaded in a loop with
      // small (parallel) work in between, then there is a high probability
      // that the monitor thread starts after the library shutdown. At shutdown
      // it is too late to cope with the problem, because when the master is in
      // DllMain (process detach) the monitor has no chance to start (it is
      // blocked), and the master has no means to inform the monitor that the
      // library has gone, because all the memory the monitor can access is
      // about to be released/reset.
4296       while (TCR_4(__kmp_init_monitor) < 2) {
4297         KMP_YIELD(TRUE);
4298       }
4299       KF_TRACE(10, ("after monitor thread has started\n"));
4300 #endif
4301     }
4302     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4303   }
4304 #endif
4305 
4306   KMP_MB();
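  /* Find the lowest unused gtid slot for the new worker; the search starts at
     1 because gtid 0 belongs to the initial root thread. */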
4307   for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) {
4308     KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4309   }
4310 
4311   /* allocate space for it. */
4312   new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4313 
4314   TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4315 
4316 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
  // Suppress race condition detection on synchronization flags in debug mode;
  // this helps analyze library internals by eliminating false positives
4319   __itt_suppress_mark_range(
4320       __itt_suppress_range, __itt_suppress_threading_errors,
4321       &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4322   __itt_suppress_mark_range(
4323       __itt_suppress_range, __itt_suppress_threading_errors,
4324       &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4325 #if KMP_OS_WINDOWS
4326   __itt_suppress_mark_range(
4327       __itt_suppress_range, __itt_suppress_threading_errors,
4328       &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4329 #else
4330   __itt_suppress_mark_range(__itt_suppress_range,
4331                             __itt_suppress_threading_errors,
4332                             &new_thr->th.th_suspend_init_count,
4333                             sizeof(new_thr->th.th_suspend_init_count));
4334 #endif
4335   // TODO: check if we need to also suppress b_arrived flags
4336   __itt_suppress_mark_range(__itt_suppress_range,
4337                             __itt_suppress_threading_errors,
4338                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4339                             sizeof(new_thr->th.th_bar[0].bb.b_go));
4340   __itt_suppress_mark_range(__itt_suppress_range,
4341                             __itt_suppress_threading_errors,
4342                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4343                             sizeof(new_thr->th.th_bar[1].bb.b_go));
4344   __itt_suppress_mark_range(__itt_suppress_range,
4345                             __itt_suppress_threading_errors,
4346                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4347                             sizeof(new_thr->th.th_bar[2].bb.b_go));
4348 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4349   if (__kmp_storage_map) {
4350     __kmp_print_thread_storage_map(new_thr, new_gtid);
4351   }
4352 
4353   // add the reserve serialized team, initialized from the team's master thread
4354   {
4355     kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4356     KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4357     new_thr->th.th_serial_team = serial_team =
4358         (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4359 #if OMPT_SUPPORT
4360                                           ompt_data_none, // root parallel id
4361 #endif
4362                                           proc_bind_default, &r_icvs,
4363                                           0 USE_NESTED_HOT_ARG(NULL));
4364   }
4365   KMP_ASSERT(serial_team);
  serial_team->t.t_serialized = 0; // AC: the team is created in reserve, not
  // for execution (it is unused for now).
4368   serial_team->t.t_threads[0] = new_thr;
4369   KF_TRACE(10,
4370            ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4371             new_thr));
4372 
4373   /* setup the thread structures */
4374   __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4375 
4376 #if USE_FAST_MEMORY
4377   __kmp_initialize_fast_memory(new_thr);
4378 #endif /* USE_FAST_MEMORY */
4379 
4380 #if KMP_USE_BGET
4381   KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4382   __kmp_initialize_bget(new_thr);
4383 #endif
4384 
4385   __kmp_init_random(new_thr); // Initialize random number generator
4386 
4387   /* Initialize these only once when thread is grabbed for a team allocation */
4388   KA_TRACE(20,
4389            ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4390             __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
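  // Workers block on their b_go flag in the fork barrier until the master
  // releases them, so each barrier's b_go must start in
  // KMP_INIT_BARRIER_STATE.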
4391 
4392   int b;
4393   kmp_balign_t *balign = new_thr->th.th_bar;
4394   for (b = 0; b < bs_last_barrier; ++b) {
4395     balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4396     balign[b].bb.team = NULL;
4397     balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4398     balign[b].bb.use_oncore_barrier = 0;
4399   }
4400 
4401   new_thr->th.th_spin_here = FALSE;
4402   new_thr->th.th_next_waiting = 0;
4403 #if KMP_OS_UNIX
4404   new_thr->th.th_blocking = false;
4405 #endif
4406 
4407 #if KMP_AFFINITY_SUPPORTED
4408   new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4409   new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4410   new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4411   new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4412 #endif
4413   new_thr->th.th_def_allocator = __kmp_def_allocator;
4414   new_thr->th.th_prev_level = 0;
4415   new_thr->th.th_prev_num_threads = 1;
4416 
4417   TCW_4(new_thr->th.th_in_pool, FALSE);
4418   new_thr->th.th_active_in_pool = FALSE;
4419   TCW_4(new_thr->th.th_active, TRUE);
4420 
4421   /* adjust the global counters */
4422   __kmp_all_nth++;
4423   __kmp_nth++;
4424 
  // if __kmp_adjust_gtid_mode is set, then we use method #1 (stack-pointer
  // search) for low thread counts, and method #2 (keyed TLS API call) for
  // higher counts.
4427   if (__kmp_adjust_gtid_mode) {
4428     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4429       if (TCR_4(__kmp_gtid_mode) != 2) {
4430         TCW_4(__kmp_gtid_mode, 2);
4431       }
4432     } else {
4433       if (TCR_4(__kmp_gtid_mode) != 1) {
4434         TCW_4(__kmp_gtid_mode, 1);
4435       }
4436     }
4437   }
4438 
4439 #ifdef KMP_ADJUST_BLOCKTIME
4440   /* Adjust blocktime back to zero if necessary       */
4441   /* Middle initialization might not have occurred yet */
4442   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4443     if (__kmp_nth > __kmp_avail_proc) {
4444       __kmp_zero_bt = TRUE;
4445     }
4446   }
4447 #endif /* KMP_ADJUST_BLOCKTIME */
4448 
4449   /* actually fork it and create the new worker thread */
4450   KF_TRACE(
4451       10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4452   __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4453   KF_TRACE(10,
4454            ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4455 
4456   KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4457                 new_gtid));
4458   KMP_MB();
4459   return new_thr;
4460 }
4461 
/* Reinitialize team for reuse.
   The hot team code calls this routine at every fork barrier, so EPCC barrier
   tests are extremely sensitive to changes in it, especially writes to the
   team struct, which cause a cache invalidation in all threads.
   IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4467 static void __kmp_reinitialize_team(kmp_team_t *team,
4468                                     kmp_internal_control_t *new_icvs,
4469                                     ident_t *loc) {
4470   KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4471                 team->t.t_threads[0], team));
4472   KMP_DEBUG_ASSERT(team && new_icvs);
4473   KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4474   KMP_CHECK_UPDATE(team->t.t_ident, loc);
4475 
4476   KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4477   // Copy ICVs to the master thread's implicit taskdata
4478   __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4479   copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4480 
4481   KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4482                 team->t.t_threads[0], team));
4483 }
4484 
4485 /* Initialize the team data structure.
4486    This assumes the t_threads and t_max_nproc are already set.
4487    Also, we don't touch the arguments */
4488 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4489                                   kmp_internal_control_t *new_icvs,
4490                                   ident_t *loc) {
4491   KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4492 
4493   /* verify */
4494   KMP_DEBUG_ASSERT(team);
4495   KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4496   KMP_DEBUG_ASSERT(team->t.t_threads);
4497   KMP_MB();
4498 
4499   team->t.t_master_tid = 0; /* not needed */
4500   /* team->t.t_master_bar;        not needed */
4501   team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4502   team->t.t_nproc = new_nproc;
4503 
4504   /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
4505   team->t.t_next_pool = NULL;
4506   /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4507    * up hot team */
4508 
4509   TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4510   team->t.t_invoke = NULL; /* not needed */
4511 
4512   // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
4513   team->t.t_sched.sched = new_icvs->sched.sched;
4514 
4515 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4516   team->t.t_fp_control_saved = FALSE; /* not needed */
4517   team->t.t_x87_fpu_control_word = 0; /* not needed */
4518   team->t.t_mxcsr = 0; /* not needed */
4519 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4520 
4521   team->t.t_construct = 0;
4522 
4523   team->t.t_ordered.dt.t_value = 0;
4524   team->t.t_master_active = FALSE;
4525 
4526 #ifdef KMP_DEBUG
4527   team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4528 #endif
4529 #if KMP_OS_WINDOWS
4530   team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4531 #endif
4532 
4533   team->t.t_control_stack_top = NULL;
4534 
4535   __kmp_reinitialize_team(team, new_icvs, loc);
4536 
4537   KMP_MB();
4538   KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4539 }
4540 
4541 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
/* Sets the full affinity mask for the calling thread and stores the previous
   mask in old_mask (if non-NULL); makes no changes to internal structures. */
4543 static void
4544 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4545   if (KMP_AFFINITY_CAPABLE()) {
4546     int status;
4547     if (old_mask != NULL) {
4548       status = __kmp_get_system_affinity(old_mask, TRUE);
4549       int error = errno;
4550       if (status != 0) {
4551         __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4552                     __kmp_msg_null);
4553       }
4554     }
4555     __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4556   }
4557 }
4558 #endif
4559 
4560 #if KMP_AFFINITY_SUPPORTED
4561 
// __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
// It calculates the worker + master thread's partition based upon the parent
// thread's partition, and binds each worker to a place in its partition.
// The master thread's partition should already include its current binding.
4566 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4567   // Copy the master thread's place partition to the team struct
4568   kmp_info_t *master_th = team->t.t_threads[0];
4569   KMP_DEBUG_ASSERT(master_th != NULL);
4570   kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4571   int first_place = master_th->th.th_first_place;
4572   int last_place = master_th->th.th_last_place;
4573   int masters_place = master_th->th.th_current_place;
4574   team->t.t_first_place = first_place;
4575   team->t.t_last_place = last_place;
4576 
4577   KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4578                 "bound to place %d partition = [%d,%d]\n",
4579                 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4580                 team->t.t_id, masters_place, first_place, last_place));
4581 
4582   switch (proc_bind) {
4583 
4584   case proc_bind_default:
    // serial teams might have the proc_bind policy set to proc_bind_default.
    // It doesn't matter, as we don't rebind the master thread for any
    // proc_bind policy
4587     KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4588     break;
4589 
4590   case proc_bind_master: {
4591     int f;
4592     int n_th = team->t.t_nproc;
4593     for (f = 1; f < n_th; f++) {
4594       kmp_info_t *th = team->t.t_threads[f];
4595       KMP_DEBUG_ASSERT(th != NULL);
4596       th->th.th_first_place = first_place;
4597       th->th.th_last_place = last_place;
4598       th->th.th_new_place = masters_place;
4599       if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4600           team->t.t_display_affinity != 1) {
4601         team->t.t_display_affinity = 1;
4602       }
4603 
4604       KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
4605                      "partition = [%d,%d]\n",
4606                      __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4607                      f, masters_place, first_place, last_place));
4608     }
4609   } break;
4610 
4611   case proc_bind_close: {
4612     int f;
4613     int n_th = team->t.t_nproc;
4614     int n_places;
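    // If first_place > last_place, the partition wraps around the end of the
    // place list, so count the places on both sides of the wrap.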
4615     if (first_place <= last_place) {
4616       n_places = last_place - first_place + 1;
4617     } else {
4618       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4619     }
4620     if (n_th <= n_places) {
4621       int place = masters_place;
4622       for (f = 1; f < n_th; f++) {
4623         kmp_info_t *th = team->t.t_threads[f];
4624         KMP_DEBUG_ASSERT(th != NULL);
4625 
4626         if (place == last_place) {
4627           place = first_place;
4628         } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4629           place = 0;
4630         } else {
4631           place++;
4632         }
4633         th->th.th_first_place = first_place;
4634         th->th.th_last_place = last_place;
4635         th->th.th_new_place = place;
4636         if (__kmp_display_affinity && place != th->th.th_current_place &&
4637             team->t.t_display_affinity != 1) {
4638           team->t.t_display_affinity = 1;
4639         }
4640 
4641         KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4642                        "partition = [%d,%d]\n",
4643                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4644                        team->t.t_id, f, place, first_place, last_place));
4645       }
4646     } else {
4647       int S, rem, gap, s_count;
4648       S = n_th / n_places;
4649       s_count = 0;
4650       rem = n_th - (S * n_places);
4651       gap = rem > 0 ? n_places / rem : n_places;
4652       int place = masters_place;
4653       int gap_ct = gap;
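      // Illustrative example: n_th = 10 threads over n_places = 4 places
      // gives S = 2, rem = 2, gap = 2, so the loop below assigns 3, 2, 3, 2
      // threads to the places starting at the master's place; the places
      // receiving an extra thread are spaced gap apart.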
4654       for (f = 0; f < n_th; f++) {
4655         kmp_info_t *th = team->t.t_threads[f];
4656         KMP_DEBUG_ASSERT(th != NULL);
4657 
4658         th->th.th_first_place = first_place;
4659         th->th.th_last_place = last_place;
4660         th->th.th_new_place = place;
4661         if (__kmp_display_affinity && place != th->th.th_current_place &&
4662             team->t.t_display_affinity != 1) {
4663           team->t.t_display_affinity = 1;
4664         }
4665         s_count++;
4666 
4667         if ((s_count == S) && rem && (gap_ct == gap)) {
4668           // do nothing, add an extra thread to place on next iteration
4669         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4670           // we added an extra thread to this place; move to next place
4671           if (place == last_place) {
4672             place = first_place;
4673           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4674             place = 0;
4675           } else {
4676             place++;
4677           }
4678           s_count = 0;
4679           gap_ct = 1;
4680           rem--;
4681         } else if (s_count == S) { // place full; don't add extra
4682           if (place == last_place) {
4683             place = first_place;
4684           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4685             place = 0;
4686           } else {
4687             place++;
4688           }
4689           gap_ct++;
4690           s_count = 0;
4691         }
4692 
4693         KA_TRACE(100,
4694                  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4695                   "partition = [%d,%d]\n",
4696                   __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4697                   th->th.th_new_place, first_place, last_place));
4698       }
4699       KMP_DEBUG_ASSERT(place == masters_place);
4700     }
4701   } break;
4702 
4703   case proc_bind_spread: {
4704     int f;
4705     int n_th = team->t.t_nproc;
4706     int n_places;
4707     int thidx;
4708     if (first_place <= last_place) {
4709       n_places = last_place - first_place + 1;
4710     } else {
4711       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4712     }
4713     if (n_th <= n_places) {
4714       int place = -1;
4715 
4716       if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4717         int S = n_places / n_th;
4718         int s_count, rem, gap, gap_ct;
4719 
4720         place = masters_place;
4721         rem = n_places - n_th * S;
4722         gap = rem ? n_th / rem : 1;
4723         gap_ct = gap;
4724         thidx = n_th;
4725         if (update_master_only == 1)
4726           thidx = 1;
4727         for (f = 0; f < thidx; f++) {
4728           kmp_info_t *th = team->t.t_threads[f];
4729           KMP_DEBUG_ASSERT(th != NULL);
4730 
4731           th->th.th_first_place = place;
4732           th->th.th_new_place = place;
4733           if (__kmp_display_affinity && place != th->th.th_current_place &&
4734               team->t.t_display_affinity != 1) {
4735             team->t.t_display_affinity = 1;
4736           }
4737           s_count = 1;
4738           while (s_count < S) {
4739             if (place == last_place) {
4740               place = first_place;
4741             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4742               place = 0;
4743             } else {
4744               place++;
4745             }
4746             s_count++;
4747           }
4748           if (rem && (gap_ct == gap)) {
4749             if (place == last_place) {
4750               place = first_place;
4751             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4752               place = 0;
4753             } else {
4754               place++;
4755             }
4756             rem--;
4757             gap_ct = 0;
4758           }
4759           th->th.th_last_place = place;
4760           gap_ct++;
4761 
4762           if (place == last_place) {
4763             place = first_place;
4764           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4765             place = 0;
4766           } else {
4767             place++;
4768           }
4769 
4770           KA_TRACE(100,
4771                    ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4772                     "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4773                     __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4774                     f, th->th.th_new_place, th->th.th_first_place,
4775                     th->th.th_last_place, __kmp_affinity_num_masks));
4776         }
4777       } else {
        /* Given a uniform space of available computation places, we can create
           T partitions of round(P/T) size and put threads into the first place
           of each partition. */
4781         double current = static_cast<double>(masters_place);
4782         double spacing =
4783             (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
4784         int first, last;
4785         kmp_info_t *th;
4786 
4787         thidx = n_th + 1;
4788         if (update_master_only == 1)
4789           thidx = 1;
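        // Illustrative example: with masters_place = 0, n_places = 8 and
        // n_th = 3, spacing = 3.0, so the loop below assigns partitions
        // [0,2], [3,5] and [6,7], binding the threads to places 0, 3 and 6.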
4790         for (f = 0; f < thidx; f++) {
4791           first = static_cast<int>(current);
4792           last = static_cast<int>(current + spacing) - 1;
4793           KMP_DEBUG_ASSERT(last >= first);
4794           if (first >= n_places) {
4795             if (masters_place) {
4796               first -= n_places;
4797               last -= n_places;
4798               if (first == (masters_place + 1)) {
4799                 KMP_DEBUG_ASSERT(f == n_th);
4800                 first--;
4801               }
4802               if (last == masters_place) {
4803                 KMP_DEBUG_ASSERT(f == (n_th - 1));
4804                 last--;
4805               }
4806             } else {
4807               KMP_DEBUG_ASSERT(f == n_th);
4808               first = 0;
4809               last = 0;
4810             }
4811           }
4812           if (last >= n_places) {
4813             last = (n_places - 1);
4814           }
4815           place = first;
4816           current += spacing;
4817           if (f < n_th) {
4818             KMP_DEBUG_ASSERT(0 <= first);
4819             KMP_DEBUG_ASSERT(n_places > first);
4820             KMP_DEBUG_ASSERT(0 <= last);
4821             KMP_DEBUG_ASSERT(n_places > last);
4822             KMP_DEBUG_ASSERT(last_place >= first_place);
4823             th = team->t.t_threads[f];
4824             KMP_DEBUG_ASSERT(th);
4825             th->th.th_first_place = first;
4826             th->th.th_new_place = place;
4827             th->th.th_last_place = last;
4828             if (__kmp_display_affinity && place != th->th.th_current_place &&
4829                 team->t.t_display_affinity != 1) {
4830               team->t.t_display_affinity = 1;
4831             }
4832             KA_TRACE(100,
4833                      ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4834                       "partition = [%d,%d], spacing = %.4f\n",
4835                       __kmp_gtid_from_thread(team->t.t_threads[f]),
4836                       team->t.t_id, f, th->th.th_new_place,
4837                       th->th.th_first_place, th->th.th_last_place, spacing));
4838           }
4839         }
4840       }
4841       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4842     } else {
4843       int S, rem, gap, s_count;
4844       S = n_th / n_places;
4845       s_count = 0;
4846       rem = n_th - (S * n_places);
4847       gap = rem > 0 ? n_places / rem : n_places;
4848       int place = masters_place;
4849       int gap_ct = gap;
4850       thidx = n_th;
4851       if (update_master_only == 1)
4852         thidx = 1;
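      // Same distribution scheme as the oversubscribed proc_bind_close case
      // above, except that each thread's partition is restricted to the
      // single place it is assigned to.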
4853       for (f = 0; f < thidx; f++) {
4854         kmp_info_t *th = team->t.t_threads[f];
4855         KMP_DEBUG_ASSERT(th != NULL);
4856 
4857         th->th.th_first_place = place;
4858         th->th.th_last_place = place;
4859         th->th.th_new_place = place;
4860         if (__kmp_display_affinity && place != th->th.th_current_place &&
4861             team->t.t_display_affinity != 1) {
4862           team->t.t_display_affinity = 1;
4863         }
4864         s_count++;
4865 
4866         if ((s_count == S) && rem && (gap_ct == gap)) {
4867           // do nothing, add an extra thread to place on next iteration
4868         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4869           // we added an extra thread to this place; move on to next place
4870           if (place == last_place) {
4871             place = first_place;
4872           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4873             place = 0;
4874           } else {
4875             place++;
4876           }
4877           s_count = 0;
4878           gap_ct = 1;
4879           rem--;
4880         } else if (s_count == S) { // place is full; don't add extra thread
4881           if (place == last_place) {
4882             place = first_place;
4883           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4884             place = 0;
4885           } else {
4886             place++;
4887           }
4888           gap_ct++;
4889           s_count = 0;
4890         }
4891 
4892         KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4893                        "partition = [%d,%d]\n",
4894                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4895                        team->t.t_id, f, th->th.th_new_place,
4896                        th->th.th_first_place, th->th.th_last_place));
4897       }
4898       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4899     }
4900   } break;
4901 
4902   default:
4903     break;
4904   }
4905 
4906   KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4907 }
4908 
4909 #endif // KMP_AFFINITY_SUPPORTED
4910 
4911 /* allocate a new team data structure to use.  take one off of the free pool if
4912    available */
4913 kmp_team_t *
4914 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4915 #if OMPT_SUPPORT
4916                     ompt_data_t ompt_parallel_data,
4917 #endif
4918                     kmp_proc_bind_t new_proc_bind,
4919                     kmp_internal_control_t *new_icvs,
4920                     int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4921   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4922   int f;
4923   kmp_team_t *team;
4924   int use_hot_team = !root->r.r_active;
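  // The root's hot team can be reused when the root is not already inside an
  // active parallel region, i.e., when this is the outermost parallel level.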
4925   int level = 0;
4926 
4927   KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4928   KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4929   KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4930   KMP_MB();
4931 
4932 #if KMP_NESTED_HOT_TEAMS
4933   kmp_hot_team_ptr_t *hot_teams;
4934   if (master) {
4935     team = master->th.th_team;
4936     level = team->t.t_active_level;
4937     if (master->th.th_teams_microtask) { // in teams construct?
4938       if (master->th.th_teams_size.nteams > 1 &&
4939           ( // #teams > 1
4940               team->t.t_pkfn ==
4941                   (microtask_t)__kmp_teams_master || // inner fork of the teams
4942               master->th.th_teams_level <
4943                   team->t.t_level)) { // or nested parallel inside the teams
        ++level; // do not increment if #teams==1, or for the outer fork of the
        // teams; increment otherwise
4946       }
4947     }
4948     hot_teams = master->th.th_hot_teams;
4949     if (level < __kmp_hot_teams_max_level && hot_teams &&
4950         hot_teams[level].hot_team) {
4951       // hot team has already been allocated for given level
4952       use_hot_team = 1;
4953     } else {
4954       use_hot_team = 0;
4955     }
4956   } else {
4957     // check we won't access uninitialized hot_teams, just in case
4958     KMP_DEBUG_ASSERT(new_nproc == 1);
4959   }
4960 #endif
4961   // Optimization to use a "hot" team
4962   if (use_hot_team && new_nproc > 1) {
4963     KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
4964 #if KMP_NESTED_HOT_TEAMS
4965     team = hot_teams[level].hot_team;
4966 #else
4967     team = root->r.r_hot_team;
4968 #endif
4969 #if KMP_DEBUG
4970     if (__kmp_tasking_mode != tskm_immediate_exec) {
4971       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
4972                     "task_team[1] = %p before reinit\n",
4973                     team->t.t_task_team[0], team->t.t_task_team[1]));
4974     }
4975 #endif
4976 
4977     // Has the number of threads changed?
4978     /* Let's assume the most common case is that the number of threads is
4979        unchanged, and put that case first. */
4980     if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
4981       KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
4982       // This case can mean that omp_set_num_threads() was called and the hot
4983       // team size was already reduced, so we check the special flag
4984       if (team->t.t_size_changed == -1) {
4985         team->t.t_size_changed = 1;
4986       } else {
4987         KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
4988       }
4989 
4990       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4991       kmp_r_sched_t new_sched = new_icvs->sched;
4992       // set master's schedule as new run-time schedule
4993       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
4994 
4995       __kmp_reinitialize_team(team, new_icvs,
4996                               root->r.r_uber_thread->th.th_ident);
4997 
4998       KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
4999                     team->t.t_threads[0], team));
5000       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5001 
5002 #if KMP_AFFINITY_SUPPORTED
5003       if ((team->t.t_size_changed == 0) &&
5004           (team->t.t_proc_bind == new_proc_bind)) {
5005         if (new_proc_bind == proc_bind_spread) {
5006           __kmp_partition_places(
5007               team, 1); // add flag to update only master for spread
5008         }
5009         KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5010                        "proc_bind = %d, partition = [%d,%d]\n",
5011                        team->t.t_id, new_proc_bind, team->t.t_first_place,
5012                        team->t.t_last_place));
5013       } else {
5014         KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5015         __kmp_partition_places(team);
5016       }
5017 #else
5018       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5019 #endif /* KMP_AFFINITY_SUPPORTED */
5020     } else if (team->t.t_nproc > new_nproc) {
5021       KA_TRACE(20,
5022                ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5023                 new_nproc));
5024 
5025       team->t.t_size_changed = 1;
5026 #if KMP_NESTED_HOT_TEAMS
5027       if (__kmp_hot_teams_mode == 0) {
        // AC: the saved number of threads should correspond to the team's
        // value in this mode; it can be bigger in mode 1, when the hot team
        // has threads in reserve
5030         KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5031         hot_teams[level].hot_team_nth = new_nproc;
5032 #endif // KMP_NESTED_HOT_TEAMS
5033         /* release the extra threads we don't need any more */
5034         for (f = new_nproc; f < team->t.t_nproc; f++) {
5035           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5036           if (__kmp_tasking_mode != tskm_immediate_exec) {
5037             // When decreasing team size, threads no longer in the team should
5038             // unref task team.
5039             team->t.t_threads[f]->th.th_task_team = NULL;
5040           }
5041           __kmp_free_thread(team->t.t_threads[f]);
5042           team->t.t_threads[f] = NULL;
5043         }
5044 #if KMP_NESTED_HOT_TEAMS
5045       } // (__kmp_hot_teams_mode == 0)
5046       else {
5047         // When keeping extra threads in team, switch threads to wait on own
5048         // b_go flag
5049         for (f = new_nproc; f < team->t.t_nproc; ++f) {
5050           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5051           kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5052           for (int b = 0; b < bs_last_barrier; ++b) {
5053             if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5054               balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5055             }
5056             KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5057           }
5058         }
5059       }
5060 #endif // KMP_NESTED_HOT_TEAMS
5061       team->t.t_nproc = new_nproc;
5062       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5063       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5064       __kmp_reinitialize_team(team, new_icvs,
5065                               root->r.r_uber_thread->th.th_ident);
5066 
5067       // Update remaining threads
5068       for (f = 0; f < new_nproc; ++f) {
5069         team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5070       }
5071 
5072       // restore the current task state of the master thread: should be the
5073       // implicit task
5074       KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5075                     team->t.t_threads[0], team));
5076 
5077       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5078 
5079 #ifdef KMP_DEBUG
5080       for (f = 0; f < team->t.t_nproc; f++) {
5081         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5082                          team->t.t_threads[f]->th.th_team_nproc ==
5083                              team->t.t_nproc);
5084       }
5085 #endif
5086 
5087       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5088 #if KMP_AFFINITY_SUPPORTED
5089       __kmp_partition_places(team);
5090 #endif
5091     } else { // team->t.t_nproc < new_nproc
5092 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5093       kmp_affin_mask_t *old_mask;
5094       if (KMP_AFFINITY_CAPABLE()) {
5095         KMP_CPU_ALLOC(old_mask);
5096       }
5097 #endif
5098 
5099       KA_TRACE(20,
5100                ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5101                 new_nproc));
5102 
5103       team->t.t_size_changed = 1;
5104 
5105 #if KMP_NESTED_HOT_TEAMS
5106       int avail_threads = hot_teams[level].hot_team_nth;
5107       if (new_nproc < avail_threads)
5108         avail_threads = new_nproc;
5109       kmp_info_t **other_threads = team->t.t_threads;
5110       for (f = team->t.t_nproc; f < avail_threads; ++f) {
5111         // Adjust barrier data of reserved threads (if any) of the team
5112         // Other data will be set in __kmp_initialize_info() below.
5113         int b;
5114         kmp_balign_t *balign = other_threads[f]->th.th_bar;
5115         for (b = 0; b < bs_last_barrier; ++b) {
5116           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5117           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5118 #if USE_DEBUGGER
5119           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5120 #endif
5121         }
5122       }
5123       if (hot_teams[level].hot_team_nth >= new_nproc) {
        // we have all needed threads in reserve, no need to allocate any.
        // This is only possible in mode 1; mode 0 cannot have reserved threads
5126         KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5127         team->t.t_nproc = new_nproc; // just get reserved threads involved
5128       } else {
5129         // we may have some threads in reserve, but not enough
5130         team->t.t_nproc =
5131             hot_teams[level]
5132                 .hot_team_nth; // get reserved threads involved if any
5133         hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5134 #endif // KMP_NESTED_HOT_TEAMS
5135         if (team->t.t_max_nproc < new_nproc) {
5136           /* reallocate larger arrays */
5137           __kmp_reallocate_team_arrays(team, new_nproc);
5138           __kmp_reinitialize_team(team, new_icvs, NULL);
5139         }
5140 
5141 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
        /* Temporarily set the full mask for the master thread before creating
           the workers. The reason is that workers inherit the affinity from
           the master, so if many workers are created on a single core quickly,
           they may not get a chance to set their own affinity for a long
           time. */
5146         __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5147 #endif
5148 
5149         /* allocate new threads for the hot team */
5150         for (f = team->t.t_nproc; f < new_nproc; f++) {
5151           kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5152           KMP_DEBUG_ASSERT(new_worker);
5153           team->t.t_threads[f] = new_worker;
5154 
5155           KA_TRACE(20,
                   ("__kmp_allocate_team: team %d init T#%d(%d:%d) arrived: "
                    "join=%llu, plain=%llu\n",
                    team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5159                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5160                     team->t.t_bar[bs_plain_barrier].b_arrived));
5161 
5162           { // Initialize barrier data for new threads.
5163             int b;
5164             kmp_balign_t *balign = new_worker->th.th_bar;
5165             for (b = 0; b < bs_last_barrier; ++b) {
5166               balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5167               KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5168                                KMP_BARRIER_PARENT_FLAG);
5169 #if USE_DEBUGGER
5170               balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5171 #endif
5172             }
5173           }
5174         }
5175 
5176 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5177         if (KMP_AFFINITY_CAPABLE()) {
5178           /* Restore initial master thread's affinity mask */
5179           __kmp_set_system_affinity(old_mask, TRUE);
5180           KMP_CPU_FREE(old_mask);
5181         }
5182 #endif
5183 #if KMP_NESTED_HOT_TEAMS
5184       } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5185 #endif // KMP_NESTED_HOT_TEAMS
      /* make sure everyone is synchronized */
5187       int old_nproc = team->t.t_nproc; // save old value and use to update only
5188       // new threads below
5189       __kmp_initialize_team(team, new_nproc, new_icvs,
5190                             root->r.r_uber_thread->th.th_ident);
5191 
5192       /* reinitialize the threads */
5193       KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5194       for (f = 0; f < team->t.t_nproc; ++f)
5195         __kmp_initialize_info(team->t.t_threads[f], team, f,
5196                               __kmp_gtid_from_tid(f, team));
5197 
5198       if (level) { // set th_task_state for new threads in nested hot team
5199         // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5200         // only need to set the th_task_state for the new threads. th_task_state
5201         // for master thread will not be accurate until after this in
5202         // __kmp_fork_call(), so we look to the master's memo_stack to get the
5203         // correct value.
5204         for (f = old_nproc; f < team->t.t_nproc; ++f)
5205           team->t.t_threads[f]->th.th_task_state =
5206               team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5207       } else { // set th_task_state for new threads in non-nested hot team
5208         int old_state =
5209             team->t.t_threads[0]->th.th_task_state; // copy master's state
5210         for (f = old_nproc; f < team->t.t_nproc; ++f)
5211           team->t.t_threads[f]->th.th_task_state = old_state;
5212       }
5213 
5214 #ifdef KMP_DEBUG
5215       for (f = 0; f < team->t.t_nproc; ++f) {
5216         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5217                          team->t.t_threads[f]->th.th_team_nproc ==
5218                              team->t.t_nproc);
5219       }
5220 #endif
5221 
5222       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5223 #if KMP_AFFINITY_SUPPORTED
5224       __kmp_partition_places(team);
5225 #endif
5226     } // Check changes in number of threads
5227 
5228     kmp_info_t *master = team->t.t_threads[0];
5229     if (master->th.th_teams_microtask) {
5230       for (f = 1; f < new_nproc; ++f) {
5231         // propagate teams construct specific info to workers
5232         kmp_info_t *thr = team->t.t_threads[f];
5233         thr->th.th_teams_microtask = master->th.th_teams_microtask;
5234         thr->th.th_teams_level = master->th.th_teams_level;
5235         thr->th.th_teams_size = master->th.th_teams_size;
5236       }
5237     }
5238 #if KMP_NESTED_HOT_TEAMS
5239     if (level) {
5240       // Sync barrier state for nested hot teams, not needed for outermost hot
5241       // team.
5242       for (f = 1; f < new_nproc; ++f) {
5243         kmp_info_t *thr = team->t.t_threads[f];
5244         int b;
5245         kmp_balign_t *balign = thr->th.th_bar;
5246         for (b = 0; b < bs_last_barrier; ++b) {
5247           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5248           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5249 #if USE_DEBUGGER
5250           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5251 #endif
5252         }
5253       }
5254     }
5255 #endif // KMP_NESTED_HOT_TEAMS
5256 
5257     /* reallocate space for arguments if necessary */
5258     __kmp_alloc_argv_entries(argc, team, TRUE);
5259     KMP_CHECK_UPDATE(team->t.t_argc, argc);
5260     // The hot team re-uses the previous task team,
5261     // if untouched during the previous release->gather phase.
5262 
5263     KF_TRACE(10, (" hot_team = %p\n", team));
5264 
5265 #if KMP_DEBUG
5266     if (__kmp_tasking_mode != tskm_immediate_exec) {
5267       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5268                     "task_team[1] = %p after reinit\n",
5269                     team->t.t_task_team[0], team->t.t_task_team[1]));
5270     }
5271 #endif
5272 
5273 #if OMPT_SUPPORT
5274     __ompt_team_assign_id(team, ompt_parallel_data);
5275 #endif
5276 
5277     KMP_MB();
5278 
5279     return team;
5280   }
5281 
5282   /* next, let's try to take one from the team pool */
5283   KMP_MB();
5284   for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5285     /* TODO: consider resizing undersized teams instead of reaping them, now
5286        that we have a resizing mechanism */
5287     if (team->t.t_max_nproc >= max_nproc) {
5288       /* take this team from the team pool */
5289       __kmp_team_pool = team->t.t_next_pool;
5290 
5291       /* setup the team for fresh use */
5292       __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5293 
5294       KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5295                     "task_team[1] %p to NULL\n",
5296                     &team->t.t_task_team[0], &team->t.t_task_team[1]));
5297       team->t.t_task_team[0] = NULL;
5298       team->t.t_task_team[1] = NULL;
5299 
5300       /* reallocate space for arguments if necessary */
5301       __kmp_alloc_argv_entries(argc, team, TRUE);
5302       KMP_CHECK_UPDATE(team->t.t_argc, argc);
5303 
5304       KA_TRACE(
5305           20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5306                team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5307       { // Initialize barrier data.
5308         int b;
5309         for (b = 0; b < bs_last_barrier; ++b) {
5310           team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5311 #if USE_DEBUGGER
5312           team->t.t_bar[b].b_master_arrived = 0;
5313           team->t.t_bar[b].b_team_arrived = 0;
5314 #endif
5315         }
5316       }
5317 
5318       team->t.t_proc_bind = new_proc_bind;
5319 
5320       KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5321                     team->t.t_id));
5322 
5323 #if OMPT_SUPPORT
5324       __ompt_team_assign_id(team, ompt_parallel_data);
5325 #endif
5326 
5327       KMP_MB();
5328 
5329       return team;
5330     }
5331 
5332     /* reap team if it is too small, then loop back and check the next one */
    // Not sure if this is wise, but it will be redone during the hot-teams
    // rewrite.
5335     /* TODO: Use technique to find the right size hot-team, don't reap them */
5336     team = __kmp_reap_team(team);
5337     __kmp_team_pool = team;
5338   }
5339 
5340   /* nothing available in the pool, no matter, make a new team! */
5341   KMP_MB();
5342   team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5343 
5344   /* and set it up */
5345   team->t.t_max_nproc = max_nproc;
  /* NOTE: for some reason, allocating one big buffer and dividing it up seems
     to really hurt performance on the P4, so let's not use that approach */
5348   __kmp_allocate_team_arrays(team, max_nproc);
5349 
5350   KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5351   __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5352 
5353   KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5354                 "%p to NULL\n",
5355                 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5356   team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5357   // memory, no need to duplicate
5358   team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5359   // memory, no need to duplicate
5360 
5361   if (__kmp_storage_map) {
5362     __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5363   }
5364 
5365   /* allocate space for arguments */
5366   __kmp_alloc_argv_entries(argc, team, FALSE);
5367   team->t.t_argc = argc;
5368 
5369   KA_TRACE(20,
5370            ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5371             team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5372   { // Initialize barrier data.
5373     int b;
5374     for (b = 0; b < bs_last_barrier; ++b) {
5375       team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5376 #if USE_DEBUGGER
5377       team->t.t_bar[b].b_master_arrived = 0;
5378       team->t.t_bar[b].b_team_arrived = 0;
5379 #endif
5380     }
5381   }
5382 
5383   team->t.t_proc_bind = new_proc_bind;
5384 
5385 #if OMPT_SUPPORT
5386   __ompt_team_assign_id(team, ompt_parallel_data);
5387   team->t.ompt_serialized_team_info = NULL;
5388 #endif
5389 
5390   KMP_MB();
5391 
5392   KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5393                 team->t.t_id));
5394 
5395   return team;
5396 }
5397 
5398 /* TODO implement hot-teams at all levels */
5399 /* TODO implement lazy thread release on demand (disband request) */
5400 
5401 /* free the team.  return it to the team pool.  release all the threads
5402  * associated with it */
5403 void __kmp_free_team(kmp_root_t *root,
5404                      kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5405   int f;
5406   KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5407                 team->t.t_id));
5408 
5409   /* verify state */
5410   KMP_DEBUG_ASSERT(root);
5411   KMP_DEBUG_ASSERT(team);
5412   KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5413   KMP_DEBUG_ASSERT(team->t.t_threads);
5414 
5415   int use_hot_team = team == root->r.r_hot_team;
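  // Hot teams are not disbanded here; their threads stay attached so the team
  // can be reused by the next parallel region.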
5416 #if KMP_NESTED_HOT_TEAMS
5417   int level;
5418   kmp_hot_team_ptr_t *hot_teams;
5419   if (master) {
5420     level = team->t.t_active_level - 1;
5421     if (master->th.th_teams_microtask) { // in teams construct?
5422       if (master->th.th_teams_size.nteams > 1) {
5423         ++level; // level was not increased in teams construct for
5424         // team_of_masters
5425       }
5426       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5427           master->th.th_teams_level == team->t.t_level) {
5428         ++level; // level was not increased in teams construct for
5429         // team_of_workers before the parallel
5430       } // team->t.t_level will be increased inside parallel
5431     }
5432     hot_teams = master->th.th_hot_teams;
5433     if (level < __kmp_hot_teams_max_level) {
5434       KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5435       use_hot_team = 1;
5436     }
5437   }
5438 #endif // KMP_NESTED_HOT_TEAMS
5439 
5440   /* team is done working */
5441   TCW_SYNC_PTR(team->t.t_pkfn,
5442                NULL); // Important for Debugging Support Library.
5443 #if KMP_OS_WINDOWS
5444   team->t.t_copyin_counter = 0; // init counter for possible reuse
5445 #endif
5446   // Do not reset pointer to parent team to NULL for hot teams.
5447 
5448   /* if this is a non-hot team, release its threads */
5449   if (!use_hot_team) {
5450     if (__kmp_tasking_mode != tskm_immediate_exec) {
5451       // Wait for threads to reach reapable state
5452       for (f = 1; f < team->t.t_nproc; ++f) {
5453         KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5454         kmp_info_t *th = team->t.t_threads[f];
5455         volatile kmp_uint32 *state = &th->th.th_reap_state;
5456         while (*state != KMP_SAFE_TO_REAP) {
5457 #if KMP_OS_WINDOWS
5458           // On Windows a thread can be killed at any time, check this
5459           DWORD ecode;
5460           if (!__kmp_is_thread_alive(th, &ecode)) {
5461             *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5462             break;
5463           }
5464 #endif
5465           // first check if thread is sleeping
5466           kmp_flag_64 fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5467           if (fl.is_sleeping())
5468             fl.resume(__kmp_gtid_from_thread(th));
5469           KMP_CPU_PAUSE();
5470         }
5471       }
5472 
5473       // Delete task teams
5474       int tt_idx;
5475       for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5476         kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5477         if (task_team != NULL) {
5478           for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5479             KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5480             team->t.t_threads[f]->th.th_task_team = NULL;
5481           }
5482           KA_TRACE(
5483               20,
5484               ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5485                __kmp_get_gtid(), task_team, team->t.t_id));
5486 #if KMP_NESTED_HOT_TEAMS
5487           __kmp_free_task_team(master, task_team);
5488 #endif
5489           team->t.t_task_team[tt_idx] = NULL;
5490         }
5491       }
5492     }
5493 
5494     // Reset pointer to parent team only for non-hot teams.
5495     team->t.t_parent = NULL;
5496     team->t.t_level = 0;
5497     team->t.t_active_level = 0;
5498 
5499     /* free the worker threads */
5500     for (f = 1; f < team->t.t_nproc; ++f) {
5501       KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5502       __kmp_free_thread(team->t.t_threads[f]);
5503       team->t.t_threads[f] = NULL;
5504     }
5505 
5506     /* put the team back in the team pool */
5507     /* TODO limit size of team pool, call reap_team if pool too large */
5508     team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5509     __kmp_team_pool = (volatile kmp_team_t *)team;
5510   } else { // Check if team was created for the masters in a teams construct
5511     // See if first worker is a CG root
5512     KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5513                      team->t.t_threads[1]->th.th_cg_roots);
5514     if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5515       // Clean up the CG root nodes on workers so that this team can be re-used
5516       for (f = 1; f < team->t.t_nproc; ++f) {
5517         kmp_info_t *thr = team->t.t_threads[f];
5518         KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5519                          thr->th.th_cg_roots->cg_root == thr);
5520         // Pop current CG root off list
5521         kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5522         thr->th.th_cg_roots = tmp->up;
5523         KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5524                        " up to node %p. cg_nthreads was %d\n",
5525                        thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
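        // cg_nthreads is read before the post-decrement, so i == 1 means this
        // thread was the last member of the contention group.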
5526         int i = tmp->cg_nthreads--;
5527         if (i == 1) {
5528           __kmp_free(tmp); // free CG if we are the last thread in it
5529         }
5530         // Restore current task's thread_limit from CG root
5531         if (thr->th.th_cg_roots)
5532           thr->th.th_current_task->td_icvs.thread_limit =
5533               thr->th.th_cg_roots->cg_thread_limit;
5534       }
5535     }
5536   }
5537 
5538   KMP_MB();
5539 }
5540 
5541 /* reap the team.  destroy it, reclaim all its resources and free its memory */
5542 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5543   kmp_team_t *next_pool = team->t.t_next_pool;
5544 
5545   KMP_DEBUG_ASSERT(team);
5546   KMP_DEBUG_ASSERT(team->t.t_dispatch);
5547   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5548   KMP_DEBUG_ASSERT(team->t.t_threads);
5549   KMP_DEBUG_ASSERT(team->t.t_argv);
5550 
5551   /* TODO clean the threads that are a part of this? */
5552 
5553   /* free stuff */
5554   __kmp_free_team_arrays(team);
5555   if (team->t.t_argv != &team->t.t_inline_argv[0])
5556     __kmp_free((void *)team->t.t_argv);
5557   __kmp_free(team);
5558 
5559   KMP_MB();
5560   return next_pool;
5561 }
5562 
5563 // Free the thread.  Don't reap it, just place it on the pool of available
5564 // threads.
5565 //
5566 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5567 // binding for the affinity mechanism to be useful.
5568 //
5569 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5570 // However, we want to avoid a potential performance problem by always
5571 // scanning through the list to find the correct point at which to insert
5572 // the thread (potential N**2 behavior).  To do this we keep track of the
5573 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5574 // With single-level parallelism, threads will always be added to the tail
5575 // of the list, kept track of by __kmp_thread_pool_insert_pt.  With nested
5576 // parallelism, all bets are off and we may need to scan through the entire
5577 // free list.
5578 //
5579 // This change also has a potentially large performance benefit, for some
5580 // applications.  Previously, as threads were freed from the hot team, they
5581 // would be placed back on the free list in inverse order.  If the hot team
5582 // grew back to its original size, then the freed threads would be placed
5583 // back on the hot team in reverse order.  This could cause bad cache
5584 // locality problems on programs where the size of the hot team regularly
5585 // grew and shrank.
5586 //
5587 // Now, for single-level parallelism, the OMP tid is always == gtid.
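//
// Illustrative example (comment only): if threads with gtids 3, 5, and 7 are
// freed in that order, the pool links become 3 -> 5 -> 7 and
// __kmp_thread_pool_insert_pt is left pointing at gtid 7, so freeing gtid 9
// next starts its scan at that link instead of at the head of the list.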
5588 void __kmp_free_thread(kmp_info_t *this_th) {
5589   int gtid;
5590   kmp_info_t **scan;
5591 
5592   KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5593                 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5594 
5595   KMP_DEBUG_ASSERT(this_th);
5596 
5597   // When moving the thread to the pool, switch it to wait on its own b_go
5598   // flag, and leave it with an uninitialized team (NULL team).
5599   int b;
5600   kmp_balign_t *balign = this_th->th.th_bar;
5601   for (b = 0; b < bs_last_barrier; ++b) {
5602     if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5603       balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5604     balign[b].bb.team = NULL;
5605     balign[b].bb.leaf_kids = 0;
5606   }
5607   this_th->th.th_task_state = 0;
5608   this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5609 
5610   /* put thread back on the free pool */
5611   TCW_PTR(this_th->th.th_team, NULL);
5612   TCW_PTR(this_th->th.th_root, NULL);
5613   TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5614 
5615   while (this_th->th.th_cg_roots) {
5616     this_th->th.th_cg_roots->cg_nthreads--;
5617     KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5618                    " %p of thread  %p to %d\n",
5619                    this_th, this_th->th.th_cg_roots,
5620                    this_th->th.th_cg_roots->cg_root,
5621                    this_th->th.th_cg_roots->cg_nthreads));
5622     kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5623     if (tmp->cg_root == this_th) { // Thread is a cg_root
5624       KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5625       KA_TRACE(
5626           5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5627       this_th->th.th_cg_roots = tmp->up;
5628       __kmp_free(tmp);
5629     } else { // Worker thread
5630       if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5631         __kmp_free(tmp);
5632       }
5633       this_th->th.th_cg_roots = NULL;
5634       break;
5635     }
5636   }
5637 
5638   /* If the implicit task assigned to this thread can be used by other threads,
5639    * then multiple threads can share the data and try to free the task at
5640    * __kmp_reap_thread at exit. This duplicate use of the task data can happen
5641    * with higher probability when the hot team is disabled, but can occur even
5642    * when the hot team is enabled. */
5643   __kmp_free_implicit_task(this_th);
5644   this_th->th.th_current_task = NULL;
5645 
5646   // If the __kmp_thread_pool_insert_pt is already past the new insert
5647   // point, then we need to re-scan the entire list.
5648   gtid = this_th->th.th_info.ds.ds_gtid;
5649   if (__kmp_thread_pool_insert_pt != NULL) {
5650     KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5651     if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5652       __kmp_thread_pool_insert_pt = NULL;
5653     }
5654   }
5655 
5656   // Scan down the list to find the place to insert the thread.
5657   // scan is the address of a link in the list, possibly the address of
5658   // __kmp_thread_pool itself.
5659   //
5660   // In the absence of nested parallelism, the for loop will have 0 iterations.
5661   if (__kmp_thread_pool_insert_pt != NULL) {
5662     scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5663   } else {
5664     scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5665   }
5666   for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5667        scan = &((*scan)->th.th_next_pool))
5668     ;
5669 
5670   // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5671   // to its address.
5672   TCW_PTR(this_th->th.th_next_pool, *scan);
5673   __kmp_thread_pool_insert_pt = *scan = this_th;
5674   KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5675                    (this_th->th.th_info.ds.ds_gtid <
5676                     this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5677   TCW_4(this_th->th.th_in_pool, TRUE);
5678   __kmp_suspend_initialize_thread(this_th);
5679   __kmp_lock_suspend_mx(this_th);
5680   if (this_th->th.th_active == TRUE) {
5681     KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5682     this_th->th.th_active_in_pool = TRUE;
5683   }
5684 #if KMP_DEBUG
5685   else {
5686     KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5687   }
5688 #endif
5689   __kmp_unlock_suspend_mx(this_th);
5690 
5691   TCW_4(__kmp_nth, __kmp_nth - 1);
5692 
5693 #ifdef KMP_ADJUST_BLOCKTIME
5694   /* Adjust blocktime back to user setting or default if necessary */
5695   /* Middle initialization might never have occurred                */
5696   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5697     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5698     if (__kmp_nth <= __kmp_avail_proc) {
5699       __kmp_zero_bt = FALSE;
5700     }
5701   }
5702 #endif /* KMP_ADJUST_BLOCKTIME */
5703 
5704   KMP_MB();
5705 }
5706 
5707 /* ------------------------------------------------------------------------ */
5708 
5709 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5710   int gtid = this_thr->th.th_info.ds.ds_gtid;
5711   /*    void                 *stack_data;*/
5712   kmp_team_t **volatile pteam;
5713 
5714   KMP_MB();
5715   KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5716 
5717   if (__kmp_env_consistency_check) {
5718     this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5719   }
5720 
5721 #if OMPT_SUPPORT
5722   ompt_data_t *thread_data;
5723   if (ompt_enabled.enabled) {
5724     thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5725     *thread_data = ompt_data_none;
5726 
5727     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5728     this_thr->th.ompt_thread_info.wait_id = 0;
5729     this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5730     this_thr->th.ompt_thread_info.parallel_flags = 0;
5731     if (ompt_enabled.ompt_callback_thread_begin) {
5732       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5733           ompt_thread_worker, thread_data);
5734     }
5735     this_thr->th.ompt_thread_info.state = ompt_state_idle;
5736   }
5737 #endif
5738 
5739   /* This is the place where threads wait for work */
5740   while (!TCR_4(__kmp_global.g.g_done)) {
5741     KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5742     KMP_MB();
5743 
5744     /* wait for work to do */
5745     KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5746 
5747     /* No tid yet since not part of a team */
5748     __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5749 
5750 #if OMPT_SUPPORT
5751     if (ompt_enabled.enabled) {
5752       this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5753     }
5754 #endif
5755 
5756     pteam = &this_thr->th.th_team;
5757 
5758     /* have we been allocated? */
5759     if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5760       /* we were just woken up, so run our new task */
5761       if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5762         int rc;
5763         KA_TRACE(20,
5764                  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5765                   gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5766                   (*pteam)->t.t_pkfn));
5767 
5768         updateHWFPControl(*pteam);
5769 
5770 #if OMPT_SUPPORT
5771         if (ompt_enabled.enabled) {
5772           this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5773         }
5774 #endif
5775 
5776         rc = (*pteam)->t.t_invoke(gtid);
5777         KMP_ASSERT(rc);
5778 
5779         KMP_MB();
5780         KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5781                       gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5782                       (*pteam)->t.t_pkfn));
5783       }
5784 #if OMPT_SUPPORT
5785       if (ompt_enabled.enabled) {
5786         /* no frame set while outside task */
5787         __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
5788 
5789         this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5790       }
5791 #endif
5792       /* join barrier after parallel region */
5793       __kmp_join_barrier(gtid);
5794     }
5795   }
5796   TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5797 
5798 #if OMPT_SUPPORT
5799   if (ompt_enabled.ompt_callback_thread_end) {
5800     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5801   }
5802 #endif
5803 
5804   this_thr->th.th_task_team = NULL;
5805   /* run the destructors for the threadprivate data for this thread */
5806   __kmp_common_destroy_gtid(gtid);
5807 
5808   KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5809   KMP_MB();
5810   return this_thr;
5811 }
5812 
5813 /* ------------------------------------------------------------------------ */
5814 
5815 void __kmp_internal_end_dest(void *specific_gtid) {
5816 #if KMP_COMPILER_ICC
5817 #pragma warning(push)
5818 #pragma warning(disable : 810) // conversion from "void *" to "int" may lose
5819 // significant bits
5820 #endif
5821   // Make sure no significant bits are lost
5822   int gtid = (kmp_intptr_t)specific_gtid - 1;
5823 #if KMP_COMPILER_ICC
5824 #pragma warning(pop)
5825 #endif
5826 
5827   KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5828   /* NOTE: the gtid is stored as gtid+1 in the thread-local storage
5829    * because 0 is reserved for the nothing-stored case */
5830 
5831   /* josh: One reason for setting the gtid specific data even when it is being
5832      destroyed by pthread is to allow gtid lookup through thread specific data
5833      (__kmp_gtid_get_specific).  Some of the code, especially stat code,
5834      that gets executed in the call to __kmp_internal_end_thread, actually
5835      gets the gtid through the thread specific data.  Setting it here seems
5836      rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread
5837      to run smoothly.
5838      todo: get rid of this after we remove the dependence on
5839      __kmp_gtid_get_specific  */
5840   if (gtid >= 0 && KMP_UBER_GTID(gtid))
5841     __kmp_gtid_set_specific(gtid);
5842 #ifdef KMP_TDATA_GTID
5843   __kmp_gtid = gtid;
5844 #endif
5845   __kmp_internal_end_thread(gtid);
5846 }
5847 
5848 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5849 
5850 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5851   __kmp_internal_end_atexit();
5852 }
5853 
5854 #endif
5855 
5856 /* [Windows] josh: when the atexit handler is called, there may still be more
5857    than one thread alive */
5858 void __kmp_internal_end_atexit(void) {
5859   KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5860   /* [Windows]
5861      josh: ideally, we want to completely shutdown the library in this atexit
5862      handler, but stat code that depends on thread specific data for gtid fails
5863      because that data becomes unavailable at some point during the shutdown, so
5864      we call __kmp_internal_end_thread instead. We should eventually remove the
5865      dependency on __kmp_get_specific_gtid in the stat code and use
5866      __kmp_internal_end_library to cleanly shutdown the library.
5867 
5868      // TODO: Can some of this comment about GVS be removed?
5869      I suspect that the offending stat code is executed when the calling thread
5870      tries to clean up a dead root thread's data structures, resulting in GVS
5871      code trying to close the GVS structures for that thread, but since the stat
5872      code uses __kmp_get_specific_gtid to get the gtid with the assumption that
5873      the calling thread is cleaning up itself instead of another thread, it gets
5874      confused. This happens because allowing a thread to unregister and clean up
5875      another thread is a recent modification for addressing an issue.
5876      Based on the current design (20050722), a thread may end up
5877      trying to unregister another thread only if thread death does not trigger
5878      the calling of __kmp_internal_end_thread.  For Linux* OS, there is the
5879      thread specific data destructor function to detect thread death. For
5880      Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
5881      is nothing.  Thus, the workaround is applicable only for Windows static
5882      stat library. */
5883   __kmp_internal_end_library(-1);
5884 #if KMP_OS_WINDOWS
5885   __kmp_close_console();
5886 #endif
5887 }
5888 
5889 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5890   // It is assumed __kmp_forkjoin_lock is acquired.
5891 
5892   int gtid;
5893 
5894   KMP_DEBUG_ASSERT(thread != NULL);
5895 
5896   gtid = thread->th.th_info.ds.ds_gtid;
5897 
5898   if (!is_root) {
5899     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5900       /* Assume the threads are at the fork barrier here */
5901       KA_TRACE(
5902           20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5903                gtid));
5904       /* Need release fence here to prevent seg faults for tree forkjoin barrier
5905        * (GEH) */
5906       ANNOTATE_HAPPENS_BEFORE(thread);
5907       kmp_flag_64 flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
5908       __kmp_release_64(&flag);
5909     }
5910 
5911     // Terminate OS thread.
5912     __kmp_reap_worker(thread);
5913 
5914     // The thread was killed asynchronously.  If it was actively
5915     // spinning in the thread pool, decrement the global count.
5916     //
5917     // There is a small timing hole here - if the worker thread was just waking
5918     // up after sleeping in the pool, had reset its th_active_in_pool flag but
5919     // not decremented the global counter __kmp_thread_pool_active_nth yet, then
5920     // the global counter might not get updated.
5921     //
5922     // Currently, this can only happen as the library is unloaded,
5923     // so there are no harmful side effects.
5924     if (thread->th.th_active_in_pool) {
5925       thread->th.th_active_in_pool = FALSE;
5926       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
5927       KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
5928     }
5929   }
5930 
5931   __kmp_free_implicit_task(thread);
5932 
5933 // Free the fast memory for tasking
5934 #if USE_FAST_MEMORY
5935   __kmp_free_fast_memory(thread);
5936 #endif /* USE_FAST_MEMORY */
5937 
5938   __kmp_suspend_uninitialize_thread(thread);
5939 
5940   KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
5941   TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5942 
5943   --__kmp_all_nth;
5944 // __kmp_nth was decremented when thread is added to the pool.
5945 
5946 #ifdef KMP_ADJUST_BLOCKTIME
5947   /* Adjust blocktime back to user setting or default if necessary */
5948   /* Middle initialization might never have occurred                */
5949   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5950     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5951     if (__kmp_nth <= __kmp_avail_proc) {
5952       __kmp_zero_bt = FALSE;
5953     }
5954   }
5955 #endif /* KMP_ADJUST_BLOCKTIME */
5956 
5957   /* free the memory being used */
5958   if (__kmp_env_consistency_check) {
5959     if (thread->th.th_cons) {
5960       __kmp_free_cons_stack(thread->th.th_cons);
5961       thread->th.th_cons = NULL;
5962     }
5963   }
5964 
5965   if (thread->th.th_pri_common != NULL) {
5966     __kmp_free(thread->th.th_pri_common);
5967     thread->th.th_pri_common = NULL;
5968   }
5969 
5970   if (thread->th.th_task_state_memo_stack != NULL) {
5971     __kmp_free(thread->th.th_task_state_memo_stack);
5972     thread->th.th_task_state_memo_stack = NULL;
5973   }
5974 
5975 #if KMP_USE_BGET
5976   if (thread->th.th_local.bget_data != NULL) {
5977     __kmp_finalize_bget(thread);
5978   }
5979 #endif
5980 
5981 #if KMP_AFFINITY_SUPPORTED
5982   if (thread->th.th_affin_mask != NULL) {
5983     KMP_CPU_FREE(thread->th.th_affin_mask);
5984     thread->th.th_affin_mask = NULL;
5985   }
5986 #endif /* KMP_AFFINITY_SUPPORTED */
5987 
5988 #if KMP_USE_HIER_SCHED
5989   if (thread->th.th_hier_bar_data != NULL) {
5990     __kmp_free(thread->th.th_hier_bar_data);
5991     thread->th.th_hier_bar_data = NULL;
5992   }
5993 #endif
5994 
5995   __kmp_reap_team(thread->th.th_serial_team);
5996   thread->th.th_serial_team = NULL;
5997   __kmp_free(thread);
5998 
5999   KMP_MB();
6000 
6001 } // __kmp_reap_thread
6002 
6003 static void __kmp_internal_end(void) {
6004   int i;
6005 
6006   /* First, unregister the library */
6007   __kmp_unregister_library();
6008 
6009 #if KMP_OS_WINDOWS
6010   /* In Win static library, we can't tell when a root actually dies, so we
6011      reclaim the data structures for any root threads that have died but not
6012      unregistered themselves, in order to shut down cleanly.
6013      In Win dynamic library we also can't tell when a thread dies.  */
6014   __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6015 // dead roots
6016 #endif
6017 
6018   for (i = 0; i < __kmp_threads_capacity; i++)
6019     if (__kmp_root[i])
6020       if (__kmp_root[i]->r.r_active)
6021         break;
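  // On exit, i < __kmp_threads_capacity iff at least one root is still active.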
6022   KMP_MB(); /* Flush all pending memory write invalidates.  */
6023   TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6024 
6025   if (i < __kmp_threads_capacity) {
6026 #if KMP_USE_MONITOR
6027     // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6028     KMP_MB(); /* Flush all pending memory write invalidates.  */
6029 
6030     // Need to check that monitor was initialized before reaping it. If we are
6031     // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6032     // __kmp_monitor will appear to contain valid data, but it is only valid in
6033     // the parent process, not the child.
6034     // New behavior (201008): instead of keying off of the flag
6035     // __kmp_init_parallel, the monitor thread creation is keyed off
6036     // of the new flag __kmp_init_monitor.
6037     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6038     if (TCR_4(__kmp_init_monitor)) {
6039       __kmp_reap_monitor(&__kmp_monitor);
6040       TCW_4(__kmp_init_monitor, 0);
6041     }
6042     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6043     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6044 #endif // KMP_USE_MONITOR
6045   } else {
6046 /* TODO move this to cleanup code */
6047 #ifdef KMP_DEBUG
6048     /* make sure that everything has properly ended */
6049     for (i = 0; i < __kmp_threads_capacity; i++) {
6050       if (__kmp_root[i]) {
6051         // KMP_ASSERT(!KMP_UBER_GTID(i)); // AC: there can be uber threads
6052         // alive here
6053         KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6054       }
6055     }
6056 #endif
6057 
6058     KMP_MB();
6059 
6060     // Reap the worker threads.
6061     // This is valid for now, but be careful if threads are reaped sooner.
6062     while (__kmp_thread_pool != NULL) { // Loop through all threads in the pool.
6063       // Get the next thread from the pool.
6064       kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6065       __kmp_thread_pool = thread->th.th_next_pool;
6066       // Reap it.
6067       KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6068       thread->th.th_next_pool = NULL;
6069       thread->th.th_in_pool = FALSE;
6070       __kmp_reap_thread(thread, 0);
6071     }
6072     __kmp_thread_pool_insert_pt = NULL;
6073 
6074     // Reap teams.
6075     while (__kmp_team_pool != NULL) { // Loop through all teams in the pool.
6076       // Get the next team from the pool.
6077       kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6078       __kmp_team_pool = team->t.t_next_pool;
6079       // Reap it.
6080       team->t.t_next_pool = NULL;
6081       __kmp_reap_team(team);
6082     }
6083 
6084     __kmp_reap_task_teams();
6085 
6086 #if KMP_OS_UNIX
6087     // Threads that are not reaped should not access any resources since they
6088     // are going to be deallocated soon, so the shutdown sequence should wait
6089     // until all threads either exit the final spin-waiting loop or begin
6090     // sleeping after the given blocktime.
6091     for (i = 0; i < __kmp_threads_capacity; i++) {
6092       kmp_info_t *thr = __kmp_threads[i];
6093       while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6094         KMP_CPU_PAUSE();
6095     }
6096 #endif
6097 
6098     for (i = 0; i < __kmp_threads_capacity; ++i) {
6099       // TBD: Add some checking...
6100       // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6101     }
6102 
6103     /* Make sure all threadprivate destructors get run by joining with all
6104        worker threads before resetting this flag */
6105     TCW_SYNC_4(__kmp_init_common, FALSE);
6106 
6107     KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6108     KMP_MB();
6109 
6110 #if KMP_USE_MONITOR
6111     // See note above: One of the possible fixes for CQ138434 / CQ140126
6112     //
6113     // FIXME: push both code fragments down and CSE them?
6114     // push them into __kmp_cleanup() ?
6115     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6116     if (TCR_4(__kmp_init_monitor)) {
6117       __kmp_reap_monitor(&__kmp_monitor);
6118       TCW_4(__kmp_init_monitor, 0);
6119     }
6120     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6121     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6122 #endif
6123   } /* else !__kmp_global.t_active */
6124   TCW_4(__kmp_init_gtid, FALSE);
6125   KMP_MB(); /* Flush all pending memory write invalidates.  */
6126 
6127   __kmp_cleanup();
6128 #if OMPT_SUPPORT
6129   ompt_fini();
6130 #endif
6131 }
6132 
6133 void __kmp_internal_end_library(int gtid_req) {
6134   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6135   /* this shouldn't be a race condition because __kmp_internal_end() is the
6136      only place to clear __kmp_serial_init */
6137   /* we'll check this later too, after we get the lock */
6138   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6139   // redundant, because the next check will work in any case.
6140   if (__kmp_global.g.g_abort) {
6141     KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6142     /* TODO abort? */
6143     return;
6144   }
6145   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6146     KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6147     return;
6148   }
6149 
6150   KMP_MB(); /* Flush all pending memory write invalidates.  */
6151   /* find out who we are and what we should do */
6152   {
6153     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6154     KA_TRACE(
6155         10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req));
6156     if (gtid == KMP_GTID_SHUTDOWN) {
6157       KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6158                     "already shutdown\n"));
6159       return;
6160     } else if (gtid == KMP_GTID_MONITOR) {
6161       KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6162                     "registered, or system shutdown\n"));
6163       return;
6164     } else if (gtid == KMP_GTID_DNE) {
6165       KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6166                     "shutdown\n"));
6167       /* we don't know who we are, but we may still shutdown the library */
6168     } else if (KMP_UBER_GTID(gtid)) {
6169       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6170       if (__kmp_root[gtid]->r.r_active) {
6171         __kmp_global.g.g_abort = -1;
6172         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6173         KA_TRACE(10,
6174                  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6175                   gtid));
6176         return;
6177       } else {
6178         KA_TRACE(
6179             10,
6180             ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6181         __kmp_unregister_root_current_thread(gtid);
6182       }
6183     } else {
6184 /* worker threads may call this function through the atexit handler, if they
6185  * call exit() */
6186 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6187    TODO: do a thorough shutdown instead */
6188 #ifdef DUMP_DEBUG_ON_EXIT
6189       if (__kmp_debug_buf)
6190         __kmp_dump_debug_buffer();
6191 #endif
6192       // Unregister the library here now that shared memory is used on Linux;
6193       // otherwise it would leave lots of stale files in /dev/shm.
6194       // Clean up the shared memory file before exiting.
6195       __kmp_unregister_library();
6196       return;
6197     }
6198   }
6199   /* synchronize the termination process */
6200   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6201 
6202   /* have we already finished */
6203   if (__kmp_global.g.g_abort) {
6204     KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6205     /* TODO abort? */
6206     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6207     return;
6208   }
6209   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6210     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6211     return;
6212   }
6213 
6214   /* We need this lock to enforce mutex between this reading of
6215      __kmp_threads_capacity and the writing by __kmp_register_root.
6216      Alternatively, we can use a counter of roots that is atomically updated by
6217      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6218      __kmp_internal_end_*.  */
6219   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6220 
6221   /* now we can safely conduct the actual termination */
6222   __kmp_internal_end();
6223 
6224   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6225   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6226 
6227   KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6228 
6229 #ifdef DUMP_DEBUG_ON_EXIT
6230   if (__kmp_debug_buf)
6231     __kmp_dump_debug_buffer();
6232 #endif
6233 
6234 #if KMP_OS_WINDOWS
6235   __kmp_close_console();
6236 #endif
6237 
6238   __kmp_fini_allocator();
6239 
6240 } // __kmp_internal_end_library
6241 
6242 void __kmp_internal_end_thread(int gtid_req) {
6243   int i;
6244 
6245   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6246   /* this shouldn't be a race condition because __kmp_internal_end() is the
6247    * only place to clear __kmp_serial_init */
6248   /* we'll check this later too, after we get the lock */
6249   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6250   // redundant, because the next check will work in any case.
6251   if (__kmp_global.g.g_abort) {
6252     KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6253     /* TODO abort? */
6254     return;
6255   }
6256   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6257     KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6258     return;
6259   }
6260 
6261   KMP_MB(); /* Flush all pending memory write invalidates.  */
6262 
6263   /* find out who we are and what we should do */
6264   {
6265     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6266     KA_TRACE(10,
6267              ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req));
6268     if (gtid == KMP_GTID_SHUTDOWN) {
6269       KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6270                     "already shutdown\n"));
6271       return;
6272     } else if (gtid == KMP_GTID_MONITOR) {
6273       KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6274                     "registered, or system shutdown\n"));
6275       return;
6276     } else if (gtid == KMP_GTID_DNE) {
6277       KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6278                     "shutdown\n"));
6279       return;
6280       /* we don't know who we are */
6281     } else if (KMP_UBER_GTID(gtid)) {
6282       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6283       if (__kmp_root[gtid]->r.r_active) {
6284         __kmp_global.g.g_abort = -1;
6285         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6286         KA_TRACE(10,
6287                  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6288                   gtid));
6289         return;
6290       } else {
6291         KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6292                       gtid));
6293         __kmp_unregister_root_current_thread(gtid);
6294       }
6295     } else {
6296       /* just a worker thread, let's leave */
6297       KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6298 
6299       if (gtid >= 0) {
6300         __kmp_threads[gtid]->th.th_task_team = NULL;
6301       }
6302 
6303       KA_TRACE(10,
6304                ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6305                 gtid));
6306       return;
6307     }
6308   }
6309 #if KMP_DYNAMIC_LIB
6310   if (__kmp_pause_status != kmp_hard_paused)
6311   // AC: let's not shut down the dynamic library at the exit of an uber thread,
6312   // because it is better to shut down later, in the library destructor.
6313   {
6314     KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6315     return;
6316   }
6317 #endif
6318   /* synchronize the termination process */
6319   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6320 
6321   /* have we already finished */
6322   if (__kmp_global.g.g_abort) {
6323     KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6324     /* TODO abort? */
6325     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6326     return;
6327   }
6328   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6329     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6330     return;
6331   }
6332 
6333   /* We need this lock to enforce mutex between this reading of
6334      __kmp_threads_capacity and the writing by __kmp_register_root.
6335      Alternatively, we can use a counter of roots that is atomically updated by
6336      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6337      __kmp_internal_end_*.  */
6338 
6339   /* should we finish the run-time?  are all siblings done? */
6340   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6341 
6342   for (i = 0; i < __kmp_threads_capacity; ++i) {
6343     if (KMP_UBER_GTID(i)) {
6344       KA_TRACE(
6345           10,
6346           ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6347       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6348       __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6349       return;
6350     }
6351   }
6352 
6353   /* now we can safely conduct the actual termination */
6354 
6355   __kmp_internal_end();
6356 
6357   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6358   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6359 
6360   KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6361 
6362 #ifdef DUMP_DEBUG_ON_EXIT
6363   if (__kmp_debug_buf)
6364     __kmp_dump_debug_buffer();
6365 #endif
6366 } // __kmp_internal_end_thread
6367 
6368 // -----------------------------------------------------------------------------
6369 // Library registration stuff.
6370 
6371 static long __kmp_registration_flag = 0;
6372 // Random value used to indicate library initialization.
6373 static char *__kmp_registration_str = NULL;
6374 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6375 
6376 static inline char *__kmp_reg_status_name() {
6377   /* On RHEL 3u5, if linked statically, getpid() returns different values in
6378      each thread. If registration and unregistration happen in different threads
6379      (omp_misc_other_root_exit.cpp test case), the registered_lib_env
6380      env var cannot be found, because the name will contain a different pid. */
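  // For example, a process with pid 12345 would use the name
  // "__KMP_REGISTERED_LIB_12345" (pid value illustrative).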
6381   return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6382 } // __kmp_reg_status_name
6383 
6384 void __kmp_register_library_startup(void) {
6385 
6386   char *name = __kmp_reg_status_name(); // Name of the environment variable.
6387   int done = 0;
6388   union {
6389     double dtime;
6390     long ltime;
6391   } time;
6392 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6393   __kmp_initialize_system_tick();
6394 #endif
6395   __kmp_read_system_time(&time.dtime);
6396   __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
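  // The 0xCAFE tag in the upper bits plus the low 16 bits of the current time
  // give a flag value that is unlikely to match a stale registration entry.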
6397   __kmp_registration_str =
6398       __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6399                        __kmp_registration_flag, KMP_LIBRARY_FILE);
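  // The resulting value has the form "<flag address>-<flag value>-<library file>",
  // e.g. "0x7f3a2c0010a0-cafe1234-libomp.so" (all three components illustrative);
  // it is split back apart on '-' below when a previous registration is found.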
6400 
6401   KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6402                 __kmp_registration_str));
6403 
6404   while (!done) {
6405 
6406     char *value = NULL; // Actual value of the environment variable.
6407 
6408 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6409     char *shm_name = __kmp_str_format("/%s", name);
6410     int shm_preexist = 0;
6411     char *data1;
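    // O_CREAT | O_EXCL makes creation atomic: only one process can create the
    // segment; any concurrent opener gets EEXIST and falls back to opening the
    // existing segment below.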
6412     int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6413     if ((fd1 == -1) && (errno == EEXIST)) {
6414       // file didn't open because it already exists.
6415       // try opening existing file
6416       fd1 = shm_open(shm_name, O_RDWR, 0666);
6417       if (fd1 == -1) { // file didn't open
6418         // error out here
6419         __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0),
6420                     __kmp_msg_null);
6421       } else {
6422         // able to open existing file
6423         shm_preexist = 1;
6424       }
6425     } else if (fd1 == -1) { // SHM didn't open due to an error other than
6426       // "already exists".
6427       // error out here.
6428       __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM2"), KMP_ERR(errno),
6429                   __kmp_msg_null);
6430     }
6431     if (shm_preexist == 0) {
6432       // we created SHM now set size
6433       if (ftruncate(fd1, SHM_SIZE) == -1) {
6434         // error occurred setting the size
6435         __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"),
6436                     KMP_ERR(errno), __kmp_msg_null);
6437       }
6438     }
6439     data1 =
6440         (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
6441     if (data1 == MAP_FAILED) {
6442       // failed to map shared memory
6443       __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno),
6444                   __kmp_msg_null);
6445     }
6446     if (shm_preexist == 0) { // set data to SHM, set value
6447       KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6448     }
6449     // Read value from either what we just wrote or existing file.
6450     value = __kmp_str_format("%s", data1); // read value from SHM
6451     munmap(data1, SHM_SIZE);
6452     close(fd1);
6453 #else // Windows and unix with static library
6454     // Set the environment variable, but do not overwrite an existing value.
6455     __kmp_env_set(name, __kmp_registration_str, 0);
6456     // read value to see if it got set
6457     value = __kmp_env_get(name);
6458 #endif
6459 
6460     if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6461       done = 1; // Ok, environment variable set successfully, exit the loop.
6462     } else {
6463       // Oops. The write failed. Another copy of the OpenMP RTL is in memory.
6464       // Check whether it is alive or dead.
6465       int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6466       char *tail = value;
6467       char *flag_addr_str = NULL;
6468       char *flag_val_str = NULL;
6469       char const *file_name = NULL;
6470       __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6471       __kmp_str_split(tail, '-', &flag_val_str, &tail);
6472       file_name = tail;
6473       if (tail != NULL) {
6474         long *flag_addr = 0;
6475         long flag_val = 0;
6476         KMP_SSCANF(flag_addr_str, "%p", RCAST(void**, &flag_addr));
6477         KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6478         if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6479           // First, check whether environment-encoded address is mapped into
6480           // addr space.
6481           // If so, dereference it to see if it still has the right value.
6482           if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6483             neighbor = 1;
6484           } else {
6485             // If not, then we know the other copy of the library is no longer
6486             // running.
6487             neighbor = 2;
6488           }
6489         }
6490       }
6491       switch (neighbor) {
6492       case 0: // Cannot parse environment variable -- neighbor status unknown.
6493         // Assume it is the incompatible format of a future version of the
6494         // library. Assume the other library is alive.
6495         // WARN( ... ); // TODO: Issue a warning.
6496         file_name = "unknown library";
6497         KMP_FALLTHROUGH();
6498       // Attention! Falling through to the next case is intentional.
6499       case 1: { // Neighbor is alive.
6500         // Check it is allowed.
6501         char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6502         if (!__kmp_str_match_true(duplicate_ok)) {
6503           // That's not allowed. Issue fatal error.
6504           __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6505                       KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6506         }
6507         KMP_INTERNAL_FREE(duplicate_ok);
6508         __kmp_duplicate_library_ok = 1;
6509         done = 1; // Exit the loop.
6510       } break;
6511       case 2: { // Neighbor is dead.
6512 
6513 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6514         // close shared memory.
6515         shm_unlink(shm_name); // this removes file in /dev/shm
6516 #else
6517         // Clear the variable and try to register library again.
6518         __kmp_env_unset(name);
6519 #endif
6520       } break;
6521       default: { KMP_DEBUG_ASSERT(0); } break;
6522       }
6523     }
6524     KMP_INTERNAL_FREE((void *)value);
6525 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6526     KMP_INTERNAL_FREE((void *)shm_name);
6527 #endif
6528   } // while
6529   KMP_INTERNAL_FREE((void *)name);
6530 
6531 } // func __kmp_register_library_startup
6532 
6533 void __kmp_unregister_library(void) {
6534 
6535   char *name = __kmp_reg_status_name();
6536   char *value = NULL;
6537 
6538 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6539   char *shm_name = __kmp_str_format("/%s", name);
6540   int fd1 = shm_open(shm_name, O_RDONLY, 0666);
6541   if (fd1 == -1) {
6542     // file did not open. return.
6543     return;
6544   }
6545   char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6546   if (data1 != MAP_FAILED) {
6547     value = __kmp_str_format("%s", data1); // read value from SHM
6548     munmap(data1, SHM_SIZE);
6549   }
6550   close(fd1);
6551 #else
6552   value = __kmp_env_get(name);
6553 #endif
6554 
6555   KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6556   KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6557   if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6558 //  Ok, this is our variable. Delete it.
6559 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6560     shm_unlink(shm_name); // this removes file in /dev/shm
6561 #else
6562     __kmp_env_unset(name);
6563 #endif
6564   }
6565 
6566 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6567   KMP_INTERNAL_FREE(shm_name);
6568 #endif
6569 
6570   KMP_INTERNAL_FREE(__kmp_registration_str);
6571   KMP_INTERNAL_FREE(value);
6572   KMP_INTERNAL_FREE(name);
6573 
6574   __kmp_registration_flag = 0;
6575   __kmp_registration_str = NULL;
6576 
6577 } // __kmp_unregister_library
6578 
6579 // End of Library registration stuff.
6580 // -----------------------------------------------------------------------------
6581 
6582 #if KMP_MIC_SUPPORTED
6583 
6584 static void __kmp_check_mic_type() {
6585   kmp_cpuid_t cpuid_state = {0};
6586   kmp_cpuid_t *cs_p = &cpuid_state;
6587   __kmp_x86_cpuid(1, 0, cs_p);
6588   // We don't support mic1 at the moment
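  // The masks below check the family/model fields of CPUID(1).EAX: family 0x0B,
  // model 1 identifies KNC (mic2); family 6, model 0x57 identifies KNL (mic3).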
6589   if ((cs_p->eax & 0xff0) == 0xB10) {
6590     __kmp_mic_type = mic2;
6591   } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6592     __kmp_mic_type = mic3;
6593   } else {
6594     __kmp_mic_type = non_mic;
6595   }
6596 }
6597 
6598 #endif /* KMP_MIC_SUPPORTED */
6599 
6600 static void __kmp_do_serial_initialize(void) {
6601   int i, gtid;
6602   int size;
6603 
6604   KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6605 
6606   KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6607   KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6608   KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6609   KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6610   KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6611 
6612 #if OMPT_SUPPORT
6613   ompt_pre_init();
6614 #endif
6615 
6616   __kmp_validate_locks();
6617 
6618   /* Initialize internal memory allocator */
6619   __kmp_init_allocator();
6620 
6621   /* Register the library startup via an environment variable and check to see
6622      whether another copy of the library is already registered. */
6623 
6624   __kmp_register_library_startup();
6625 
6626   /* TODO reinitialization of library */
6627   if (TCR_4(__kmp_global.g.g_done)) {
6628     KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6629   }
6630 
6631   __kmp_global.g.g_abort = 0;
6632   TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6633 
6634 /* initialize the locks */
6635 #if KMP_USE_ADAPTIVE_LOCKS
6636 #if KMP_DEBUG_ADAPTIVE_LOCKS
6637   __kmp_init_speculative_stats();
6638 #endif
6639 #endif
6640 #if KMP_STATS_ENABLED
6641   __kmp_stats_init();
6642 #endif
6643   __kmp_init_lock(&__kmp_global_lock);
6644   __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6645   __kmp_init_lock(&__kmp_debug_lock);
6646   __kmp_init_atomic_lock(&__kmp_atomic_lock);
6647   __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6648   __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6649   __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6650   __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6651   __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6652   __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6653   __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6654   __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6655   __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6656   __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6657   __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6658   __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6659   __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6660   __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6661 #if KMP_USE_MONITOR
6662   __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6663 #endif
6664   __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6665 
6666   /* conduct initialization and initial setup of configuration */
6667 
6668   __kmp_runtime_initialize();
6669 
6670 #if KMP_MIC_SUPPORTED
6671   __kmp_check_mic_type();
6672 #endif
6673 
6674 // Some global variable initialization moved here from kmp_env_initialize()
6675 #ifdef KMP_DEBUG
6676   kmp_diag = 0;
6677 #endif
6678   __kmp_abort_delay = 0;
6679 
6680   // From __kmp_init_dflt_team_nth()
6681   /* assume the entire machine will be used */
6682   __kmp_dflt_team_nth_ub = __kmp_xproc;
6683   if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6684     __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6685   }
6686   if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6687     __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6688   }
6689   __kmp_max_nth = __kmp_sys_max_nth;
6690   __kmp_cg_max_nth = __kmp_sys_max_nth;
6691   __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6692   if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6693     __kmp_teams_max_nth = __kmp_sys_max_nth;
6694   }
6695 
6696   // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6697   // part
6698   __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6699 #if KMP_USE_MONITOR
6700   __kmp_monitor_wakeups =
6701       KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6702   __kmp_bt_intervals =
6703       KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6704 #endif
6705   // From "KMP_LIBRARY" part of __kmp_env_initialize()
6706   __kmp_library = library_throughput;
6707   // From KMP_SCHEDULE initialization
6708   __kmp_static = kmp_sch_static_balanced;
6709 // AC: do not use analytical here, because it is non-monotonous
6710 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6711 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6712 // need to repeat assignment
6713 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6714 // bit control and barrier method control parts
6715 #if KMP_FAST_REDUCTION_BARRIER
6716 #define kmp_reduction_barrier_gather_bb ((int)1)
6717 #define kmp_reduction_barrier_release_bb ((int)1)
6718 #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6719 #define kmp_reduction_barrier_release_pat bp_hyper_bar
6720 #endif // KMP_FAST_REDUCTION_BARRIER
6721   for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6722     __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6723     __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6724     __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6725     __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6726 #if KMP_FAST_REDUCTION_BARRIER
6727     if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
6728       // lin_64 ): hyper,1
6729       __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6730       __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6731       __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6732       __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6733     }
6734 #endif // KMP_FAST_REDUCTION_BARRIER
6735   }
6736 #if KMP_FAST_REDUCTION_BARRIER
6737 #undef kmp_reduction_barrier_release_pat
6738 #undef kmp_reduction_barrier_gather_pat
6739 #undef kmp_reduction_barrier_release_bb
6740 #undef kmp_reduction_barrier_gather_bb
6741 #endif // KMP_FAST_REDUCTION_BARRIER
6742 #if KMP_MIC_SUPPORTED
6743   if (__kmp_mic_type == mic2) { // KNC
6744     // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6745     __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6746     __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6747         1; // forkjoin release
6748     __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6749     __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6750   }
6751 #if KMP_FAST_REDUCTION_BARRIER
6752   if (__kmp_mic_type == mic2) { // KNC
6753     __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6754     __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6755   }
6756 #endif // KMP_FAST_REDUCTION_BARRIER
6757 #endif // KMP_MIC_SUPPORTED
6758 
6759 // From KMP_CHECKS initialization
6760 #ifdef KMP_DEBUG
6761   __kmp_env_checks = TRUE; /* development versions have the extra checks */
6762 #else
6763   __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6764 #endif
6765 
6766   // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6767   __kmp_foreign_tp = TRUE;
6768 
6769   __kmp_global.g.g_dynamic = FALSE;
6770   __kmp_global.g.g_dynamic_mode = dynamic_default;
6771 
6772   __kmp_env_initialize(NULL);
6773 
6774 // Print all messages in message catalog for testing purposes.
6775 #ifdef KMP_DEBUG
6776   char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6777   if (__kmp_str_match_true(val)) {
6778     kmp_str_buf_t buffer;
6779     __kmp_str_buf_init(&buffer);
6780     __kmp_i18n_dump_catalog(&buffer);
6781     __kmp_printf("%s", buffer.str);
6782     __kmp_str_buf_free(&buffer);
6783   }
6784   __kmp_env_free(&val);
6785 #endif
6786 
6787   __kmp_threads_capacity =
6788       __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6789   // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6790   __kmp_tp_capacity = __kmp_default_tp_capacity(
6791       __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6792 
6793   // If the library is shut down properly, both pools must be NULL. Just in
6794   // case, set them to NULL -- some memory may leak, but subsequent code will
6795   // work even if pools are not freed.
6796   KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6797   KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6798   KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6799   __kmp_thread_pool = NULL;
6800   __kmp_thread_pool_insert_pt = NULL;
6801   __kmp_team_pool = NULL;
6802 
6803   /* Allocate all of the variable sized records */
6804   /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6805    * expandable */
6806   /* Since allocation is cache-aligned, just add extra padding at the end */
6807   size =
6808       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6809       CACHE_LINE;
6810   __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6811   __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6812                                sizeof(kmp_info_t *) * __kmp_threads_capacity);
6813 
6814   /* init thread counts */
6815   KMP_DEBUG_ASSERT(__kmp_all_nth ==
6816                    0); // Asserts fail if the library is reinitializing and
6817   KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
6818   __kmp_all_nth = 0;
6819   __kmp_nth = 0;
6820 
6821   /* setup the uber master thread and hierarchy */
6822   gtid = __kmp_register_root(TRUE);
6823   KA_TRACE(10, ("__kmp_do_serial_initialize  T#%d\n", gtid));
6824   KMP_ASSERT(KMP_UBER_GTID(gtid));
6825   KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6826 
6827   KMP_MB(); /* Flush all pending memory write invalidates.  */
6828 
6829   __kmp_common_initialize();
6830 
6831 #if KMP_OS_UNIX
  /* register the atfork handlers (prepare/parent/child) */
6833   __kmp_register_atfork();
6834 #endif
6835 
6836 #if !KMP_DYNAMIC_LIB
6837   {
6838     /* Invoke the exit handler when the program finishes, only for static
6839        library. For dynamic library, we already have _fini and DllMain. */
6840     int rc = atexit(__kmp_internal_end_atexit);
6841     if (rc != 0) {
6842       __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6843                   __kmp_msg_null);
6844     }
6845   }
6846 #endif
6847 
6848 #if KMP_HANDLE_SIGNALS
6849 #if KMP_OS_UNIX
6850   /* NOTE: make sure that this is called before the user installs their own
     signal handlers so that the user handlers are called first. This way they
6852      can return false, not call our handler, avoid terminating the library, and
6853      continue execution where they left off. */
6854   __kmp_install_signals(FALSE);
6855 #endif /* KMP_OS_UNIX */
6856 #if KMP_OS_WINDOWS
6857   __kmp_install_signals(TRUE);
6858 #endif /* KMP_OS_WINDOWS */
6859 #endif
6860 
6861   /* we have finished the serial initialization */
6862   __kmp_init_counter++;
6863 
6864   __kmp_init_serial = TRUE;
6865 
6866   if (__kmp_settings) {
6867     __kmp_env_print();
6868   }
6869 
6870   if (__kmp_display_env || __kmp_display_env_verbose) {
6871     __kmp_env_print_2();
6872   }
6873 
6874 #if OMPT_SUPPORT
6875   ompt_post_init();
6876 #endif
6877 
6878   KMP_MB();
6879 
6880   KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
6881 }
6882 
6883 void __kmp_serial_initialize(void) {
6884   if (__kmp_init_serial) {
6885     return;
6886   }
6887   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6888   if (__kmp_init_serial) {
6889     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6890     return;
6891   }
6892   __kmp_do_serial_initialize();
6893   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6894 }
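
/* The __kmp_*_initialize wrappers (this one and the middle/parallel variants
   below) all follow the same double-checked locking idiom, sketched here as
   illustrative pseudocode rather than a new entry point:

     if (already_initialized)                          // fast path, no lock
       return;
     __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
     if (!already_initialized)                         // re-check under lock
       do_the_initialization();  // sets the init flag before returning
     __kmp_release_bootstrap_lock(&__kmp_initz_lock);

   The re-check under the lock is what makes concurrent first calls safe. */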
6895 
6896 static void __kmp_do_middle_initialize(void) {
6897   int i, j;
6898   int prev_dflt_team_nth;
6899 
6900   if (!__kmp_init_serial) {
6901     __kmp_do_serial_initialize();
6902   }
6903 
  KA_TRACE(10, ("__kmp_do_middle_initialize: enter\n"));
6905 
6906   // Save the previous value for the __kmp_dflt_team_nth so that
6907   // we can avoid some reinitialization if it hasn't changed.
6908   prev_dflt_team_nth = __kmp_dflt_team_nth;
6909 
6910 #if KMP_AFFINITY_SUPPORTED
6911   // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6912   // number of cores on the machine.
6913   __kmp_affinity_initialize();
6914 
6915   // Run through the __kmp_threads array and set the affinity mask
6916   // for each root thread that is currently registered with the RTL.
6917   for (i = 0; i < __kmp_threads_capacity; i++) {
6918     if (TCR_PTR(__kmp_threads[i]) != NULL) {
6919       __kmp_affinity_set_init_mask(i, TRUE);
6920     }
6921   }
6922 #endif /* KMP_AFFINITY_SUPPORTED */
6923 
6924   KMP_ASSERT(__kmp_xproc > 0);
6925   if (__kmp_avail_proc == 0) {
6926     __kmp_avail_proc = __kmp_xproc;
6927   }
6928 
6929   // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
6930   // correct them now
6931   j = 0;
6932   while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
6933     __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
6934         __kmp_avail_proc;
6935     j++;
6936   }
6937 
6938   if (__kmp_dflt_team_nth == 0) {
6939 #ifdef KMP_DFLT_NTH_CORES
6940     // Default #threads = #cores
6941     __kmp_dflt_team_nth = __kmp_ncores;
6942     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6943                   "__kmp_ncores (%d)\n",
6944                   __kmp_dflt_team_nth));
6945 #else
6946     // Default #threads = #available OS procs
6947     __kmp_dflt_team_nth = __kmp_avail_proc;
6948     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6949                   "__kmp_avail_proc(%d)\n",
6950                   __kmp_dflt_team_nth));
6951 #endif /* KMP_DFLT_NTH_CORES */
6952   }
6953 
6954   if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
6955     __kmp_dflt_team_nth = KMP_MIN_NTH;
6956   }
6957   if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
6958     __kmp_dflt_team_nth = __kmp_sys_max_nth;
6959   }
6960 
6961   // There's no harm in continuing if the following check fails,
6962   // but it indicates an error in the previous logic.
6963   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
6964 
6965   if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
6966     // Run through the __kmp_threads array and set the num threads icv for each
6967     // root thread that is currently registered with the RTL (which has not
6968     // already explicitly set its nthreads-var with a call to
6969     // omp_set_num_threads()).
6970     for (i = 0; i < __kmp_threads_capacity; i++) {
6971       kmp_info_t *thread = __kmp_threads[i];
6972       if (thread == NULL)
6973         continue;
6974       if (thread->th.th_current_task->td_icvs.nproc != 0)
6975         continue;
6976 
6977       set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
6978     }
6979   }
6980   KA_TRACE(
6981       20,
6982       ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
6983        __kmp_dflt_team_nth));
6984 
6985 #ifdef KMP_ADJUST_BLOCKTIME
6986   /* Adjust blocktime to zero if necessary  now that __kmp_avail_proc is set */
6987   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6988     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6989     if (__kmp_nth > __kmp_avail_proc) {
6990       __kmp_zero_bt = TRUE;
6991     }
6992   }
6993 #endif /* KMP_ADJUST_BLOCKTIME */
6994 
6995   /* we have finished middle initialization */
6996   TCW_SYNC_4(__kmp_init_middle, TRUE);
6997 
6998   KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
6999 }
7000 
7001 void __kmp_middle_initialize(void) {
7002   if (__kmp_init_middle) {
7003     return;
7004   }
7005   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7006   if (__kmp_init_middle) {
7007     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7008     return;
7009   }
7010   __kmp_do_middle_initialize();
7011   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7012 }
7013 
7014 void __kmp_parallel_initialize(void) {
7015   int gtid = __kmp_entry_gtid(); // this might be a new root
7016 
7017   /* synchronize parallel initialization (for sibling) */
7018   if (TCR_4(__kmp_init_parallel))
7019     return;
7020   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7021   if (TCR_4(__kmp_init_parallel)) {
7022     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7023     return;
7024   }
7025 
7026   /* TODO reinitialization after we have already shut down */
7027   if (TCR_4(__kmp_global.g.g_done)) {
7028     KA_TRACE(
7029         10,
7030         ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7031     __kmp_infinite_loop();
7032   }
7033 
  /* jc: The lock __kmp_initz_lock is already held, so calling
     __kmp_serial_initialize or __kmp_middle_initialize here would deadlock.
     Call __kmp_do_middle_initialize directly instead; it runs
     __kmp_do_serial_initialize first if that is still needed. */
7037   if (!__kmp_init_middle) {
7038     __kmp_do_middle_initialize();
7039   }
7040   __kmp_resume_if_hard_paused();
7041 
7042   /* begin initialization */
7043   KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7044   KMP_ASSERT(KMP_UBER_GTID(gtid));
7045 
7046 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7047   // Save the FP control regs.
7048   // Worker threads will set theirs to these values at thread startup.
7049   __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7050   __kmp_store_mxcsr(&__kmp_init_mxcsr);
7051   __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7052 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7053 
7054 #if KMP_OS_UNIX
7055 #if KMP_HANDLE_SIGNALS
7056   /*  must be after __kmp_serial_initialize  */
7057   __kmp_install_signals(TRUE);
7058 #endif
7059 #endif
7060 
7061   __kmp_suspend_initialize();
7062 
7063 #if defined(USE_LOAD_BALANCE)
7064   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7065     __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7066   }
7067 #else
7068   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7069     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7070   }
7071 #endif
7072 
7073   if (__kmp_version) {
7074     __kmp_print_version_2();
7075   }
7076 
7077   /* we have finished parallel initialization */
7078   TCW_SYNC_4(__kmp_init_parallel, TRUE);
7079 
7080   KMP_MB();
7081   KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7082 
7083   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7084 }
7085 
7086 /* ------------------------------------------------------------------------ */
7087 
7088 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7089                                    kmp_team_t *team) {
7090   kmp_disp_t *dispatch;
7091 
7092   KMP_MB();
7093 
7094   /* none of the threads have encountered any constructs, yet. */
7095   this_thr->th.th_local.this_construct = 0;
7096 #if KMP_CACHE_MANAGE
7097   KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7098 #endif /* KMP_CACHE_MANAGE */
7099   dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7100   KMP_DEBUG_ASSERT(dispatch);
7101   KMP_DEBUG_ASSERT(team->t.t_dispatch);
7102   // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7103   // this_thr->th.th_info.ds.ds_tid ] );
7104 
7105   dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7106   dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7107   if (__kmp_env_consistency_check)
7108     __kmp_push_parallel(gtid, team->t.t_ident);
7109 
7110   KMP_MB(); /* Flush all pending memory write invalidates.  */
7111 }
7112 
7113 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7114                                   kmp_team_t *team) {
7115   if (__kmp_env_consistency_check)
7116     __kmp_pop_parallel(gtid, team->t.t_ident);
7117 
7118   __kmp_finish_implicit_task(this_thr);
7119 }
7120 
7121 int __kmp_invoke_task_func(int gtid) {
7122   int rc;
7123   int tid = __kmp_tid_from_gtid(gtid);
7124   kmp_info_t *this_thr = __kmp_threads[gtid];
7125   kmp_team_t *team = this_thr->th.th_team;
7126 
7127   __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7128 #if USE_ITT_BUILD
7129   if (__itt_stack_caller_create_ptr) {
7130     __kmp_itt_stack_callee_enter(
7131         (__itt_caller)
7132             team->t.t_stack_id); // inform ittnotify about entering user's code
7133   }
7134 #endif /* USE_ITT_BUILD */
7135 #if INCLUDE_SSC_MARKS
7136   SSC_MARK_INVOKING();
7137 #endif
7138 
7139 #if OMPT_SUPPORT
7140   void *dummy;
7141   void **exit_frame_p;
7142   ompt_data_t *my_task_data;
7143   ompt_data_t *my_parallel_data;
7144   int ompt_team_size;
7145 
7146   if (ompt_enabled.enabled) {
    exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
                         .ompt_task_info.frame.exit_frame.ptr);
7149   } else {
7150     exit_frame_p = &dummy;
7151   }
7152 
7153   my_task_data =
7154       &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7155   my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7156   if (ompt_enabled.ompt_callback_implicit_task) {
7157     ompt_team_size = team->t.t_nproc;
7158     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7159         ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7160         __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7161     OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7162   }
7163 #endif
7164 
7165 #if KMP_STATS_ENABLED
7166   stats_state_e previous_state = KMP_GET_THREAD_STATE();
7167   if (previous_state == stats_state_e::TEAMS_REGION) {
7168     KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7169   } else {
7170     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7171   }
7172   KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7173 #endif
7174 
7175   rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7176                               tid, (int)team->t.t_argc, (void **)team->t.t_argv
7177 #if OMPT_SUPPORT
7178                               ,
7179                               exit_frame_p
7180 #endif
7181                               );
7182 #if OMPT_SUPPORT
7183   *exit_frame_p = NULL;
  this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7185 #endif
7186 
7187 #if KMP_STATS_ENABLED
7188   if (previous_state == stats_state_e::TEAMS_REGION) {
7189     KMP_SET_THREAD_STATE(previous_state);
7190   }
7191   KMP_POP_PARTITIONED_TIMER();
7192 #endif
7193 
7194 #if USE_ITT_BUILD
7195   if (__itt_stack_caller_create_ptr) {
7196     __kmp_itt_stack_callee_leave(
7197         (__itt_caller)
7198             team->t.t_stack_id); // inform ittnotify about leaving user's code
7199   }
7200 #endif /* USE_ITT_BUILD */
7201   __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7202 
7203   return rc;
7204 }
7205 
7206 void __kmp_teams_master(int gtid) {
7207   // This routine is called by all master threads in teams construct
7208   kmp_info_t *thr = __kmp_threads[gtid];
7209   kmp_team_t *team = thr->th.th_team;
7210   ident_t *loc = team->t.t_ident;
7211   thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7212   KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7213   KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7214   KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7215                 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7216 
7217   // This thread is a new CG root.  Set up the proper variables.
7218   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7219   tmp->cg_root = thr; // Make thr the CG root
7220   // Init to thread limit that was stored when league masters were forked
7221   tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7222   tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7223   KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7224                  " cg_nthreads to 1\n",
7225                  thr, tmp));
7226   tmp->up = thr->th.th_cg_roots;
7227   thr->th.th_cg_roots = tmp;
7228 
// Launch the league of teams now, but do not let the workers execute
// (they hang on the fork barrier until the next parallel region)
7231 #if INCLUDE_SSC_MARKS
7232   SSC_MARK_FORKING();
7233 #endif
7234   __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7235                   (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7236                   VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7237 #if INCLUDE_SSC_MARKS
7238   SSC_MARK_JOINING();
7239 #endif
7240   // If the team size was reduced from the limit, set it to the new size
7241   if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7242     thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7243   // AC: last parameter "1" eliminates join barrier which won't work because
7244   // worker threads are in a fork barrier waiting for more parallel regions
7245   __kmp_join_call(loc, gtid
7246 #if OMPT_SUPPORT
7247                   ,
7248                   fork_context_intel
7249 #endif
7250                   ,
7251                   1);
7252 }
7253 
7254 int __kmp_invoke_teams_master(int gtid) {
7255   kmp_info_t *this_thr = __kmp_threads[gtid];
7256   kmp_team_t *team = this_thr->th.th_team;
7257 #if KMP_DEBUG
7258   if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7259     KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7260                      (void *)__kmp_teams_master);
7261 #endif
7262   __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7263 #if OMPT_SUPPORT
7264   int tid = __kmp_tid_from_gtid(gtid);
7265   ompt_data_t *task_data =
7266       &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7267   ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7268   if (ompt_enabled.ompt_callback_implicit_task) {
7269     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7270         ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7271         ompt_task_initial);
7272     OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7273   }
7274 #endif
7275   __kmp_teams_master(gtid);
7276 #if OMPT_SUPPORT
7277   this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7278 #endif
7279   __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7280   return 1;
7281 }
7282 
/* This sets the requested number of threads for the next parallel region
   encountered by this team. Since this should be enclosed in the fork/join
   critical section, it should avoid race conditions with asymmetrical nested
   parallelism. */
7287 
7288 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7289   kmp_info_t *thr = __kmp_threads[gtid];
7290 
7291   if (num_threads > 0)
7292     thr->th.th_set_nproc = num_threads;
7293 }
7294 
7295 /* this sets the requested number of teams for the teams region and/or
7296    the number of threads for the next parallel region encountered  */
7297 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7298                           int num_threads) {
7299   kmp_info_t *thr = __kmp_threads[gtid];
7300   KMP_DEBUG_ASSERT(num_teams >= 0);
7301   KMP_DEBUG_ASSERT(num_threads >= 0);
7302 
7303   if (num_teams == 0)
7304     num_teams = 1; // default number of teams is 1.
  if (num_teams > __kmp_teams_max_nth) { // were too many teams requested?
7306     if (!__kmp_reserve_warn) {
7307       __kmp_reserve_warn = 1;
7308       __kmp_msg(kmp_ms_warning,
7309                 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7310                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7311     }
7312     num_teams = __kmp_teams_max_nth;
7313   }
7314   // Set number of teams (number of threads in the outer "parallel" of the
7315   // teams)
7316   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7317 
7318   // Remember the number of threads for inner parallel regions
7319   if (!TCR_4(__kmp_init_middle))
7320     __kmp_middle_initialize(); // get internal globals calculated
7321   KMP_DEBUG_ASSERT(__kmp_avail_proc);
7322   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7323   if (num_threads == 0) {
7324     num_threads = __kmp_avail_proc / num_teams;
    // adjust num_threads w/o warning as it is not a user setting
    // num_threads = min(num_threads, nthreads-var, thread-limit-var)
    // no thread_limit clause specified - do not change thread-limit-var ICV
7328     if (num_threads > __kmp_dflt_team_nth) {
7329       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7330     }
7331     if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7332       num_threads = thr->th.th_current_task->td_icvs.thread_limit;
    } // prevent the team size from exceeding thread-limit-var
7334     if (num_teams * num_threads > __kmp_teams_max_nth) {
7335       num_threads = __kmp_teams_max_nth / num_teams;
7336     }
7337   } else {
7338     // This thread will be the master of the league masters
7339     // Store new thread limit; old limit is saved in th_cg_roots list
7340     thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7341     // num_threads = min(num_threads, nthreads-var)
7342     if (num_threads > __kmp_dflt_team_nth) {
7343       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7344     }
7345     if (num_teams * num_threads > __kmp_teams_max_nth) {
7346       int new_threads = __kmp_teams_max_nth / num_teams;
      if (!__kmp_reserve_warn) {
        // The user asked for more threads than KMP_TEAMS_THREAD_LIMIT allows.
        __kmp_reserve_warn = 1;
7349         __kmp_msg(kmp_ms_warning,
7350                   KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7351                   KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7352       }
7353       num_threads = new_threads;
7354     }
7355   }
7356   thr->th.th_teams_size.nth = num_threads;
7357 }
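
// Worked example for the num_threads derivation above (all numbers are
// hypothetical): with no thread_limit clause (num_threads == 0),
// __kmp_avail_proc = 16, num_teams = 4, __kmp_dflt_team_nth = 16,
// thread-limit-var = 8 and __kmp_teams_max_nth = 32, the computation starts
// from 16 / 4 = 4, which is already within both nthreads-var and
// thread-limit-var, and 4 teams * 4 threads = 16 <= 32, so th_teams_size.nth
// ends up as 4 threads per team.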
7358 
7359 // Set the proc_bind var to use in the following parallel region.
7360 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7361   kmp_info_t *thr = __kmp_threads[gtid];
7362   thr->th.th_set_proc_bind = proc_bind;
7363 }
7364 
7365 /* Launch the worker threads into the microtask. */
7366 
7367 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7368   kmp_info_t *this_thr = __kmp_threads[gtid];
7369 
7370 #ifdef KMP_DEBUG
7371   int f;
7372 #endif /* KMP_DEBUG */
7373 
7374   KMP_DEBUG_ASSERT(team);
7375   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7376   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7377   KMP_MB(); /* Flush all pending memory write invalidates.  */
7378 
7379   team->t.t_construct = 0; /* no single directives seen yet */
7380   team->t.t_ordered.dt.t_value =
7381       0; /* thread 0 enters the ordered section first */
7382 
7383   /* Reset the identifiers on the dispatch buffer */
7384   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7385   if (team->t.t_max_nproc > 1) {
7386     int i;
7387     for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7388       team->t.t_disp_buffer[i].buffer_index = i;
7389       team->t.t_disp_buffer[i].doacross_buf_idx = i;
7390     }
7391   } else {
7392     team->t.t_disp_buffer[0].buffer_index = 0;
7393     team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7394   }
7395 
7396   KMP_MB(); /* Flush all pending memory write invalidates.  */
7397   KMP_ASSERT(this_thr->th.th_team == team);
7398 
7399 #ifdef KMP_DEBUG
7400   for (f = 0; f < team->t.t_nproc; f++) {
7401     KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7402                      team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7403   }
7404 #endif /* KMP_DEBUG */
7405 
7406   /* release the worker threads so they may begin working */
7407   __kmp_fork_barrier(gtid, 0);
7408 }
7409 
7410 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7411   kmp_info_t *this_thr = __kmp_threads[gtid];
7412 
7413   KMP_DEBUG_ASSERT(team);
7414   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7415   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7416   KMP_MB(); /* Flush all pending memory write invalidates.  */
7417 
7418 /* Join barrier after fork */
7419 
7420 #ifdef KMP_DEBUG
7421   if (__kmp_threads[gtid] &&
7422       __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7423     __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7424                  __kmp_threads[gtid]);
7425     __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7426                  "team->t.t_nproc=%d\n",
7427                  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7428                  team->t.t_nproc);
7429     __kmp_print_structure();
7430   }
7431   KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7432                    __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7433 #endif /* KMP_DEBUG */
7434 
7435   __kmp_join_barrier(gtid); /* wait for everyone */
7436 #if OMPT_SUPPORT
7437   if (ompt_enabled.enabled &&
7438       this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
7439     int ds_tid = this_thr->th.th_info.ds.ds_tid;
7440     ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7441     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
7442 #if OMPT_OPTIONAL
7443     void *codeptr = NULL;
7444     if (KMP_MASTER_TID(ds_tid) &&
7445         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7446          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7447       codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7448 
7449     if (ompt_enabled.ompt_callback_sync_region_wait) {
7450       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7451           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7452           codeptr);
7453     }
7454     if (ompt_enabled.ompt_callback_sync_region) {
7455       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7456           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7457           codeptr);
7458     }
7459 #endif
7460     if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_end, NULL, task_data, 0, ds_tid,
          ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7463     }
7464   }
7465 #endif
7466 
7467   KMP_MB(); /* Flush all pending memory write invalidates.  */
7468   KMP_ASSERT(this_thr->th.th_team == team);
7469 }
7470 
7471 /* ------------------------------------------------------------------------ */
7472 
7473 #ifdef USE_LOAD_BALANCE
7474 
// Return the number of worker threads actively spinning in the hot team, if
// we are at the outermost level of parallelism.  Otherwise, return 0.
7477 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7478   int i;
7479   int retval;
7480   kmp_team_t *hot_team;
7481 
7482   if (root->r.r_active) {
7483     return 0;
7484   }
7485   hot_team = root->r.r_hot_team;
7486   if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7487     return hot_team->t.t_nproc - 1; // Don't count master thread
7488   }
7489 
7490   // Skip the master thread - it is accounted for elsewhere.
7491   retval = 0;
7492   for (i = 1; i < hot_team->t.t_nproc; i++) {
7493     if (hot_team->t.t_threads[i]->th.th_active) {
7494       retval++;
7495     }
7496   }
7497   return retval;
7498 }
7499 
7500 // Perform an automatic adjustment to the number of
7501 // threads used by the next parallel region.
7502 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7503   int retval;
7504   int pool_active;
7505   int hot_team_active;
7506   int team_curr_active;
7507   int system_active;
7508 
7509   KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7510                 set_nproc));
7511   KMP_DEBUG_ASSERT(root);
7512   KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7513                        ->th.th_current_task->td_icvs.dynamic == TRUE);
7514   KMP_DEBUG_ASSERT(set_nproc > 1);
7515 
7516   if (set_nproc == 1) {
7517     KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7518     return 1;
7519   }
7520 
7521   // Threads that are active in the thread pool, active in the hot team for this
7522   // particular root (if we are at the outer par level), and the currently
7523   // executing thread (to become the master) are available to add to the new
7524   // team, but are currently contributing to the system load, and must be
7525   // accounted for.
7526   pool_active = __kmp_thread_pool_active_nth;
7527   hot_team_active = __kmp_active_hot_team_nproc(root);
7528   team_curr_active = pool_active + hot_team_active + 1;
7529 
7530   // Check the system load.
7531   system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7532   KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7533                 "hot team active = %d\n",
7534                 system_active, pool_active, hot_team_active));
7535 
7536   if (system_active < 0) {
7537     // There was an error reading the necessary info from /proc, so use the
7538     // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7539     // = dynamic_thread_limit, we shouldn't wind up getting back here.
7540     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7541     KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7542 
7543     // Make this call behave like the thread limit algorithm.
7544     retval = __kmp_avail_proc - __kmp_nth +
7545              (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7546     if (retval > set_nproc) {
7547       retval = set_nproc;
7548     }
7549     if (retval < KMP_MIN_NTH) {
7550       retval = KMP_MIN_NTH;
7551     }
7552 
7553     KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7554                   retval));
7555     return retval;
7556   }
7557 
  // There is a slight delay in the load balance algorithm in detecting newly
  // running procs. The real system load at this instant should be at least as
  // large as the number of active OpenMP threads that are available to add to
  // the team.
7561   if (system_active < team_curr_active) {
7562     system_active = team_curr_active;
7563   }
7564   retval = __kmp_avail_proc - system_active + team_curr_active;
7565   if (retval > set_nproc) {
7566     retval = set_nproc;
7567   }
7568   if (retval < KMP_MIN_NTH) {
7569     retval = KMP_MIN_NTH;
7570   }
7571 
7572   KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7573   return retval;
7574 } // __kmp_load_balance_nproc()
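
// Worked example for the formula above (hypothetical numbers): with
// __kmp_avail_proc = 8, pool_active = 2 and hot_team_active = 1,
// team_curr_active is 2 + 1 + 1 = 4.  If __kmp_get_load_balance() reports
// system_active = 6, then retval = 8 - 6 + 4 = 6, which is finally clamped
// into the range [KMP_MIN_NTH, set_nproc].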
7575 
7576 #endif /* USE_LOAD_BALANCE */
7577 
7578 /* ------------------------------------------------------------------------ */
7579 
7580 /* NOTE: this is called with the __kmp_init_lock held */
7581 void __kmp_cleanup(void) {
7582   int f;
7583 
7584   KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7585 
7586   if (TCR_4(__kmp_init_parallel)) {
7587 #if KMP_HANDLE_SIGNALS
7588     __kmp_remove_signals();
7589 #endif
7590     TCW_4(__kmp_init_parallel, FALSE);
7591   }
7592 
7593   if (TCR_4(__kmp_init_middle)) {
7594 #if KMP_AFFINITY_SUPPORTED
7595     __kmp_affinity_uninitialize();
7596 #endif /* KMP_AFFINITY_SUPPORTED */
7597     __kmp_cleanup_hierarchy();
7598     TCW_4(__kmp_init_middle, FALSE);
7599   }
7600 
7601   KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7602 
7603   if (__kmp_init_serial) {
7604     __kmp_runtime_destroy();
7605     __kmp_init_serial = FALSE;
7606   }
7607 
7608   __kmp_cleanup_threadprivate_caches();
7609 
7610   for (f = 0; f < __kmp_threads_capacity; f++) {
7611     if (__kmp_root[f] != NULL) {
7612       __kmp_free(__kmp_root[f]);
7613       __kmp_root[f] = NULL;
7614     }
7615   }
7616   __kmp_free(__kmp_threads);
  // __kmp_threads and __kmp_root were allocated at once, as a single block, so
  // there is no need to free __kmp_root separately.
7619   __kmp_threads = NULL;
7620   __kmp_root = NULL;
7621   __kmp_threads_capacity = 0;
7622 
7623 #if KMP_USE_DYNAMIC_LOCK
7624   __kmp_cleanup_indirect_user_locks();
7625 #else
7626   __kmp_cleanup_user_locks();
7627 #endif
7628 
7629 #if KMP_AFFINITY_SUPPORTED
7630   KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7631   __kmp_cpuinfo_file = NULL;
7632 #endif /* KMP_AFFINITY_SUPPORTED */
7633 
7634 #if KMP_USE_ADAPTIVE_LOCKS
7635 #if KMP_DEBUG_ADAPTIVE_LOCKS
7636   __kmp_print_speculative_stats();
7637 #endif
7638 #endif
7639   KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7640   __kmp_nested_nth.nth = NULL;
7641   __kmp_nested_nth.size = 0;
7642   __kmp_nested_nth.used = 0;
7643   KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7644   __kmp_nested_proc_bind.bind_types = NULL;
7645   __kmp_nested_proc_bind.size = 0;
7646   __kmp_nested_proc_bind.used = 0;
7647   if (__kmp_affinity_format) {
7648     KMP_INTERNAL_FREE(__kmp_affinity_format);
7649     __kmp_affinity_format = NULL;
7650   }
7651 
7652   __kmp_i18n_catclose();
7653 
7654 #if KMP_USE_HIER_SCHED
7655   __kmp_hier_scheds.deallocate();
7656 #endif
7657 
7658 #if KMP_STATS_ENABLED
7659   __kmp_stats_fini();
7660 #endif
7661 
7662   KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7663 }
7664 
7665 /* ------------------------------------------------------------------------ */
7666 
7667 int __kmp_ignore_mppbeg(void) {
7668   char *env;
7669 
7670   if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7671     if (__kmp_str_match_false(env))
7672       return FALSE;
7673   }
7674   // By default __kmpc_begin() is no-op.
7675   return TRUE;
7676 }
7677 
7678 int __kmp_ignore_mppend(void) {
7679   char *env;
7680 
7681   if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7682     if (__kmp_str_match_false(env))
7683       return FALSE;
7684   }
7685   // By default __kmpc_end() is no-op.
7686   return TRUE;
7687 }
7688 
7689 void __kmp_internal_begin(void) {
7690   int gtid;
7691   kmp_root_t *root;
7692 
  /* This is a very important step, as it registers new sibling threads
     and assigns these new uber threads a new gtid. */
7695   gtid = __kmp_entry_gtid();
7696   root = __kmp_threads[gtid]->th.th_root;
7697   KMP_ASSERT(KMP_UBER_GTID(gtid));
7698 
7699   if (root->r.r_begin)
7700     return;
7701   __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7702   if (root->r.r_begin) {
7703     __kmp_release_lock(&root->r.r_begin_lock, gtid);
7704     return;
7705   }
7706 
7707   root->r.r_begin = TRUE;
7708 
7709   __kmp_release_lock(&root->r.r_begin_lock, gtid);
7710 }
7711 
7712 /* ------------------------------------------------------------------------ */
7713 
7714 void __kmp_user_set_library(enum library_type arg) {
7715   int gtid;
7716   kmp_root_t *root;
7717   kmp_info_t *thread;
7718 
7719   /* first, make sure we are initialized so we can get our gtid */
7720 
7721   gtid = __kmp_entry_gtid();
7722   thread = __kmp_threads[gtid];
7723 
7724   root = thread->th.th_root;
7725 
7726   KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7727                 library_serial));
7728   if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7729                                   thread */
7730     KMP_WARNING(SetLibraryIncorrectCall);
7731     return;
7732   }
7733 
7734   switch (arg) {
7735   case library_serial:
7736     thread->th.th_set_nproc = 0;
7737     set__nproc(thread, 1);
7738     break;
7739   case library_turnaround:
7740     thread->th.th_set_nproc = 0;
7741     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7742                                            : __kmp_dflt_team_nth_ub);
7743     break;
7744   case library_throughput:
7745     thread->th.th_set_nproc = 0;
7746     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7747                                            : __kmp_dflt_team_nth_ub);
7748     break;
7749   default:
7750     KMP_FATAL(UnknownLibraryType, arg);
7751   }
7752 
7753   __kmp_aux_set_library(arg);
7754 }
7755 
7756 void __kmp_aux_set_stacksize(size_t arg) {
7757   if (!__kmp_init_serial)
7758     __kmp_serial_initialize();
7759 
7760 #if KMP_OS_DARWIN
7761   if (arg & (0x1000 - 1)) {
7762     arg &= ~(0x1000 - 1);
7763     if (arg + 0x1000) /* check for overflow if we round up */
7764       arg += 0x1000;
7765   }
7766 #endif
7767   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7768 
7769   /* only change the default stacksize before the first parallel region */
7770   if (!TCR_4(__kmp_init_parallel)) {
7771     size_t value = arg; /* argument is in bytes */
7772 
7773     if (value < __kmp_sys_min_stksize)
7774       value = __kmp_sys_min_stksize;
7775     else if (value > KMP_MAX_STKSIZE)
7776       value = KMP_MAX_STKSIZE;
7777 
7778     __kmp_stksize = value;
7779 
7780     __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
7781   }
7782 
7783   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7784 }
7785 
7786 /* set the behaviour of the runtime library */
7787 /* TODO this can cause some odd behaviour with sibling parallelism... */
7788 void __kmp_aux_set_library(enum library_type arg) {
7789   __kmp_library = arg;
7790 
7791   switch (__kmp_library) {
7792   case library_serial: {
7793     KMP_INFORM(LibraryIsSerial);
7794   } break;
7795   case library_turnaround:
7796     if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
7797       __kmp_use_yield = 2; // only yield when oversubscribed
7798     break;
7799   case library_throughput:
7800     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
7801       __kmp_dflt_blocktime = 200;
7802     break;
7803   default:
7804     KMP_FATAL(UnknownLibraryType, arg);
7805   }
7806 }
7807 
7808 /* Getting team information common for all team API */
7809 // Returns NULL if not in teams construct
7810 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
7811   kmp_info_t *thr = __kmp_entry_thread();
7812   teams_serialized = 0;
7813   if (thr->th.th_teams_microtask) {
7814     kmp_team_t *team = thr->th.th_team;
7815     int tlevel = thr->th.th_teams_level; // the level of the teams construct
7816     int ii = team->t.t_level;
7817     teams_serialized = team->t.t_serialized;
7818     int level = tlevel + 1;
7819     KMP_DEBUG_ASSERT(ii >= tlevel);
7820     while (ii > level) {
7821       for (teams_serialized = team->t.t_serialized;
7822            (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
7823       }
7824       if (team->t.t_serialized && (!teams_serialized)) {
7825         team = team->t.t_parent;
7826         continue;
7827       }
7828       if (ii > level) {
7829         team = team->t.t_parent;
7830         ii--;
7831       }
7832     }
7833     return team;
7834   }
7835   return NULL;
7836 }
7837 
7838 int __kmp_aux_get_team_num() {
7839   int serialized;
7840   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7841   if (team) {
7842     if (serialized > 1) {
7843       return 0; // teams region is serialized ( 1 team of 1 thread ).
7844     } else {
7845       return team->t.t_master_tid;
7846     }
7847   }
7848   return 0;
7849 }
7850 
7851 int __kmp_aux_get_num_teams() {
7852   int serialized;
7853   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7854   if (team) {
7855     if (serialized > 1) {
7856       return 1;
7857     } else {
7858       return team->t.t_parent->t.t_nproc;
7859     }
7860   }
7861   return 1;
7862 }
7863 
7864 /* ------------------------------------------------------------------------ */
7865 
7866 /*
7867  * Affinity Format Parser
7868  *
7869  * Field is in form of: %[[[0].]size]type
7870  * % and type are required (%% means print a literal '%')
7871  * type is either single char or long name surrounded by {},
7872  * e.g., N or {num_threads}
7873  * 0 => leading zeros
7874  * . => right justified when size is specified
7875  * by default output is left justified
7876  * size is the *minimum* field length
7877  * All other characters are printed as is
7878  *
7879  * Available field types:
 * t {team_num}          - omp_get_team_num()
 * T {num_teams}         - omp_get_num_teams()
 * L {nesting_level}     - omp_get_level()
 * n {thread_num}        - omp_get_thread_num()
 * N {num_threads}       - omp_get_num_threads()
 * a {ancestor_tnum}     - omp_get_ancestor_thread_num(omp_get_level()-1)
 * H {host}              - name of host machine
 * P {process_id}        - process id (integer)
 * i {native_thread_id}  - native thread identifier (integer)
 * A {thread_affinity}   - comma separated list of integers or integer ranges
 *                         (values of affinity mask)
7889  *
7890  * Implementation-specific field types can be added
7891  * If a type is unknown, print "undefined"
7892 */
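
// Illustrative example (the values are hypothetical): given the table below,
// the format string "host=%H pid=%P tid=%0.3n of %N" would expand to
// something like "host=node001 pid=12345 tid=003 of 8".  %H, %P, %n and %N
// are looked up in __kmp_affinity_format_table, and the "0.3" modifiers
// request a zero-padded, right-justified field of minimum width 3.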
7893 
7894 // Structure holding the short name, long name, and corresponding data type
7895 // for snprintf.  A table of these will represent the entire valid keyword
7896 // field types.
7897 typedef struct kmp_affinity_format_field_t {
  char short_name; // from spec, e.g., 'L' for nesting level
  const char *long_name; // from spec, e.g., "nesting_level"
7900   char field_format; // data type for snprintf (typically 'd' or 's'
7901   // for integer or string)
7902 } kmp_affinity_format_field_t;
7903 
7904 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
7905 #if KMP_AFFINITY_SUPPORTED
7906     {'A', "thread_affinity", 's'},
7907 #endif
7908     {'t', "team_num", 'd'},
7909     {'T', "num_teams", 'd'},
7910     {'L', "nesting_level", 'd'},
7911     {'n', "thread_num", 'd'},
7912     {'N', "num_threads", 'd'},
7913     {'a', "ancestor_tnum", 'd'},
7914     {'H', "host", 's'},
7915     {'P', "process_id", 'd'},
7916     {'i', "native_thread_id", 'd'}};
7917 
// Return the number of characters it takes to hold the field
7919 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
7920                                             const char **ptr,
7921                                             kmp_str_buf_t *field_buffer) {
7922   int rc, format_index, field_value;
7923   const char *width_left, *width_right;
7924   bool pad_zeros, right_justify, parse_long_name, found_valid_name;
7925   static const int FORMAT_SIZE = 20;
7926   char format[FORMAT_SIZE] = {0};
7927   char absolute_short_name = 0;
7928 
7929   KMP_DEBUG_ASSERT(gtid >= 0);
7930   KMP_DEBUG_ASSERT(th);
7931   KMP_DEBUG_ASSERT(**ptr == '%');
7932   KMP_DEBUG_ASSERT(field_buffer);
7933 
7934   __kmp_str_buf_clear(field_buffer);
7935 
7936   // Skip the initial %
7937   (*ptr)++;
7938 
7939   // Check for %% first
7940   if (**ptr == '%') {
7941     __kmp_str_buf_cat(field_buffer, "%", 1);
7942     (*ptr)++; // skip over the second %
7943     return 1;
7944   }
7945 
7946   // Parse field modifiers if they are present
7947   pad_zeros = false;
7948   if (**ptr == '0') {
7949     pad_zeros = true;
7950     (*ptr)++; // skip over 0
7951   }
7952   right_justify = false;
7953   if (**ptr == '.') {
7954     right_justify = true;
7955     (*ptr)++; // skip over .
7956   }
7957   // Parse width of field: [width_left, width_right)
7958   width_left = width_right = NULL;
7959   if (**ptr >= '0' && **ptr <= '9') {
7960     width_left = *ptr;
7961     SKIP_DIGITS(*ptr);
7962     width_right = *ptr;
7963   }
7964 
7965   // Create the format for KMP_SNPRINTF based on flags parsed above
7966   format_index = 0;
7967   format[format_index++] = '%';
7968   if (!right_justify)
7969     format[format_index++] = '-';
7970   if (pad_zeros)
7971     format[format_index++] = '0';
7972   if (width_left && width_right) {
7973     int i = 0;
7974     // Only allow 8 digit number widths.
7975     // This also prevents overflowing format variable
7976     while (i < 8 && width_left < width_right) {
7977       format[format_index++] = *width_left;
7978       width_left++;
7979       i++;
7980     }
7981   }
7982 
7983   // Parse a name (long or short)
7984   // Canonicalize the name into absolute_short_name
7985   found_valid_name = false;
7986   parse_long_name = (**ptr == '{');
7987   if (parse_long_name)
7988     (*ptr)++; // skip initial left brace
7989   for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
7990                              sizeof(__kmp_affinity_format_table[0]);
7991        ++i) {
7992     char short_name = __kmp_affinity_format_table[i].short_name;
7993     const char *long_name = __kmp_affinity_format_table[i].long_name;
7994     char field_format = __kmp_affinity_format_table[i].field_format;
7995     if (parse_long_name) {
7996       int length = KMP_STRLEN(long_name);
7997       if (strncmp(*ptr, long_name, length) == 0) {
7998         found_valid_name = true;
7999         (*ptr) += length; // skip the long name
8000       }
8001     } else if (**ptr == short_name) {
8002       found_valid_name = true;
8003       (*ptr)++; // skip the short name
8004     }
8005     if (found_valid_name) {
8006       format[format_index++] = field_format;
8007       format[format_index++] = '\0';
8008       absolute_short_name = short_name;
8009       break;
8010     }
8011   }
8012   if (parse_long_name) {
8013     if (**ptr != '}') {
8014       absolute_short_name = 0;
8015     } else {
8016       (*ptr)++; // skip over the right brace
8017     }
8018   }
8019 
8020   // Attempt to fill the buffer with the requested
8021   // value using snprintf within __kmp_str_buf_print()
8022   switch (absolute_short_name) {
8023   case 't':
8024     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8025     break;
8026   case 'T':
8027     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8028     break;
8029   case 'L':
8030     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8031     break;
8032   case 'n':
8033     rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8034     break;
8035   case 'H': {
8036     static const int BUFFER_SIZE = 256;
8037     char buf[BUFFER_SIZE];
8038     __kmp_expand_host_name(buf, BUFFER_SIZE);
8039     rc = __kmp_str_buf_print(field_buffer, format, buf);
8040   } break;
8041   case 'P':
8042     rc = __kmp_str_buf_print(field_buffer, format, getpid());
8043     break;
8044   case 'i':
8045     rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8046     break;
8047   case 'N':
8048     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8049     break;
8050   case 'a':
8051     field_value =
8052         __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8053     rc = __kmp_str_buf_print(field_buffer, format, field_value);
8054     break;
8055 #if KMP_AFFINITY_SUPPORTED
8056   case 'A': {
8057     kmp_str_buf_t buf;
8058     __kmp_str_buf_init(&buf);
8059     __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8060     rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8061     __kmp_str_buf_free(&buf);
8062   } break;
8063 #endif
8064   default:
    // According to the spec, if an implementation does not have info for the
    // field type, then "undefined" is printed
8067     rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8068     // Skip the field
8069     if (parse_long_name) {
8070       SKIP_TOKEN(*ptr);
8071       if (**ptr == '}')
8072         (*ptr)++;
8073     } else {
8074       (*ptr)++;
8075     }
8076   }
8077 
8078   KMP_ASSERT(format_index <= FORMAT_SIZE);
8079   return rc;
8080 }
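
// Worked example of the parsing above (the thread number is hypothetical):
// for the field "%0.4n", pad_zeros and right_justify are set, the width is
// "4", and the short name 'n' selects the 'd' conversion from the table, so
// the constructed snprintf format is "%04d" and thread number 7 is rendered
// as "0007".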
8081 
/*
 * Return the number of characters needed to hold the affinity string
 * (not including the terminating null byte).
 * The resultant string is printed to buffer, which the caller can then
 * handle afterwards.
*/
8088 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8089                                   kmp_str_buf_t *buffer) {
8090   const char *parse_ptr;
8091   size_t retval;
8092   const kmp_info_t *th;
8093   kmp_str_buf_t field;
8094 
8095   KMP_DEBUG_ASSERT(buffer);
8096   KMP_DEBUG_ASSERT(gtid >= 0);
8097 
8098   __kmp_str_buf_init(&field);
8099   __kmp_str_buf_clear(buffer);
8100 
8101   th = __kmp_threads[gtid];
8102   retval = 0;
8103 
8104   // If format is NULL or zero-length string, then we use
8105   // affinity-format-var ICV
8106   parse_ptr = format;
8107   if (parse_ptr == NULL || *parse_ptr == '\0') {
8108     parse_ptr = __kmp_affinity_format;
8109   }
8110   KMP_DEBUG_ASSERT(parse_ptr);
8111 
8112   while (*parse_ptr != '\0') {
8113     // Parse a field
8114     if (*parse_ptr == '%') {
8115       // Put field in the buffer
8116       int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8117       __kmp_str_buf_catbuf(buffer, &field);
8118       retval += rc;
8119     } else {
8120       // Put literal character in buffer
8121       __kmp_str_buf_cat(buffer, parse_ptr, 1);
8122       retval++;
8123       parse_ptr++;
8124     }
8125   }
8126   __kmp_str_buf_free(&field);
8127   return retval;
8128 }
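
// Minimal usage sketch for the routine above (illustrative only; the printed
// text depends on the runtime state):
//
//   kmp_str_buf_t buf;
//   __kmp_str_buf_init(&buf);
//   size_t len = __kmp_aux_capture_affinity(gtid, "tid %n of %N on %H", &buf);
//   // buf.str now holds, e.g., "tid 3 of 8 on node001"; len is its length,
//   // not counting the terminating null byte.
//   __kmp_str_buf_free(&buf);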
8129 
8130 // Displays the affinity string to stdout
8131 void __kmp_aux_display_affinity(int gtid, const char *format) {
8132   kmp_str_buf_t buf;
8133   __kmp_str_buf_init(&buf);
8134   __kmp_aux_capture_affinity(gtid, format, &buf);
8135   __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8136   __kmp_str_buf_free(&buf);
8137 }
8138 
8139 /* ------------------------------------------------------------------------ */
8140 
8141 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8142   int blocktime = arg; /* argument is in milliseconds */
8143 #if KMP_USE_MONITOR
8144   int bt_intervals;
8145 #endif
8146   int bt_set;
8147 
8148   __kmp_save_internal_controls(thread);
8149 
8150   /* Normalize and set blocktime for the teams */
8151   if (blocktime < KMP_MIN_BLOCKTIME)
8152     blocktime = KMP_MIN_BLOCKTIME;
8153   else if (blocktime > KMP_MAX_BLOCKTIME)
8154     blocktime = KMP_MAX_BLOCKTIME;
8155 
8156   set__blocktime_team(thread->th.th_team, tid, blocktime);
8157   set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8158 
8159 #if KMP_USE_MONITOR
8160   /* Calculate and set blocktime intervals for the teams */
8161   bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8162 
8163   set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8164   set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8165 #endif
8166 
  /* Record that the blocktime has been set explicitly */
8168   bt_set = TRUE;
8169 
8170   set__bt_set_team(thread->th.th_team, tid, bt_set);
8171   set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8172 #if KMP_USE_MONITOR
8173   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8174                 "bt_intervals=%d, monitor_updates=%d\n",
8175                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8176                 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8177                 __kmp_monitor_wakeups));
8178 #else
8179   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8180                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8181                 thread->th.th_team->t.t_id, tid, blocktime));
8182 #endif
8183 }
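
// Hedged usage sketch: user code typically reaches the routine above through
// the kmp_set_blocktime() extension entry point (an assumption about the
// wrapper; it supplies the calling thread and tid):
//
//   kmp_set_blocktime(0);   // idle workers go to sleep immediately
//   kmp_set_blocktime(200); // workers spin ~200 ms before sleeping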
8184 
8185 void __kmp_aux_set_defaults(char const *str, int len) {
8186   if (!__kmp_init_serial) {
8187     __kmp_serial_initialize();
8188   }
8189   __kmp_env_initialize(str);
8190 
8191   if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8192     __kmp_env_print();
8193   }
8194 } // __kmp_aux_set_defaults
8195 
8196 /* ------------------------------------------------------------------------ */
8197 /* internal fast reduction routines */
8198 
8199 PACKED_REDUCTION_METHOD_T
8200 __kmp_determine_reduction_method(
8201     ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8202     void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8203     kmp_critical_name *lck) {
8204 
  // Default reduction method: critical construct (lck != NULL, like in current
  // PAROPT).
  // If (reduce_data != NULL && reduce_func != NULL): the tree-reduction method
  // can be selected by the RTL.
  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
  // can be selected by the RTL.
  // Finally, it is up to the OpenMP RTL to decide which method to select among
  // those generated by PAROPT.
8213 
8214   PACKED_REDUCTION_METHOD_T retval;
8215 
8216   int team_size;
8217 
8218   KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
8219   KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8220 
8221 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
8222   ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
8223 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8224 
8225   retval = critical_reduce_block;
8226 
  // Another way of getting the team size (with one dynamic dereference) is
  // slower.
8228   team_size = __kmp_get_team_num_threads(global_tid);
8229   if (team_size == 1) {
8230 
8231     retval = empty_reduce_block;
8232 
8233   } else {
8234 
8235     int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8236 
8237 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 ||                   \
8238     KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64
8239 
8240 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
8241     KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8242 
8243     int teamsize_cutoff = 4;
8244 
8245 #if KMP_MIC_SUPPORTED
8246     if (__kmp_mic_type != non_mic) {
8247       teamsize_cutoff = 8;
8248     }
8249 #endif
8250     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8251     if (tree_available) {
8252       if (team_size <= teamsize_cutoff) {
8253         if (atomic_available) {
8254           retval = atomic_reduce_block;
8255         }
8256       } else {
8257         retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8258       }
8259     } else if (atomic_available) {
8260       retval = atomic_reduce_block;
8261     }
8262 #else
8263 #error "Unknown or unsupported OS"
8264 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8265        // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8266 
8267 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8268 
8269 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD
8270 
8271     // basic tuning
8272 
8273     if (atomic_available) {
8274       if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8275         retval = atomic_reduce_block;
8276       }
8277     } // otherwise: use critical section
8278 
8279 #elif KMP_OS_DARWIN
8280 
8281     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8282     if (atomic_available && (num_vars <= 3)) {
8283       retval = atomic_reduce_block;
8284     } else if (tree_available) {
8285       if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8286           (reduce_size < (2000 * sizeof(kmp_real64)))) {
8287         retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8288       }
8289     } // otherwise: use critical section
8290 
8291 #else
8292 #error "Unknown or unsupported OS"
8293 #endif
8294 
8295 #else
8296 #error "Unknown or unsupported architecture"
8297 #endif
8298   }
8299 
8300   // KMP_FORCE_REDUCTION
8301 
8302   // If the team is serialized (team_size == 1), ignore the forced reduction
8303   // method and stay with the unsynchronized method (empty_reduce_block)
8304   if (__kmp_force_reduction_method != reduction_method_not_defined &&
8305       team_size != 1) {
8306 
8307     PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8308 
8309     int atomic_available, tree_available;
8310 
8311     switch ((forced_retval = __kmp_force_reduction_method)) {
8312     case critical_reduce_block:
8313       KMP_ASSERT(lck); // lck should be != 0
8314       break;
8315 
8316     case atomic_reduce_block:
8317       atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8318       if (!atomic_available) {
8319         KMP_WARNING(RedMethodNotSupported, "atomic");
8320         forced_retval = critical_reduce_block;
8321       }
8322       break;
8323 
8324     case tree_reduce_block:
8325       tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8326       if (!tree_available) {
8327         KMP_WARNING(RedMethodNotSupported, "tree");
8328         forced_retval = critical_reduce_block;
8329       } else {
8330 #if KMP_FAST_REDUCTION_BARRIER
8331         forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8332 #endif
8333       }
8334       break;
8335 
8336     default:
8337       KMP_ASSERT(0); // "unsupported method specified"
8338     }
8339 
8340     retval = forced_retval;
8341   }
8342 
8343   KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8344 
8345 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
8346 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8347 
8348   return (retval);
8349 }
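
// Illustrative decision trace for the heuristic above (x86_64 Linux, non-MIC,
// so teamsize_cutoff == 4): when both the atomic and tree variants were
// generated, a team of <= 4 threads gets atomic_reduce_block and a larger
// team gets TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER; a team of one thread
// always gets empty_reduce_block, and a forced method (if any) overrides the
// heuristic for team_size != 1.
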
// This function is for testing the set/get/determine reduce method.
8351 kmp_int32 __kmp_get_reduce_method(void) {
8352   return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8353 }
8354 
8355 // Soft pause sets up threads to ignore blocktime and just go to sleep.
8356 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
8357 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8358 
8359 // Hard pause shuts down the runtime completely.  Resume happens naturally when
8360 // OpenMP is used subsequently.
8361 void __kmp_hard_pause() {
8362   __kmp_pause_status = kmp_hard_paused;
8363   __kmp_internal_end_thread(-1);
8364 }
8365 
8366 // Soft resume sets __kmp_pause_status, and wakes up all threads.
8367 void __kmp_resume_if_soft_paused() {
8368   if (__kmp_pause_status == kmp_soft_paused) {
8369     __kmp_pause_status = kmp_not_paused;
8370 
8371     for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8372       kmp_info_t *thread = __kmp_threads[gtid];
8373       if (thread) { // Wake it if sleeping
8374         kmp_flag_64 fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
8375         if (fl.is_sleeping())
8376           fl.resume(gtid);
8377         else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8378           __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8379         } else { // thread holds the lock and may sleep soon
8380           do { // until either the thread sleeps, or we can get the lock
8381             if (fl.is_sleeping()) {
8382               fl.resume(gtid);
8383               break;
8384             } else if (__kmp_try_suspend_mx(thread)) {
8385               __kmp_unlock_suspend_mx(thread);
8386               break;
8387             }
8388           } while (1);
8389         }
8390       }
8391     }
8392   }
8393 }
8394 
8395 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
8396 // TODO: add warning messages
8397 int __kmp_pause_resource(kmp_pause_status_t level) {
8398   if (level == kmp_not_paused) { // requesting resume
8399     if (__kmp_pause_status == kmp_not_paused) {
8400       // error message about runtime not being paused, so can't resume
8401       return 1;
8402     } else {
8403       KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
8404                        __kmp_pause_status == kmp_hard_paused);
8405       __kmp_pause_status = kmp_not_paused;
8406       return 0;
8407     }
8408   } else if (level == kmp_soft_paused) { // requesting soft pause
8409     if (__kmp_pause_status != kmp_not_paused) {
8410       // error message about already being paused
8411       return 1;
8412     } else {
8413       __kmp_soft_pause();
8414       return 0;
8415     }
8416   } else if (level == kmp_hard_paused) { // requesting hard pause
8417     if (__kmp_pause_status != kmp_not_paused) {
8418       // error message about already being paused
8419       return 1;
8420     } else {
8421       __kmp_hard_pause();
8422       return 0;
8423     }
8424   } else {
8425     // error message about invalid level
8426     return 1;
8427   }
8428 }
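
// Hedged usage sketch via the OpenMP 5.0 pause API (the mapping of the
// standard entry points onto this routine is assumed here):
//
//   #include <omp.h>
//   // Soft-pause the host device's resources; they resume on next use:
//   if (omp_pause_resource(omp_pause_soft, omp_get_initial_device()) != 0) {
//     // the pause request was rejected
//   }
//   // Hard-pause every device; the runtime re-initializes lazily afterwards:
//   omp_pause_resource_all(omp_pause_hard);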
8429 
8430 
8431 void __kmp_omp_display_env(int verbose) {
8432   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8433   if (__kmp_init_serial == 0)
8434     __kmp_do_serial_initialize();
8435   __kmp_display_env_impl(!verbose, verbose);
8436   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8437 }
8438