1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #if KMP_USE_HIER_SCHED
28 #include "kmp_dispatch_hier.h"
29 #endif
30 
31 #if OMPT_SUPPORT
32 #include "ompt-specific.h"
33 #endif
34 
35 /* these are temporary issues to be dealt with */
36 #define KMP_USE_PRCTL 0
37 
38 #if KMP_OS_WINDOWS
39 #include <process.h>
40 #endif
41 
42 #include "tsan_annotations.h"
43 
44 #if KMP_OS_WINDOWS
45 // Windows does not need these include files as it doesn't use shared memory
46 #else
47 #include <sys/mman.h>
48 #include <sys/stat.h>
49 #include <fcntl.h>
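// Size of the shared memory segment used for library registration on
// Unix-like systems.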
50 #define SHM_SIZE 1024
51 #endif
52 
53 #if defined(KMP_GOMP_COMPAT)
54 char const __kmp_version_alt_comp[] =
55     KMP_VERSION_PREFIX "alternative compiler support: yes";
56 #endif /* defined(KMP_GOMP_COMPAT) */
57 
58 char const __kmp_version_omp_api[] =
59     KMP_VERSION_PREFIX "API version: 5.0 (201611)";
60 
61 #ifdef KMP_DEBUG
62 char const __kmp_version_lock[] =
63     KMP_VERSION_PREFIX "lock type: run time selectable";
64 #endif /* KMP_DEBUG */
65 
66 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
67 
68 /* ------------------------------------------------------------------------ */
69 
70 #if KMP_USE_MONITOR
71 kmp_info_t __kmp_monitor;
72 #endif
73 
74 /* Forward declarations */
75 
76 void __kmp_cleanup(void);
77 
78 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
79                                   int gtid);
80 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
81                                   kmp_internal_control_t *new_icvs,
82                                   ident_t *loc);
83 #if KMP_AFFINITY_SUPPORTED
84 static void __kmp_partition_places(kmp_team_t *team,
85                                    int update_master_only = 0);
86 #endif
87 static void __kmp_do_serial_initialize(void);
88 void __kmp_fork_barrier(int gtid, int tid);
89 void __kmp_join_barrier(int gtid);
90 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
91                           kmp_internal_control_t *new_icvs, ident_t *loc);
92 
93 #ifdef USE_LOAD_BALANCE
94 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
95 #endif
96 
97 static int __kmp_expand_threads(int nNeed);
98 #if KMP_OS_WINDOWS
99 static int __kmp_unregister_root_other_thread(int gtid);
100 #endif
101 static void __kmp_unregister_library(void); // called by __kmp_internal_end()
102 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
103 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
104 
105 /* Calculate the identifier of the current thread */
106 /* A fast (and somewhat portable) way to get a unique identifier of the
107    executing thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
108 int __kmp_get_global_thread_id() {
109   int i;
110   kmp_info_t **other_threads;
111   size_t stack_data;
112   char *stack_addr;
113   size_t stack_size;
114   char *stack_base;
115 
116   KA_TRACE(
117       1000,
118       ("*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
119        __kmp_nth, __kmp_all_nth));
120 
121   /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
122      a parallel region, made it return KMP_GTID_DNE to force serial_initialize
123      by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
124      __kmp_init_gtid for this to work. */
125 
126   if (!TCR_4(__kmp_init_gtid))
127     return KMP_GTID_DNE;
128 
129 #ifdef KMP_TDATA_GTID
130   if (TCR_4(__kmp_gtid_mode) >= 3) {
131     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
132     return __kmp_gtid;
133   }
134 #endif
135   if (TCR_4(__kmp_gtid_mode) >= 2) {
136     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
137     return __kmp_gtid_get_specific();
138   }
139   KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
140 
141   stack_addr = (char *)&stack_data;
142   other_threads = __kmp_threads;
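  /* The address of a local variable is used as a probe into the current
     thread's stack; whichever registered thread's stack contains that address
     identifies us. */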
143 
144   /* ATT: The code below is a source of potential bugs due to unsynchronized
145      access to __kmp_threads array. For example:
146      1. Current thread loads other_threads[i] to thr and checks it, it is
147         non-NULL.
148      2. Current thread is suspended by OS.
149      3. Another thread unregisters and finishes (debug versions of free()
150         may fill memory with something like 0xEF).
151      4. Current thread is resumed.
152      5. Current thread reads junk from *thr.
153      TODO: Fix it.  --ln  */
154 
155   for (i = 0; i < __kmp_threads_capacity; i++) {
156 
157     kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
158     if (!thr)
159       continue;
160 
161     stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
162     stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
163 
164     /* stack grows down -- search through all of the active threads */
165 
166     if (stack_addr <= stack_base) {
167       size_t stack_diff = stack_base - stack_addr;
168 
169       if (stack_diff <= stack_size) {
170         /* The only way we can be closer than the allocated */
171         /* stack size is if we are running on this thread. */
172         KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
173         return i;
174       }
175     }
176   }
177 
178   /* call get_specific to try to determine our gtid */
179   KA_TRACE(1000,
180            ("*** __kmp_get_global_thread_id: internal alg. failed to find "
181             "thread, using TLS\n"));
182   i = __kmp_gtid_get_specific();
183 
184   /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */
185 
186   /* if we haven't been assigned a gtid, return the (negative) code */
187   if (i < 0)
188     return i;
189 
190   /* dynamically updated stack window for uber threads to avoid get_specific
191      call */
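  /* Reaching this point means the probe address was not inside the recorded
     stack window of any registered thread. If this uber thread's stack cannot
     grow, report a stack overflow; otherwise widen its recorded window below
     so that it covers the probe address. */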
192   if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
193     KMP_FATAL(StackOverflow, i);
194   }
195 
196   stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
197   if (stack_addr > stack_base) {
198     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
199     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
200             other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
201                 stack_base);
202   } else {
203     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
204             stack_base - stack_addr);
205   }
206 
207   /* Reprint stack bounds for ubermaster since they have been refined */
208   if (__kmp_storage_map) {
209     char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
210     char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
211     __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
212                                  other_threads[i]->th.th_info.ds.ds_stacksize,
213                                  "th_%d stack (refinement)", i);
214   }
215   return i;
216 }
217 
218 int __kmp_get_global_thread_id_reg() {
219   int gtid;
220 
221   if (!__kmp_init_serial) {
222     gtid = KMP_GTID_DNE;
223   } else
224 #ifdef KMP_TDATA_GTID
225       if (TCR_4(__kmp_gtid_mode) >= 3) {
226     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
227     gtid = __kmp_gtid;
228   } else
229 #endif
230       if (TCR_4(__kmp_gtid_mode) >= 2) {
231     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
232     gtid = __kmp_gtid_get_specific();
233   } else {
234     KA_TRACE(1000,
235              ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
236     gtid = __kmp_get_global_thread_id();
237   }
238 
239   /* we must be a new uber master sibling thread */
240   if (gtid == KMP_GTID_DNE) {
241     KA_TRACE(10,
242              ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
243               "Registering a new gtid.\n"));
244     __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
245     if (!__kmp_init_serial) {
246       __kmp_do_serial_initialize();
247       gtid = __kmp_gtid_get_specific();
248     } else {
249       gtid = __kmp_register_root(FALSE);
250     }
251     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
252     /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
253   }
254 
255   KMP_DEBUG_ASSERT(gtid >= 0);
256 
257   return gtid;
258 }
259 
260 /* caller must hold forkjoin_lock */
261 void __kmp_check_stack_overlap(kmp_info_t *th) {
262   int f;
263   char *stack_beg = NULL;
264   char *stack_end = NULL;
265   int gtid;
266 
267   KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
268   if (__kmp_storage_map) {
269     stack_end = (char *)th->th.th_info.ds.ds_stackbase;
270     stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
271 
272     gtid = __kmp_gtid_from_thread(th);
273 
274     if (gtid == KMP_GTID_MONITOR) {
275       __kmp_print_storage_map_gtid(
276           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
277           "th_%s stack (%s)", "mon",
278           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
279     } else {
280       __kmp_print_storage_map_gtid(
281           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
282           "th_%d stack (%s)", gtid,
283           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
284     }
285   }
286 
287   /* No point in checking ubermaster threads since they use refinement and
288    * cannot overlap */
289   gtid = __kmp_gtid_from_thread(th);
290   if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
291     KA_TRACE(10,
292              ("__kmp_check_stack_overlap: performing extensive checking\n"));
293     if (stack_beg == NULL) {
294       stack_end = (char *)th->th.th_info.ds.ds_stackbase;
295       stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
296     }
297 
298     for (f = 0; f < __kmp_threads_capacity; f++) {
299       kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
300 
301       if (f_th && f_th != th) {
302         char *other_stack_end =
303             (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
304         char *other_stack_beg =
305             other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
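        // Overlap exists if either end of this thread's stack lies strictly
        // inside the other thread's stack range.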
306         if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
307             (stack_end > other_stack_beg && stack_end < other_stack_end)) {
308 
309           /* Print the other stack values before the abort */
310           if (__kmp_storage_map)
311             __kmp_print_storage_map_gtid(
312                 -1, other_stack_beg, other_stack_end,
313                 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
314                 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
315 
316           __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
317                       __kmp_msg_null);
318         }
319       }
320     }
321   }
322   KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
323 }
324 
325 /* ------------------------------------------------------------------------ */
326 
327 void __kmp_infinite_loop(void) {
328   static int done = FALSE;
329 
330   while (!done) {
331     KMP_YIELD(TRUE);
332   }
333 }
334 
335 #define MAX_MESSAGE 512
336 
337 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
338                                   char const *format, ...) {
339   char buffer[MAX_MESSAGE];
340   va_list ap;
341 
342   va_start(ap, format);
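  // Append the caller's format string to the "OMP storage map" prefix; the
  // combined format is then expanded with the caller's varargs by
  // __kmp_vprintf.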
343   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
344                p2, (unsigned long)size, format);
345   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
346   __kmp_vprintf(kmp_err, buffer, ap);
347 #if KMP_PRINT_DATA_PLACEMENT
348   int node;
349   if (gtid >= 0) {
350     if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
351       if (__kmp_storage_map_verbose) {
352         node = __kmp_get_host_node(p1);
353         if (node < 0) /* doesn't work, so don't try this next time */
354           __kmp_storage_map_verbose = FALSE;
355         else {
356           char *last;
357           int lastNode;
358           int localProc = __kmp_get_cpu_from_gtid(gtid);
359 
360           const int page_size = KMP_GET_PAGE_SIZE();
361 
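          // Round p1 down to its page boundary and point p2 at the start of
          // the page containing its last byte.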
362           p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
363           p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
364           if (localProc >= 0)
365             __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
366                                  localProc >> 1);
367           else
368             __kmp_printf_no_lock("  GTID %d\n", gtid);
369 #if KMP_USE_PRCTL
370           /* The more elaborate format is disabled for now because of the prctl
371            * hanging bug. */
372           do {
373             last = p1;
374             lastNode = node;
375             /* This loop collates adjacent pages with the same host node. */
376             do {
377               p1 = (char *)p1 + page_size;
378             } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
379             __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
380                                  lastNode);
381           } while (p1 <= p2);
382 #else
383           __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
384                                (char *)p1 + (page_size - 1),
385                                __kmp_get_host_node(p1));
386           if (p1 < p2) {
387             __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
388                                  (char *)p2 + (page_size - 1),
389                                  __kmp_get_host_node(p2));
390           }
391 #endif
392         }
393       }
394     } else
395       __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
396   }
397 #endif /* KMP_PRINT_DATA_PLACEMENT */
398   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
399 }
400 
401 void __kmp_warn(char const *format, ...) {
402   char buffer[MAX_MESSAGE];
403   va_list ap;
404 
405   if (__kmp_generate_warnings == kmp_warnings_off) {
406     return;
407   }
408 
409   va_start(ap, format);
410 
411   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
412   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
413   __kmp_vprintf(kmp_err, buffer, ap);
414   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
415 
416   va_end(ap);
417 }
418 
419 void __kmp_abort_process() {
420   // Later threads may stall here, but that's ok because abort() will kill them.
421   __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
422 
423   if (__kmp_debug_buf) {
424     __kmp_dump_debug_buffer();
425   }
426 
427   if (KMP_OS_WINDOWS) {
428     // Let other threads know of abnormal termination and prevent deadlock
429     // if abort happened during library initialization or shutdown
430     __kmp_global.g.g_abort = SIGABRT;
431 
432     /* On Windows* OS, abort() causes a pop-up error box by default, which
433        stalls nightly testing. Unfortunately, we cannot reliably suppress the
434        pop-up error boxes. _set_abort_behavior() works well, but this function
435        is not available in VS7 (this is not a problem for the DLL, but it is a
436        problem for the static OpenMP RTL). SetErrorMode (and so, the timelimit
437        utility) does not help, at least in some versions of the MS C RTL.
438
439        It seems the following sequence is the only way to simulate abort() and
440        avoid the pop-up error box. */
441     raise(SIGABRT);
442     _exit(3); // Just in case, if signal ignored, exit anyway.
443   } else {
444     __kmp_unregister_library();
445     abort();
446   }
447 
448   __kmp_infinite_loop();
449   __kmp_release_bootstrap_lock(&__kmp_exit_lock);
450 
451 } // __kmp_abort_process
452 
453 void __kmp_abort_thread(void) {
454   // TODO: Eliminate g_abort global variable and this function.
455   // In case of abort just call abort(), it will kill all the threads.
456   __kmp_infinite_loop();
457 } // __kmp_abort_thread
458 
459 /* Print out the storage map for the major kmp_info_t thread data structures
460    that are allocated together. */
461 
462 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
463   __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
464                                gtid);
465 
466   __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
467                                sizeof(kmp_desc_t), "th_%d.th_info", gtid);
468 
469   __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
470                                sizeof(kmp_local_t), "th_%d.th_local", gtid);
471 
472   __kmp_print_storage_map_gtid(
473       gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
474       sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
475 
476   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
477                                &thr->th.th_bar[bs_plain_barrier + 1],
478                                sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
479                                gtid);
480 
481   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
482                                &thr->th.th_bar[bs_forkjoin_barrier + 1],
483                                sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
484                                gtid);
485 
486 #if KMP_FAST_REDUCTION_BARRIER
487   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
488                                &thr->th.th_bar[bs_reduction_barrier + 1],
489                                sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
490                                gtid);
491 #endif // KMP_FAST_REDUCTION_BARRIER
492 }
493 
494 /* Print out the storage map for the major kmp_team_t team data structures
495    that are allocated together. */
496 
497 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
498                                          int team_id, int num_thr) {
499   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
500   __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
501                                header, team_id);
502 
503   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
504                                &team->t.t_bar[bs_last_barrier],
505                                sizeof(kmp_balign_team_t) * bs_last_barrier,
506                                "%s_%d.t_bar", header, team_id);
507 
508   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
509                                &team->t.t_bar[bs_plain_barrier + 1],
510                                sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
511                                header, team_id);
512 
513   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
514                                &team->t.t_bar[bs_forkjoin_barrier + 1],
515                                sizeof(kmp_balign_team_t),
516                                "%s_%d.t_bar[forkjoin]", header, team_id);
517 
518 #if KMP_FAST_REDUCTION_BARRIER
519   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
520                                &team->t.t_bar[bs_reduction_barrier + 1],
521                                sizeof(kmp_balign_team_t),
522                                "%s_%d.t_bar[reduction]", header, team_id);
523 #endif // KMP_FAST_REDUCTION_BARRIER
524 
525   __kmp_print_storage_map_gtid(
526       -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
527       sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
528 
529   __kmp_print_storage_map_gtid(
530       -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
531       sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
532 
533   __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
534                                &team->t.t_disp_buffer[num_disp_buff],
535                                sizeof(dispatch_shared_info_t) * num_disp_buff,
536                                "%s_%d.t_disp_buffer", header, team_id);
537 }
538 
539 static void __kmp_init_allocator() { __kmp_init_memkind(); }
540 static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
541 
542 /* ------------------------------------------------------------------------ */
543 
544 #if KMP_DYNAMIC_LIB
545 #if KMP_OS_WINDOWS
546 
547 static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
548   // TODO: Change to __kmp_break_bootstrap_lock().
549   __kmp_init_bootstrap_lock(lck); // make the lock released
550 }
551 
552 static void __kmp_reset_locks_on_process_detach(int gtid_req) {
553   int i;
554   int thread_count;
555 
556   // PROCESS_DETACH is expected to be called by a thread that executes
557   // ProcessExit() or FreeLibrary(). The OS terminates the other threads
558   // (except the one calling ProcessExit or FreeLibrary), so it might seem
559   // safe to access __kmp_threads[] without taking the forkjoin_lock. In
560   // fact, however, some threads may still be alive here, although they are
561   // about to be terminated. The threads in the array with ds_thread==0 are
562   // the most suspicious, so it may not be safe to access __kmp_threads[].
563 
564   // TODO: does it make sense to check __kmp_roots[] ?
565 
566   // Let's check that there are no other alive threads registered with the OMP
567   // lib.
568   while (1) {
569     thread_count = 0;
570     for (i = 0; i < __kmp_threads_capacity; ++i) {
571       if (!__kmp_threads)
572         continue;
573       kmp_info_t *th = __kmp_threads[i];
574       if (th == NULL)
575         continue;
576       int gtid = th->th.th_info.ds.ds_gtid;
577       if (gtid == gtid_req)
578         continue;
579       if (gtid < 0)
580         continue;
581       DWORD exit_val;
582       int alive = __kmp_is_thread_alive(th, &exit_val);
583       if (alive) {
584         ++thread_count;
585       }
586     }
587     if (thread_count == 0)
588       break; // success
589   }
590 
591   // Assume that I'm alone. Now it might be safe to check and reset locks.
592   // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
593   __kmp_reset_lock(&__kmp_forkjoin_lock);
594 #ifdef KMP_DEBUG
595   __kmp_reset_lock(&__kmp_stdio_lock);
596 #endif // KMP_DEBUG
597 }
598 
599 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
600   //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
601 
602   switch (fdwReason) {
603 
604   case DLL_PROCESS_ATTACH:
605     KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
606 
607     return TRUE;
608 
609   case DLL_PROCESS_DETACH:
610     KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
611 
612     if (lpReserved != NULL) {
613       // lpReserved is used for telling the difference:
614       //   lpReserved == NULL when FreeLibrary() was called,
615       //   lpReserved != NULL when the process terminates.
616       // When FreeLibrary() is called, worker threads remain alive. So they will
617       // release the forkjoin lock by themselves. When the process terminates,
618       // worker threads disappear triggering the problem of unreleased forkjoin
619       // lock as described below.
620 
621       // A worker thread can take the forkjoin lock. The problem comes up if
622       // that worker thread becomes dead before it releases the forkjoin lock.
623       // The forkjoin lock remains taken, while the thread executing
624       // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try
625       // to take the forkjoin lock and will always fail, so that the application
626       // will never finish [normally]. This scenario is possible if
627       // __kmpc_end() has not been executed. It looks like it's not a corner
628       // case, but common cases:
629       // - the main function was compiled by an alternative compiler;
630       // - the main function was compiled by icl but without /Qopenmp
631       //   (application with plugins);
632       // - application terminates by calling C exit(), Fortran CALL EXIT() or
633       //   Fortran STOP.
634       // - alive foreign thread prevented __kmpc_end from doing cleanup.
635       //
636       // This is a hack to work around the problem.
637       // TODO: !!! figure out something better.
638       __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
639     }
640 
641     __kmp_internal_end_library(__kmp_gtid_get_specific());
642 
643     return TRUE;
644 
645   case DLL_THREAD_ATTACH:
646     KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
647 
648     /* if we want to register new siblings all the time, call
649      * __kmp_get_gtid() here */
650     return TRUE;
651 
652   case DLL_THREAD_DETACH:
653     KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
654 
655     __kmp_internal_end_thread(__kmp_gtid_get_specific());
656     return TRUE;
657   }
658 
659   return TRUE;
660 }
661 
662 #endif /* KMP_OS_WINDOWS */
663 #endif /* KMP_DYNAMIC_LIB */
664 
665 /* __kmp_parallel_deo -- Wait until it's our turn. */
666 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
667   int gtid = *gtid_ref;
668 #ifdef BUILD_PARALLEL_ORDERED
669   kmp_team_t *team = __kmp_team_from_gtid(gtid);
670 #endif /* BUILD_PARALLEL_ORDERED */
671 
672   if (__kmp_env_consistency_check) {
673     if (__kmp_threads[gtid]->th.th_root->r.r_active)
674 #if KMP_USE_DYNAMIC_LOCK
675       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
676 #else
677       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
678 #endif
679   }
680 #ifdef BUILD_PARALLEL_ORDERED
681   if (!team->t.t_serialized) {
682     KMP_MB();
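    // Spin until the team's ordered ticket (t_ordered.dt.t_value) equals this
    // thread's tid, so threads enter the ordered region in tid order.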
683     KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
684              NULL);
685     KMP_MB();
686   }
687 #endif /* BUILD_PARALLEL_ORDERED */
688 }
689 
690 /* __kmp_parallel_dxo -- Signal the next task. */
691 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
692   int gtid = *gtid_ref;
693 #ifdef BUILD_PARALLEL_ORDERED
694   int tid = __kmp_tid_from_gtid(gtid);
695   kmp_team_t *team = __kmp_team_from_gtid(gtid);
696 #endif /* BUILD_PARALLEL_ORDERED */
697 
698   if (__kmp_env_consistency_check) {
699     if (__kmp_threads[gtid]->th.th_root->r.r_active)
700       __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
701   }
702 #ifdef BUILD_PARALLEL_ORDERED
703   if (!team->t.t_serialized) {
704     KMP_MB(); /* Flush all pending memory write invalidates.  */
705 
706     /* use the tid of the next thread in this team */
707     /* TODO replace with general release procedure */
708     team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
709 
710     KMP_MB(); /* Flush all pending memory write invalidates.  */
711   }
712 #endif /* BUILD_PARALLEL_ORDERED */
713 }
714 
715 /* ------------------------------------------------------------------------ */
716 /* The BARRIER for a SINGLE process section is always explicit   */
717 
718 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
719   int status;
720   kmp_info_t *th;
721   kmp_team_t *team;
722 
723   if (!TCR_4(__kmp_init_parallel))
724     __kmp_parallel_initialize();
725   __kmp_resume_if_soft_paused();
726 
727   th = __kmp_threads[gtid];
728   team = th->th.th_team;
729   status = 0;
730 
731   th->th.th_ident = id_ref;
732 
733   if (team->t.t_serialized) {
734     status = 1;
735   } else {
736     kmp_int32 old_this = th->th.th_local.this_construct;
737 
738     ++th->th.th_local.this_construct;
739     /* try to set team count to thread count--success means thread got the
740        single block */
741     /* TODO: Should this be acquire or release? */
742     if (team->t.t_construct == old_this) {
743       status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
744                                               th->th.th_local.this_construct);
745     }
746 #if USE_ITT_BUILD
747     if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
748         KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
749         team->t.t_active_level ==
750             1) { // Only report metadata by master of active team at level 1
751       __kmp_itt_metadata_single(id_ref);
752     }
753 #endif /* USE_ITT_BUILD */
754   }
755 
756   if (__kmp_env_consistency_check) {
757     if (status && push_ws) {
758       __kmp_push_workshare(gtid, ct_psingle, id_ref);
759     } else {
760       __kmp_check_workshare(gtid, ct_psingle, id_ref);
761     }
762   }
763 #if USE_ITT_BUILD
764   if (status) {
765     __kmp_itt_single_start(gtid);
766   }
767 #endif /* USE_ITT_BUILD */
768   return status;
769 }
770 
771 void __kmp_exit_single(int gtid) {
772 #if USE_ITT_BUILD
773   __kmp_itt_single_end(gtid);
774 #endif /* USE_ITT_BUILD */
775   if (__kmp_env_consistency_check)
776     __kmp_pop_workshare(gtid, ct_psingle, NULL);
777 }
778 
779 /* Determine if we can go parallel or must use a serialized parallel region,
780  * and how many threads we can use.
781  * set_nthreads is the number of threads requested for the team.
782  * Returns 1 if we should serialize or only use one thread,
783  * otherwise the number of threads to use.
784  * The forkjoin lock is held by the caller. */
785 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
786                                  int master_tid, int set_nthreads,
787                                  int enter_teams) {
788   int capacity;
789   int new_nthreads;
790   KMP_DEBUG_ASSERT(__kmp_init_serial);
791   KMP_DEBUG_ASSERT(root && parent_team);
792   kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
793 
794   // If dyn-var is set, dynamically adjust the number of desired threads,
795   // according to the method specified by dynamic_mode.
796   new_nthreads = set_nthreads;
797   if (!get__dynamic_2(parent_team, master_tid)) {
798     ;
799   }
800 #ifdef USE_LOAD_BALANCE
801   else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
802     new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
803     if (new_nthreads == 1) {
804       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
805                     "reservation to 1 thread\n",
806                     master_tid));
807       return 1;
808     }
809     if (new_nthreads < set_nthreads) {
810       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
811                     "reservation to %d threads\n",
812                     master_tid, new_nthreads));
813     }
814   }
815 #endif /* USE_LOAD_BALANCE */
816   else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
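    // Cap the reservation at the number of currently unused processors. The
    // master thread (or the whole hot team when the root is not active) is
    // already counted in __kmp_nth but will be reused, so add it back in.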
817     new_nthreads = __kmp_avail_proc - __kmp_nth +
818                    (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
819     if (new_nthreads <= 1) {
820       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
821                     "reservation to 1 thread\n",
822                     master_tid));
823       return 1;
824     }
825     if (new_nthreads < set_nthreads) {
826       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
827                     "reservation to %d threads\n",
828                     master_tid, new_nthreads));
829     } else {
830       new_nthreads = set_nthreads;
831     }
832   } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
833     if (set_nthreads > 2) {
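      // Pick a pseudo-random team size in the range [1, set_nthreads].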
834       new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
835       new_nthreads = (new_nthreads % set_nthreads) + 1;
836       if (new_nthreads == 1) {
837         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
838                       "reservation to 1 thread\n",
839                       master_tid));
840         return 1;
841       }
842       if (new_nthreads < set_nthreads) {
843         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
844                       "reservation to %d threads\n",
845                       master_tid, new_nthreads));
846       }
847     }
848   } else {
849     KMP_ASSERT(0);
850   }
851 
852   // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
853   if (__kmp_nth + new_nthreads -
854           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
855       __kmp_max_nth) {
856     int tl_nthreads = __kmp_max_nth - __kmp_nth +
857                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
858     if (tl_nthreads <= 0) {
859       tl_nthreads = 1;
860     }
861 
862     // If dyn-var is false, emit a 1-time warning.
863     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
864       __kmp_reserve_warn = 1;
865       __kmp_msg(kmp_ms_warning,
866                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
867                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
868     }
869     if (tl_nthreads == 1) {
870       KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
871                     "reduced reservation to 1 thread\n",
872                     master_tid));
873       return 1;
874     }
875     KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
876                   "reservation to %d threads\n",
877                   master_tid, tl_nthreads));
878     new_nthreads = tl_nthreads;
879   }
880 
881   // Respect OMP_THREAD_LIMIT
882   int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
883   int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
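  // cg_nthreads is the number of threads currently in this thread's contention
  // group; max_cg_threads is that group's thread limit.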
884   if (cg_nthreads + new_nthreads -
885           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
886       max_cg_threads) {
887     int tl_nthreads = max_cg_threads - cg_nthreads +
888                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
889     if (tl_nthreads <= 0) {
890       tl_nthreads = 1;
891     }
892 
893     // If dyn-var is false, emit a 1-time warning.
894     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
895       __kmp_reserve_warn = 1;
896       __kmp_msg(kmp_ms_warning,
897                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
898                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
899     }
900     if (tl_nthreads == 1) {
901       KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
902                     "reduced reservation to 1 thread\n",
903                     master_tid));
904       return 1;
905     }
906     KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
907                   "reservation to %d threads\n",
908                   master_tid, tl_nthreads));
909     new_nthreads = tl_nthreads;
910   }
911 
912   // Check if the threads array is large enough, or needs expanding.
913   // See comment in __kmp_register_root() about the adjustment if
914   // __kmp_threads[0] == NULL.
915   capacity = __kmp_threads_capacity;
916   if (TCR_PTR(__kmp_threads[0]) == NULL) {
917     --capacity;
918   }
919   if (__kmp_nth + new_nthreads -
920           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
921       capacity) {
922     // Expand the threads array.
923     int slotsRequired = __kmp_nth + new_nthreads -
924                         (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
925                         capacity;
926     int slotsAdded = __kmp_expand_threads(slotsRequired);
927     if (slotsAdded < slotsRequired) {
928       // The threads array was not expanded enough.
929       new_nthreads -= (slotsRequired - slotsAdded);
930       KMP_ASSERT(new_nthreads >= 1);
931 
932       // If dyn-var is false, emit a 1-time warning.
933       if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
934         __kmp_reserve_warn = 1;
935         if (__kmp_tp_cached) {
936           __kmp_msg(kmp_ms_warning,
937                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
938                     KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
939                     KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
940         } else {
941           __kmp_msg(kmp_ms_warning,
942                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
943                     KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
944         }
945       }
946     }
947   }
948 
949 #ifdef KMP_DEBUG
950   if (new_nthreads == 1) {
951     KC_TRACE(10,
952              ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
953               "dead roots and rechecking; requested %d threads\n",
954               __kmp_get_gtid(), set_nthreads));
955   } else {
956     KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
957                   " %d threads\n",
958                   __kmp_get_gtid(), new_nthreads, set_nthreads));
959   }
960 #endif // KMP_DEBUG
961   return new_nthreads;
962 }
963 
964 /* Allocate threads from the thread pool and assign them to the new team. We
965    are assured that there are enough threads available because we checked on
966    that earlier while holding the forkjoin lock. */
967 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
968                                     kmp_info_t *master_th, int master_gtid) {
969   int i;
970   int use_hot_team;
971 
972   KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
973   KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
974   KMP_MB();
975 
976   /* first, let's setup the master thread */
977   master_th->th.th_info.ds.ds_tid = 0;
978   master_th->th.th_team = team;
979   master_th->th.th_team_nproc = team->t.t_nproc;
980   master_th->th.th_team_master = master_th;
981   master_th->th.th_team_serialized = FALSE;
982   master_th->th.th_dispatch = &team->t.t_dispatch[0];
983 
984 /* make sure we are not the optimized hot team */
985 #if KMP_NESTED_HOT_TEAMS
986   use_hot_team = 0;
987   kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
988   if (hot_teams) { // hot teams array is not allocated if
989     // KMP_HOT_TEAMS_MAX_LEVEL=0
990     int level = team->t.t_active_level - 1; // index in array of hot teams
991     if (master_th->th.th_teams_microtask) { // are we inside the teams?
992       if (master_th->th.th_teams_size.nteams > 1) {
993         ++level; // level was not increased in teams construct for
994         // team_of_masters
995       }
996       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
997           master_th->th.th_teams_level == team->t.t_level) {
998         ++level; // level was not increased in teams construct for
999         // team_of_workers before the parallel
1000       } // team->t.t_level will be increased inside parallel
1001     }
1002     if (level < __kmp_hot_teams_max_level) {
1003       if (hot_teams[level].hot_team) {
1004         // hot team has already been allocated for given level
1005         KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
1006         use_hot_team = 1; // the team is ready to use
1007       } else {
1008         use_hot_team = 0; // AC: threads are not allocated yet
1009         hot_teams[level].hot_team = team; // remember new hot team
1010         hot_teams[level].hot_team_nth = team->t.t_nproc;
1011       }
1012     } else {
1013       use_hot_team = 0;
1014     }
1015   }
1016 #else
1017   use_hot_team = team == root->r.r_hot_team;
1018 #endif
1019   if (!use_hot_team) {
1020 
1021     /* install the master thread */
1022     team->t.t_threads[0] = master_th;
1023     __kmp_initialize_info(master_th, team, 0, master_gtid);
1024 
1025     /* now, install the worker threads */
1026     for (i = 1; i < team->t.t_nproc; i++) {
1027 
1028       /* fork or reallocate a new thread and install it in team */
1029       kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1030       team->t.t_threads[i] = thr;
1031       KMP_DEBUG_ASSERT(thr);
1032       KMP_DEBUG_ASSERT(thr->th.th_team == team);
1033       /* align team and thread arrived states */
1034       KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1035                     "T#%d(%d:%d) join =%llu, plain=%llu\n",
1036                     __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1037                     __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1038                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1039                     team->t.t_bar[bs_plain_barrier].b_arrived));
1040       thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1041       thr->th.th_teams_level = master_th->th.th_teams_level;
1042       thr->th.th_teams_size = master_th->th.th_teams_size;
1043       { // Initialize threads' barrier data.
1044         int b;
1045         kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1046         for (b = 0; b < bs_last_barrier; ++b) {
1047           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1048           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1049 #if USE_DEBUGGER
1050           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1051 #endif
1052         }
1053       }
1054     }
1055 
1056 #if KMP_AFFINITY_SUPPORTED
1057     __kmp_partition_places(team);
1058 #endif
1059   }
1060 
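  // Mark the team for affinity re-display if any thread's team size or nesting
  // level changed since its previous parallel region.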
1061   if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1062     for (i = 0; i < team->t.t_nproc; i++) {
1063       kmp_info_t *thr = team->t.t_threads[i];
1064       if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1065           thr->th.th_prev_level != team->t.t_level) {
1066         team->t.t_display_affinity = 1;
1067         break;
1068       }
1069     }
1070   }
1071 
1072   KMP_MB();
1073 }
1074 
1075 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1076 // Propagate any changes to the floating point control registers out to the team
1077 // We try to avoid unnecessary writes to the relevant cache line in the team
1078 // structure, so we don't make changes unless they are needed.
1079 inline static void propagateFPControl(kmp_team_t *team) {
1080   if (__kmp_inherit_fp_control) {
1081     kmp_int16 x87_fpu_control_word;
1082     kmp_uint32 mxcsr;
1083 
1084     // Get master values of FPU control flags (both X87 and vector)
1085     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1086     __kmp_store_mxcsr(&mxcsr);
1087     mxcsr &= KMP_X86_MXCSR_MASK;
1088 
1089     // There is no point looking at t_fp_control_saved here.
1090     // If it is TRUE, we still have to update the values if they are different
1091     // from those we now have. If it is FALSE we didn't save anything yet, but
1092     // our objective is the same. We have to ensure that the values in the team
1093     // are the same as those we have.
1094     // So, this code achieves what we need whether or not t_fp_control_saved is
1095     // true. By checking whether the value needs updating we avoid unnecessary
1096     // writes that would put the cache-line into a written state, causing all
1097     // threads in the team to have to read it again.
1098     KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1099     KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1100     // Although we don't use this value, other code in the runtime wants to know
1101     // whether it should restore them. So we must ensure it is correct.
1102     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1103   } else {
1104     // Similarly here. Don't write to this cache-line in the team structure
1105     // unless we have to.
1106     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1107   }
1108 }
1109 
1110 // Do the opposite, setting the hardware registers to the updated values from
1111 // the team.
1112 inline static void updateHWFPControl(kmp_team_t *team) {
1113   if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1114     // Only reset the fp control regs if they were changed during the
1115     // parallel region that we are exiting.
1116     kmp_int16 x87_fpu_control_word;
1117     kmp_uint32 mxcsr;
1118     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1119     __kmp_store_mxcsr(&mxcsr);
1120     mxcsr &= KMP_X86_MXCSR_MASK;
1121 
1122     if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1123       __kmp_clear_x87_fpu_status_word();
1124       __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1125     }
1126 
1127     if (team->t.t_mxcsr != mxcsr) {
1128       __kmp_load_mxcsr(&team->t.t_mxcsr);
1129     }
1130   }
1131 }
1132 #else
1133 #define propagateFPControl(x) ((void)0)
1134 #define updateHWFPControl(x) ((void)0)
1135 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1136 
1137 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1138                                      int realloc); // forward declaration
1139 
1140 /* Run a parallel region that has been serialized, so runs only in a team of the
1141    single master thread. */
1142 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1143   kmp_info_t *this_thr;
1144   kmp_team_t *serial_team;
1145 
1146   KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1147 
1148   /* Skip all this code for autopar serialized loops since it results in
1149      unacceptable overhead */
1150   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1151     return;
1152 
1153   if (!TCR_4(__kmp_init_parallel))
1154     __kmp_parallel_initialize();
1155   __kmp_resume_if_soft_paused();
1156 
1157   this_thr = __kmp_threads[global_tid];
1158   serial_team = this_thr->th.th_serial_team;
1159 
1160   /* utilize the serialized team held by this thread */
1161   KMP_DEBUG_ASSERT(serial_team);
1162   KMP_MB();
1163 
1164   if (__kmp_tasking_mode != tskm_immediate_exec) {
1165     KMP_DEBUG_ASSERT(
1166         this_thr->th.th_task_team ==
1167         this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1168     KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1169                      NULL);
1170     KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1171                   "team %p, new task_team = NULL\n",
1172                   global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1173     this_thr->th.th_task_team = NULL;
1174   }
1175 
1176   kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1177   if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1178     proc_bind = proc_bind_false;
1179   } else if (proc_bind == proc_bind_default) {
1180     // No proc_bind clause was specified, so use the current value
1181     // of proc-bind-var for this parallel region.
1182     proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1183   }
1184   // Reset for next parallel region
1185   this_thr->th.th_set_proc_bind = proc_bind_default;
1186 
1187 #if OMPT_SUPPORT
1188   ompt_data_t ompt_parallel_data = ompt_data_none;
1189   ompt_data_t *implicit_task_data;
1190   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1191   if (ompt_enabled.enabled &&
1192       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1193 
1194     ompt_task_info_t *parent_task_info;
1195     parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1196 
1197     parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1198     if (ompt_enabled.ompt_callback_parallel_begin) {
1199       int team_size = 1;
1200 
1201       ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1202           &(parent_task_info->task_data), &(parent_task_info->frame),
1203           &ompt_parallel_data, team_size,
1204           ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1205     }
1206   }
1207 #endif // OMPT_SUPPORT
1208 
1209   if (this_thr->th.th_team != serial_team) {
1210     // Nested level will be an index in the nested nthreads array
1211     int level = this_thr->th.th_team->t.t_level;
1212 
1213     if (serial_team->t.t_serialized) {
1214       /* this serial team was already used
1215          TODO increase performance by making these locks more specific */
1216       kmp_team_t *new_team;
1217 
1218       __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1219 
1220       new_team =
1221           __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1222 #if OMPT_SUPPORT
1223                               ompt_parallel_data,
1224 #endif
1225                               proc_bind, &this_thr->th.th_current_task->td_icvs,
1226                               0 USE_NESTED_HOT_ARG(NULL));
1227       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1228       KMP_ASSERT(new_team);
1229 
1230       /* setup new serialized team and install it */
1231       new_team->t.t_threads[0] = this_thr;
1232       new_team->t.t_parent = this_thr->th.th_team;
1233       serial_team = new_team;
1234       this_thr->th.th_serial_team = serial_team;
1235 
1236       KF_TRACE(
1237           10,
1238           ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1239            global_tid, serial_team));
1240 
1241       /* TODO the above breaks the requirement that if we run out of resources,
1242          then we can still guarantee that serialized teams are ok, since we may
1243          need to allocate a new one */
1244     } else {
1245       KF_TRACE(
1246           10,
1247           ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1248            global_tid, serial_team));
1249     }
1250 
1251     /* we have to initialize this serial team */
1252     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1253     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1254     KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1255     serial_team->t.t_ident = loc;
1256     serial_team->t.t_serialized = 1;
1257     serial_team->t.t_nproc = 1;
1258     serial_team->t.t_parent = this_thr->th.th_team;
1259     serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1260     this_thr->th.th_team = serial_team;
1261     serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1262 
1263     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1264                   this_thr->th.th_current_task));
1265     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1266     this_thr->th.th_current_task->td_flags.executing = 0;
1267 
1268     __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1269 
1270     /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1271        implicit task for each serialized task represented by
1272        team->t.t_serialized? */
1273     copy_icvs(&this_thr->th.th_current_task->td_icvs,
1274               &this_thr->th.th_current_task->td_parent->td_icvs);
1275 
1276     // Thread value exists in the nested nthreads array for the next nested
1277     // level
1278     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1279       this_thr->th.th_current_task->td_icvs.nproc =
1280           __kmp_nested_nth.nth[level + 1];
1281     }
1282 
1283     if (__kmp_nested_proc_bind.used &&
1284         (level + 1 < __kmp_nested_proc_bind.used)) {
1285       this_thr->th.th_current_task->td_icvs.proc_bind =
1286           __kmp_nested_proc_bind.bind_types[level + 1];
1287     }
1288 
1289 #if USE_DEBUGGER
1290     serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1291 #endif
1292     this_thr->th.th_info.ds.ds_tid = 0;
1293 
1294     /* set thread cache values */
1295     this_thr->th.th_team_nproc = 1;
1296     this_thr->th.th_team_master = this_thr;
1297     this_thr->th.th_team_serialized = 1;
1298 
1299     serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1300     serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1301     serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1302 
1303     propagateFPControl(serial_team);
1304 
1305     /* check if we need to allocate dispatch buffers stack */
1306     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1307     if (!serial_team->t.t_dispatch->th_disp_buffer) {
1308       serial_team->t.t_dispatch->th_disp_buffer =
1309           (dispatch_private_info_t *)__kmp_allocate(
1310               sizeof(dispatch_private_info_t));
1311     }
1312     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1313 
1314     KMP_MB();
1315 
1316   } else {
1317     /* this serialized team is already being used,
1318      * that's fine, just add another nested level */
1319     KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1320     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1321     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1322     ++serial_team->t.t_serialized;
1323     this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1324 
1325     // Nested level will be an index in the nested nthreads array
1326     int level = this_thr->th.th_team->t.t_level;
1327     // Thread value exists in the nested nthreads array for the next nested
1328     // level
1329     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1330       this_thr->th.th_current_task->td_icvs.nproc =
1331           __kmp_nested_nth.nth[level + 1];
1332     }
1333     serial_team->t.t_level++;
1334     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1335                   "of serial team %p to %d\n",
1336                   global_tid, serial_team, serial_team->t.t_level));
1337 
1338     /* allocate/push dispatch buffers stack */
1339     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1340     {
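      // Push a fresh private dispatch buffer for this nesting level; the
      // previous buffer stays on the th_disp_buffer list and is popped when
      // this serialized region ends.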
1341       dispatch_private_info_t *disp_buffer =
1342           (dispatch_private_info_t *)__kmp_allocate(
1343               sizeof(dispatch_private_info_t));
1344       disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1345       serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1346     }
1347     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1348 
1349     KMP_MB();
1350   }
1351   KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1352 
1353   // Perform the display affinity functionality for
1354   // serialized parallel regions
1355   if (__kmp_display_affinity) {
1356     if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1357         this_thr->th.th_prev_num_threads != 1) {
1358       // NULL means use the affinity-format-var ICV
1359       __kmp_aux_display_affinity(global_tid, NULL);
1360       this_thr->th.th_prev_level = serial_team->t.t_level;
1361       this_thr->th.th_prev_num_threads = 1;
1362     }
1363   }
1364 
1365   if (__kmp_env_consistency_check)
1366     __kmp_push_parallel(global_tid, NULL);
1367 #if OMPT_SUPPORT
1368   serial_team->t.ompt_team_info.master_return_address = codeptr;
1369   if (ompt_enabled.enabled &&
1370       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1371     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1372 
1373     ompt_lw_taskteam_t lw_taskteam;
1374     __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1375                             &ompt_parallel_data, codeptr);
1376 
1377     __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1378     // don't use lw_taskteam after linking. content was swapped
1379 
1380     /* OMPT implicit task begin */
1381     implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
1382     if (ompt_enabled.ompt_callback_implicit_task) {
1383       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1384           ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1385           OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1386       OMPT_CUR_TASK_INFO(this_thr)
1387           ->thread_num = __kmp_tid_from_gtid(global_tid);
1388     }
1389 
1390     /* OMPT state */
1391     this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1392     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1393   }
1394 #endif
1395 }
1396 
1397 /* most of the work for a fork */
1398 /* return true if we really went parallel, false if serialized */
1399 int __kmp_fork_call(ident_t *loc, int gtid,
1400                     enum fork_context_e call_context, // Intel, GNU, ...
1401                     kmp_int32 argc, microtask_t microtask, launch_t invoker,
1402                     kmp_va_list ap) {
1403   void **argv;
1404   int i;
1405   int master_tid;
1406   int master_this_cons;
1407   kmp_team_t *team;
1408   kmp_team_t *parent_team;
1409   kmp_info_t *master_th;
1410   kmp_root_t *root;
1411   int nthreads;
1412   int master_active;
1413   int master_set_numthreads;
1414   int level;
1415   int active_level;
1416   int teams_level;
1417 #if KMP_NESTED_HOT_TEAMS
1418   kmp_hot_team_ptr_t **p_hot_teams;
1419 #endif
1420   { // KMP_TIME_BLOCK
1421     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1422     KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1423 
1424     KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1425     if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1426       /* Some systems prefer the stack for the root thread(s) to start with */
1427       /* some gap from the parent stack to prevent false sharing. */
1428       void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1429       /* These 2 lines below are so this does not get optimized out */
1430       if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1431         __kmp_stkpadding += (short)((kmp_int64)dummy);
1432     }
1433 
1434     /* initialize if needed */
1435     KMP_DEBUG_ASSERT(
1436         __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1437     if (!TCR_4(__kmp_init_parallel))
1438       __kmp_parallel_initialize();
1439     __kmp_resume_if_soft_paused();
1440 
1441     /* setup current data */
1442     master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1443     // shutdown
1444     parent_team = master_th->th.th_team;
1445     master_tid = master_th->th.th_info.ds.ds_tid;
1446     master_this_cons = master_th->th.th_local.this_construct;
1447     root = master_th->th.th_root;
1448     master_active = root->r.r_active;
1449     master_set_numthreads = master_th->th.th_set_nproc;
1450 
1451 #if OMPT_SUPPORT
1452     ompt_data_t ompt_parallel_data = ompt_data_none;
1453     ompt_data_t *parent_task_data;
1454     ompt_frame_t *ompt_frame;
1455     ompt_data_t *implicit_task_data;
1456     void *return_address = NULL;
1457 
1458     if (ompt_enabled.enabled) {
1459       __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1460                                     NULL, NULL);
1461       return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1462     }
1463 #endif
1464 
1465     // Nested level will be an index in the nested nthreads array
1466     level = parent_team->t.t_level;
1467     // used to launch non-serial teams even if nested is not allowed
1468     active_level = parent_team->t.t_active_level;
1469     // needed to check nesting inside the teams
1470     teams_level = master_th->th.th_teams_level;
1471 #if KMP_NESTED_HOT_TEAMS
1472     p_hot_teams = &master_th->th.th_hot_teams;
1473     if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1474       *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1475           sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1476       (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
      // the root's hot team is either the actual team or not needed here
      // (when active_level > 0)
1478       (*p_hot_teams)[0].hot_team_nth = 1;
1479     }
1480 #endif
1481 
1482 #if OMPT_SUPPORT
1483     if (ompt_enabled.enabled) {
1484       if (ompt_enabled.ompt_callback_parallel_begin) {
1485         int team_size = master_set_numthreads
1486                             ? master_set_numthreads
1487                             : get__nproc_2(parent_team, master_tid);
1488         int flags = OMPT_INVOKER(call_context) |
1489                     ((microtask == (microtask_t)__kmp_teams_master)
1490                          ? ompt_parallel_league
1491                          : ompt_parallel_team);
1492         ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1493             parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1494             return_address);
1495       }
1496       master_th->th.ompt_thread_info.state = ompt_state_overhead;
1497     }
1498 #endif
1499 
1500     master_th->th.th_ident = loc;
1501 
1502     if (master_th->th.th_teams_microtask && ap &&
1503         microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1504       // AC: This is start of parallel that is nested inside teams construct.
1505       // The team is actual (hot), all workers are ready at the fork barrier.
1506       // No lock needed to initialize the team a bit, then free workers.
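      // Illustrative user code for this path (a sketch, not from this file):
      //   #pragma omp teams num_teams(2)
      //   #pragma omp parallel       // nested parallel handled right here
      //   { /* ... */ }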
1507       parent_team->t.t_ident = loc;
1508       __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1509       parent_team->t.t_argc = argc;
1510       argv = (void **)parent_team->t.t_argv;
1511       for (i = argc - 1; i >= 0; --i)
1512         *argv++ = va_arg(kmp_va_deref(ap), void *);
      // Increment our nested depth levels, but do not increase serialization
1514       if (parent_team == master_th->th.th_serial_team) {
1515         // AC: we are in serialized parallel
1516         __kmpc_serialized_parallel(loc, gtid);
1517         KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1518 
1519         if (call_context == fork_context_gnu) {
1520           // AC: need to decrement t_serialized for enquiry functions to work
1521           // correctly, will restore at join time
1522           parent_team->t.t_serialized--;
1523           return TRUE;
1524         }
1525 
1526 #if OMPT_SUPPORT
1527         void *dummy;
1528         void **exit_frame_p;
1529 
1530         ompt_lw_taskteam_t lw_taskteam;
1531 
1532         if (ompt_enabled.enabled) {
1533           __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1534                                   &ompt_parallel_data, return_address);
1535           exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1536 
1537           __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
          // don't use lw_taskteam after linking. content was swapped
1539 
1540           /* OMPT implicit task begin */
1541           implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1542           if (ompt_enabled.ompt_callback_implicit_task) {
1543             OMPT_CUR_TASK_INFO(master_th)
1544                 ->thread_num = __kmp_tid_from_gtid(gtid);
1545             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1546                 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1547                 implicit_task_data, 1,
1548                 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1549           }
1550 
1551           /* OMPT state */
1552           master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1553         } else {
1554           exit_frame_p = &dummy;
1555         }
1556 #endif
1557         // AC: need to decrement t_serialized for enquiry functions to work
1558         // correctly, will restore at join time
1559         parent_team->t.t_serialized--;
1560 
1561         {
1562           KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1563           KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1564           __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1565 #if OMPT_SUPPORT
1566                                  ,
1567                                  exit_frame_p
1568 #endif
1569                                  );
1570         }
1571 
1572 #if OMPT_SUPPORT
1573         if (ompt_enabled.enabled) {
1574           *exit_frame_p = NULL;
1575           OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1576           if (ompt_enabled.ompt_callback_implicit_task) {
1577             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1578                 ompt_scope_end, NULL, implicit_task_data, 1,
1579                 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1580           }
1581           ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1582           __ompt_lw_taskteam_unlink(master_th);
1583           if (ompt_enabled.ompt_callback_parallel_end) {
1584             ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1585                 &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1586                 OMPT_INVOKER(call_context) | ompt_parallel_team,
1587                 return_address);
1588           }
1589           master_th->th.ompt_thread_info.state = ompt_state_overhead;
1590         }
1591 #endif
1592         return TRUE;
1593       }
1594 
1595       parent_team->t.t_pkfn = microtask;
1596       parent_team->t.t_invoke = invoker;
1597       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1598       parent_team->t.t_active_level++;
1599       parent_team->t.t_level++;
1600       parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1601 
1602 #if OMPT_SUPPORT
1603       if (ompt_enabled.enabled) {
1604         ompt_lw_taskteam_t lw_taskteam;
1605         __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1606                                 &ompt_parallel_data, return_address);
1607         __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1608       }
1609 #endif
1610 
1611       /* Change number of threads in the team if requested */
1612       if (master_set_numthreads) { // The parallel has num_threads clause
1613         if (master_set_numthreads < master_th->th.th_teams_size.nth) {
          // AC: can only reduce number of threads dynamically, can't increase
1615           kmp_info_t **other_threads = parent_team->t.t_threads;
1616           parent_team->t.t_nproc = master_set_numthreads;
1617           for (i = 0; i < master_set_numthreads; ++i) {
1618             other_threads[i]->th.th_team_nproc = master_set_numthreads;
1619           }
1620           // Keep extra threads hot in the team for possible next parallels
1621         }
1622         master_th->th.th_set_nproc = 0;
1623       }
1624 
1625 #if USE_DEBUGGER
1626       if (__kmp_debugging) { // Let debugger override number of threads.
1627         int nth = __kmp_omp_num_threads(loc);
1628         if (nth > 0) { // 0 means debugger doesn't want to change num threads
1629           master_set_numthreads = nth;
1630         }
1631       }
1632 #endif
1633 
1634 #if USE_ITT_BUILD
1635       if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1636            KMP_ITT_DEBUG) &&
1637           __kmp_forkjoin_frames_mode == 3 &&
1638           parent_team->t.t_active_level == 1 // only report frames at level 1
1639           && master_th->th.th_teams_size.nteams == 1) {
1640         kmp_uint64 tmp_time = __itt_get_timestamp();
1641         master_th->th.th_frame_time = tmp_time;
1642         parent_team->t.t_region_time = tmp_time;
1643       }
1644       if (__itt_stack_caller_create_ptr) {
1645         // create new stack stitching id before entering fork barrier
1646         parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1647       }
1648 #endif /* USE_ITT_BUILD */
1649 
1650       KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1651                     "master_th=%p, gtid=%d\n",
1652                     root, parent_team, master_th, gtid));
1653       __kmp_internal_fork(loc, gtid, parent_team);
1654       KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1655                     "master_th=%p, gtid=%d\n",
1656                     root, parent_team, master_th, gtid));
1657 
1658       if (call_context == fork_context_gnu)
1659         return TRUE;
1660 
1661       /* Invoke microtask for MASTER thread */
1662       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1663                     parent_team->t.t_id, parent_team->t.t_pkfn));
1664 
1665       if (!parent_team->t.t_invoke(gtid)) {
1666         KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
1667       }
1668       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1669                     parent_team->t.t_id, parent_team->t.t_pkfn));
1670       KMP_MB(); /* Flush all pending memory write invalidates.  */
1671 
1672       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1673 
1674       return TRUE;
1675     } // Parallel closely nested in teams construct
1676 
1677 #if KMP_DEBUG
1678     if (__kmp_tasking_mode != tskm_immediate_exec) {
1679       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1680                        parent_team->t.t_task_team[master_th->th.th_task_state]);
1681     }
1682 #endif
1683 
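    // Decide how many threads will run this parallel region: serialize if we
    // are already at max_active_levels; otherwise take the num_threads clause
    // value (if any) or the nproc ICV, and let __kmp_reserve_threads() clip
    // the count further.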
1684     if (parent_team->t.t_active_level >=
1685         master_th->th.th_current_task->td_icvs.max_active_levels) {
1686       nthreads = 1;
1687     } else {
1688       int enter_teams = ((ap == NULL && active_level == 0) ||
1689                          (ap && teams_level > 0 && teams_level == level));
1690       nthreads =
1691           master_set_numthreads
1692               ? master_set_numthreads
1693               : get__nproc_2(
1694                     parent_team,
1695                     master_tid); // TODO: get nproc directly from current task
1696 
      // Check whether we need to take the forkjoin lock (no need for a
      // serialized parallel outside of a teams construct). This code was moved
      // here from __kmp_reserve_threads() to speed up nested serialized
      // parallels.
1700       if (nthreads > 1) {
1701         if ((get__max_active_levels(master_th) == 1 &&
1702              (root->r.r_in_parallel && !enter_teams)) ||
1703             (__kmp_library == library_serial)) {
1704           KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1705                         " threads\n",
1706                         gtid, nthreads));
1707           nthreads = 1;
1708         }
1709       }
1710       if (nthreads > 1) {
1711         /* determine how many new threads we can use */
1712         __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
        /* AC: If we execute teams from a parallel region (on the host), then
           the teams should be created, but each can only have 1 thread if
           nesting is disabled. If teams is called from a serial region, then
           the teams and their threads should be created regardless of the
           nesting setting. */
1717         nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
1718                                          nthreads, enter_teams);
1719         if (nthreads == 1) {
1720           // Free lock for single thread execution here; for multi-thread
1721           // execution it will be freed later after team of threads created
1722           // and initialized
1723           __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1724         }
1725       }
1726     }
1727     KMP_DEBUG_ASSERT(nthreads > 0);
1728 
1729     // If we temporarily changed the set number of threads then restore it now
1730     master_th->th.th_set_nproc = 0;
1731 
1732     /* create a serialized parallel region? */
1733     if (nthreads == 1) {
1734 /* josh todo: hypothetical question: what do we do for OS X*? */
1735 #if KMP_OS_LINUX &&                                                            \
1736     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1737       void *args[argc];
1738 #else
1739       void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1740 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1741           KMP_ARCH_AARCH64) */
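      // Stack storage for the argument vector passed to the serialized
      // microtask (a VLA where supported, otherwise alloca).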
1742 
1743       KA_TRACE(20,
1744                ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1745 
1746       __kmpc_serialized_parallel(loc, gtid);
1747 
1748       if (call_context == fork_context_intel) {
1749         /* TODO this sucks, use the compiler itself to pass args! :) */
1750         master_th->th.th_serial_team->t.t_ident = loc;
1751         if (!ap) {
1752           // revert change made in __kmpc_serialized_parallel()
1753           master_th->th.th_serial_team->t.t_level--;
1754 // Get args from parent team for teams construct
1755 
1756 #if OMPT_SUPPORT
1757           void *dummy;
1758           void **exit_frame_p;
1759           ompt_task_info_t *task_info;
1760 
1761           ompt_lw_taskteam_t lw_taskteam;
1762 
1763           if (ompt_enabled.enabled) {
1764             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1765                                     &ompt_parallel_data, return_address);
1766 
1767             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
            // don't use lw_taskteam after linking. content was swapped
1769 
1770             task_info = OMPT_CUR_TASK_INFO(master_th);
1771             exit_frame_p = &(task_info->frame.exit_frame.ptr);
1772             if (ompt_enabled.ompt_callback_implicit_task) {
1773               OMPT_CUR_TASK_INFO(master_th)
1774                   ->thread_num = __kmp_tid_from_gtid(gtid);
1775               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1776                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1777                   &(task_info->task_data), 1,
1778                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1779                   ompt_task_implicit);
1780             }
1781 
1782             /* OMPT state */
1783             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1784           } else {
1785             exit_frame_p = &dummy;
1786           }
1787 #endif
1788 
1789           {
1790             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1791             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1792             __kmp_invoke_microtask(microtask, gtid, 0, argc,
1793                                    parent_team->t.t_argv
1794 #if OMPT_SUPPORT
1795                                    ,
1796                                    exit_frame_p
1797 #endif
1798                                    );
1799           }
1800 
1801 #if OMPT_SUPPORT
1802           if (ompt_enabled.enabled) {
1803             *exit_frame_p = NULL;
1804             if (ompt_enabled.ompt_callback_implicit_task) {
1805               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1806                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1807                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1808                   ompt_task_implicit);
1809             }
1810             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1811             __ompt_lw_taskteam_unlink(master_th);
1812             if (ompt_enabled.ompt_callback_parallel_end) {
1813               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1814                   &ompt_parallel_data, parent_task_data,
1815                   OMPT_INVOKER(call_context) | ompt_parallel_team,
1816                   return_address);
1817             }
1818             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1819           }
1820 #endif
1821         } else if (microtask == (microtask_t)__kmp_teams_master) {
1822           KMP_DEBUG_ASSERT(master_th->th.th_team ==
1823                            master_th->th.th_serial_team);
1824           team = master_th->th.th_team;
1825           // team->t.t_pkfn = microtask;
1826           team->t.t_invoke = invoker;
1827           __kmp_alloc_argv_entries(argc, team, TRUE);
1828           team->t.t_argc = argc;
1829           argv = (void **)team->t.t_argv;
1830           if (ap) {
1831             for (i = argc - 1; i >= 0; --i)
1832               *argv++ = va_arg(kmp_va_deref(ap), void *);
1833           } else {
1834             for (i = 0; i < argc; ++i)
1835               // Get args from parent team for teams construct
1836               argv[i] = parent_team->t.t_argv[i];
1837           }
1838           // AC: revert change made in __kmpc_serialized_parallel()
1839           //     because initial code in teams should have level=0
1840           team->t.t_level--;
1841           // AC: call special invoker for outer "parallel" of teams construct
1842           invoker(gtid);
1843 #if OMPT_SUPPORT
1844           if (ompt_enabled.enabled) {
1845             ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1846             if (ompt_enabled.ompt_callback_implicit_task) {
1847               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1848                   ompt_scope_end, NULL, &(task_info->task_data), 0,
1849                   OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1850             }
1851             if (ompt_enabled.ompt_callback_parallel_end) {
1852               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1853                   &ompt_parallel_data, parent_task_data,
1854                   OMPT_INVOKER(call_context) | ompt_parallel_league,
1855                   return_address);
1856             }
1857             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1858           }
1859 #endif
1860         } else {
1861           argv = args;
1862           for (i = argc - 1; i >= 0; --i)
1863             *argv++ = va_arg(kmp_va_deref(ap), void *);
1864           KMP_MB();
1865 
1866 #if OMPT_SUPPORT
1867           void *dummy;
1868           void **exit_frame_p;
1869           ompt_task_info_t *task_info;
1870 
1871           ompt_lw_taskteam_t lw_taskteam;
1872 
1873           if (ompt_enabled.enabled) {
1874             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1875                                     &ompt_parallel_data, return_address);
1876             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
            // don't use lw_taskteam after linking. content was swapped
1878             task_info = OMPT_CUR_TASK_INFO(master_th);
1879             exit_frame_p = &(task_info->frame.exit_frame.ptr);
1880 
1881             /* OMPT implicit task begin */
1882             implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1883             if (ompt_enabled.ompt_callback_implicit_task) {
1884               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1885                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1886                   implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1887                   ompt_task_implicit);
1888               OMPT_CUR_TASK_INFO(master_th)
1889                   ->thread_num = __kmp_tid_from_gtid(gtid);
1890             }
1891 
1892             /* OMPT state */
1893             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1894           } else {
1895             exit_frame_p = &dummy;
1896           }
1897 #endif
1898 
1899           {
1900             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1901             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1902             __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1903 #if OMPT_SUPPORT
1904                                    ,
1905                                    exit_frame_p
1906 #endif
1907                                    );
1908           }
1909 
1910 #if OMPT_SUPPORT
1911           if (ompt_enabled.enabled) {
1912             *exit_frame_p = NULL;
1913             if (ompt_enabled.ompt_callback_implicit_task) {
1914               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1915                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1916                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1917                   ompt_task_implicit);
1918             }
1919 
1920             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1921             __ompt_lw_taskteam_unlink(master_th);
1922             if (ompt_enabled.ompt_callback_parallel_end) {
1923               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1924                   &ompt_parallel_data, parent_task_data,
1925                   OMPT_INVOKER(call_context) | ompt_parallel_team,
1926                   return_address);
1927             }
1928             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1929           }
1930 #endif
1931         }
1932       } else if (call_context == fork_context_gnu) {
1933 #if OMPT_SUPPORT
1934         ompt_lw_taskteam_t lwt;
1935         __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1936                                 return_address);
1937 
1938         lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1939         __ompt_lw_taskteam_link(&lwt, master_th, 1);
// don't use lw_taskteam after linking. content was swapped
1941 #endif
1942 
1943         // we were called from GNU native code
1944         KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1945         return FALSE;
1946       } else {
1947         KMP_ASSERT2(call_context < fork_context_last,
1948                     "__kmp_fork_call: unknown fork_context parameter");
1949       }
1950 
1951       KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1952       KMP_MB();
1953       return FALSE;
1954     } // if (nthreads == 1)
1955 
    // GEH: only modify the executing flag in the case when not serialized;
    //      the serialized case is handled in __kmpc_serialized_parallel
1958     KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1959                   "curtask=%p, curtask_max_aclevel=%d\n",
1960                   parent_team->t.t_active_level, master_th,
1961                   master_th->th.th_current_task,
1962                   master_th->th.th_current_task->td_icvs.max_active_levels));
1963     // TODO: GEH - cannot do this assertion because root thread not set up as
1964     // executing
1965     // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1966     master_th->th.th_current_task->td_flags.executing = 0;
1967 
1968     if (!master_th->th.th_teams_microtask || level > teams_level) {
1969       /* Increment our nested depth level */
1970       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1971     }
1972 
1973     // See if we need to make a copy of the ICVs.
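    // (Illustrative, not from this file: with OMP_NUM_THREADS=4,2 the nested
    // list is {4, 2}, so at nesting level 0 the next level's value, 2, becomes
    // the nproc ICV for the new team.)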
1974     int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1975     if ((level + 1 < __kmp_nested_nth.used) &&
1976         (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1977       nthreads_icv = __kmp_nested_nth.nth[level + 1];
1978     } else {
1979       nthreads_icv = 0; // don't update
1980     }
1981 
1982     // Figure out the proc_bind_policy for the new team.
1983     kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1984     kmp_proc_bind_t proc_bind_icv =
1985         proc_bind_default; // proc_bind_default means don't update
1986     if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1987       proc_bind = proc_bind_false;
1988     } else {
1989       if (proc_bind == proc_bind_default) {
1990         // No proc_bind clause specified; use current proc-bind-var for this
1991         // parallel region
1992         proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1993       }
1994       /* else: The proc_bind policy was specified explicitly on parallel clause.
1995          This overrides proc-bind-var for this parallel region, but does not
1996          change proc-bind-var. */
1997       // Figure the value of proc-bind-var for the child threads.
1998       if ((level + 1 < __kmp_nested_proc_bind.used) &&
1999           (__kmp_nested_proc_bind.bind_types[level + 1] !=
2000            master_th->th.th_current_task->td_icvs.proc_bind)) {
2001         proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2002       }
2003     }
2004 
2005     // Reset for next parallel region
2006     master_th->th.th_set_proc_bind = proc_bind_default;
2007 
2008     if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
2009       kmp_internal_control_t new_icvs;
2010       copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2011       new_icvs.next = NULL;
2012       if (nthreads_icv > 0) {
2013         new_icvs.nproc = nthreads_icv;
2014       }
2015       if (proc_bind_icv != proc_bind_default) {
2016         new_icvs.proc_bind = proc_bind_icv;
2017       }
2018 
2019       /* allocate a new parallel team */
2020       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2021       team = __kmp_allocate_team(root, nthreads, nthreads,
2022 #if OMPT_SUPPORT
2023                                  ompt_parallel_data,
2024 #endif
2025                                  proc_bind, &new_icvs,
2026                                  argc USE_NESTED_HOT_ARG(master_th));
2027     } else {
2028       /* allocate a new parallel team */
2029       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2030       team = __kmp_allocate_team(root, nthreads, nthreads,
2031 #if OMPT_SUPPORT
2032                                  ompt_parallel_data,
2033 #endif
2034                                  proc_bind,
2035                                  &master_th->th.th_current_task->td_icvs,
2036                                  argc USE_NESTED_HOT_ARG(master_th));
2037     }
2038     KF_TRACE(
2039         10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2040 
2041     /* setup the new team */
2042     KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2043     KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2044     KMP_CHECK_UPDATE(team->t.t_ident, loc);
2045     KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2046     KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2047 #if OMPT_SUPPORT
2048     KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2049                           return_address);
2050 #endif
2051     KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2052     // TODO: parent_team->t.t_level == INT_MAX ???
2053     if (!master_th->th.th_teams_microtask || level > teams_level) {
2054       int new_level = parent_team->t.t_level + 1;
2055       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2056       new_level = parent_team->t.t_active_level + 1;
2057       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2058     } else {
2059       // AC: Do not increase parallel level at start of the teams construct
2060       int new_level = parent_team->t.t_level;
2061       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2062       new_level = parent_team->t.t_active_level;
2063       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2064     }
2065     kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2066     // set master's schedule as new run-time schedule
2067     KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2068 
2069     KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2070     KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2071 
2072     // Update the floating point rounding in the team if required.
2073     propagateFPControl(team);
2074 
2075     if (__kmp_tasking_mode != tskm_immediate_exec) {
      // Set master's task team to the team's task team. Unless this is a hot
      // team, it should be NULL.
2078       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2079                        parent_team->t.t_task_team[master_th->th.th_task_state]);
2080       KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
2081                     "%p, new task_team %p / team %p\n",
2082                     __kmp_gtid_from_thread(master_th),
2083                     master_th->th.th_task_team, parent_team,
2084                     team->t.t_task_team[master_th->th.th_task_state], team));
2085 
2086       if (active_level || master_th->th.th_task_team) {
        // Save a memo of the master's task_state
2088         KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2089         if (master_th->th.th_task_state_top >=
2090             master_th->th.th_task_state_stack_sz) { // increase size
2091           kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2092           kmp_uint8 *old_stack, *new_stack;
2093           kmp_uint32 i;
2094           new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2095           for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2096             new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2097           }
2098           for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2099                ++i) { // zero-init rest of stack
2100             new_stack[i] = 0;
2101           }
2102           old_stack = master_th->th.th_task_state_memo_stack;
2103           master_th->th.th_task_state_memo_stack = new_stack;
2104           master_th->th.th_task_state_stack_sz = new_size;
2105           __kmp_free(old_stack);
2106         }
2107         // Store master's task_state on stack
2108         master_th->th
2109             .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2110             master_th->th.th_task_state;
2111         master_th->th.th_task_state_top++;
2112 #if KMP_NESTED_HOT_TEAMS
2113         if (master_th->th.th_hot_teams &&
2114             active_level < __kmp_hot_teams_max_level &&
2115             team == master_th->th.th_hot_teams[active_level].hot_team) {
2116           // Restore master's nested state if nested hot team
2117           master_th->th.th_task_state =
2118               master_th->th
2119                   .th_task_state_memo_stack[master_th->th.th_task_state_top];
2120         } else {
2121 #endif
2122           master_th->th.th_task_state = 0;
2123 #if KMP_NESTED_HOT_TEAMS
2124         }
2125 #endif
2126       }
2127 #if !KMP_NESTED_HOT_TEAMS
2128       KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2129                        (team == root->r.r_hot_team));
2130 #endif
2131     }
2132 
2133     KA_TRACE(
2134         20,
2135         ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2136          gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2137          team->t.t_nproc));
2138     KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2139                      (team->t.t_master_tid == 0 &&
2140                       (team->t.t_parent == root->r.r_root_team ||
2141                        team->t.t_parent->t.t_serialized)));
2142     KMP_MB();
2143 
2144     /* now, setup the arguments */
2145     argv = (void **)team->t.t_argv;
2146     if (ap) {
2147       for (i = argc - 1; i >= 0; --i) {
2148         void *new_argv = va_arg(kmp_va_deref(ap), void *);
2149         KMP_CHECK_UPDATE(*argv, new_argv);
2150         argv++;
2151       }
2152     } else {
2153       for (i = 0; i < argc; ++i) {
2154         // Get args from parent team for teams construct
2155         KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2156       }
2157     }
2158 
2159     /* now actually fork the threads */
2160     KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2161     if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2162       root->r.r_active = TRUE;
2163 
2164     __kmp_fork_team_threads(root, team, master_th, gtid);
2165     __kmp_setup_icv_copy(team, nthreads,
2166                          &master_th->th.th_current_task->td_icvs, loc);
2167 
2168 #if OMPT_SUPPORT
2169     master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2170 #endif
2171 
2172     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2173 
2174 #if USE_ITT_BUILD
2175     if (team->t.t_active_level == 1 // only report frames at level 1
2176         && !master_th->th.th_teams_microtask) { // not in teams construct
2177 #if USE_ITT_NOTIFY
2178       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2179           (__kmp_forkjoin_frames_mode == 3 ||
2180            __kmp_forkjoin_frames_mode == 1)) {
2181         kmp_uint64 tmp_time = 0;
2182         if (__itt_get_timestamp_ptr)
2183           tmp_time = __itt_get_timestamp();
2184         // Internal fork - report frame begin
2185         master_th->th.th_frame_time = tmp_time;
2186         if (__kmp_forkjoin_frames_mode == 3)
2187           team->t.t_region_time = tmp_time;
2188       } else
2189 // only one notification scheme (either "submit" or "forking/joined", not both)
2190 #endif /* USE_ITT_NOTIFY */
2191           if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2192               __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2193         // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2194         __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2195       }
2196     }
2197 #endif /* USE_ITT_BUILD */
2198 
2199     /* now go on and do the work */
2200     KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2201     KMP_MB();
2202     KF_TRACE(10,
2203              ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2204               root, team, master_th, gtid));
2205 
2206 #if USE_ITT_BUILD
2207     if (__itt_stack_caller_create_ptr) {
2208       team->t.t_stack_id =
2209           __kmp_itt_stack_caller_create(); // create new stack stitching id
2210       // before entering fork barrier
2211     }
2212 #endif /* USE_ITT_BUILD */
2213 
    // AC: skip __kmp_internal_fork for the teams construct; let only the
    // master threads execute
2216     if (ap) {
2217       __kmp_internal_fork(loc, gtid, team);
2218       KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2219                     "master_th=%p, gtid=%d\n",
2220                     root, team, master_th, gtid));
2221     }
2222 
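    // For the GNU (libgomp-compatible) interface the caller is expected to
    // invoke the microtask in the primary thread itself, so do not invoke it
    // here.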
2223     if (call_context == fork_context_gnu) {
2224       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2225       return TRUE;
2226     }
2227 
2228     /* Invoke microtask for MASTER thread */
2229     KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2230                   team->t.t_id, team->t.t_pkfn));
2231   } // END of timer KMP_fork_call block
2232 
2233 #if KMP_STATS_ENABLED
2234   // If beginning a teams construct, then change thread state
2235   stats_state_e previous_state = KMP_GET_THREAD_STATE();
2236   if (!ap) {
2237     KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2238   }
2239 #endif
2240 
2241   if (!team->t.t_invoke(gtid)) {
2242     KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
2243   }
2244 
2245 #if KMP_STATS_ENABLED
2246   // If was beginning of a teams construct, then reset thread state
2247   if (!ap) {
2248     KMP_SET_THREAD_STATE(previous_state);
2249   }
2250 #endif
2251 
2252   KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2253                 team->t.t_id, team->t.t_pkfn));
2254   KMP_MB(); /* Flush all pending memory write invalidates.  */
2255 
2256   KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2257 
2258 #if OMPT_SUPPORT
2259   if (ompt_enabled.enabled) {
2260     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2261   }
2262 #endif
2263 
2264   return TRUE;
2265 }
2266 
2267 #if OMPT_SUPPORT
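// Restore the OMPT thread state that matches the enclosing region
// (serial vs. parallel) once a join has completed.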
2268 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2269                                             kmp_team_t *team) {
2270   // restore state outside the region
2271   thread->th.ompt_thread_info.state =
2272       ((team->t.t_serialized) ? ompt_state_work_serial
2273                               : ompt_state_work_parallel);
2274 }
2275 
2276 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2277                                    kmp_team_t *team, ompt_data_t *parallel_data,
2278                                    int flags, void *codeptr) {
2279   ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2280   if (ompt_enabled.ompt_callback_parallel_end) {
2281     ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2282         parallel_data, &(task_info->task_data), flags, codeptr);
2283   }
2284 
2285   task_info->frame.enter_frame = ompt_data_none;
2286   __kmp_join_restore_state(thread, team);
2287 }
2288 #endif
2289 
2290 void __kmp_join_call(ident_t *loc, int gtid
2291 #if OMPT_SUPPORT
2292                      ,
2293                      enum fork_context_e fork_context
2294 #endif
2295                      ,
2296                      int exit_teams) {
2297   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2298   kmp_team_t *team;
2299   kmp_team_t *parent_team;
2300   kmp_info_t *master_th;
2301   kmp_root_t *root;
2302   int master_active;
2303 
2304   KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2305 
2306   /* setup current data */
2307   master_th = __kmp_threads[gtid];
2308   root = master_th->th.th_root;
2309   team = master_th->th.th_team;
2310   parent_team = team->t.t_parent;
2311 
2312   master_th->th.th_ident = loc;
2313 
2314 #if OMPT_SUPPORT
2315   void *team_microtask = (void *)team->t.t_pkfn;
  // For the GOMP interface with a serialized parallel region, we need
  // __kmpc_end_serialized_parallel to call the hooks for the OMPT
  // end-implicit-task and end-parallel events.
2319   if (ompt_enabled.enabled &&
2320       !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2321     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2322   }
2323 #endif
2324 
2325 #if KMP_DEBUG
2326   if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2327     KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2328                   "th_task_team = %p\n",
2329                   __kmp_gtid_from_thread(master_th), team,
2330                   team->t.t_task_team[master_th->th.th_task_state],
2331                   master_th->th.th_task_team));
2332     KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2333                      team->t.t_task_team[master_th->th.th_task_state]);
2334   }
2335 #endif
2336 
2337   if (team->t.t_serialized) {
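    // Serialized region: adjust teams-related counters if needed, then let
    // __kmpc_end_serialized_parallel() do the actual unwinding.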
2338     if (master_th->th.th_teams_microtask) {
2339       // We are in teams construct
2340       int level = team->t.t_level;
2341       int tlevel = master_th->th.th_teams_level;
2342       if (level == tlevel) {
2343         // AC: we haven't incremented it earlier at start of teams construct,
2344         //     so do it here - at the end of teams construct
2345         team->t.t_level++;
2346       } else if (level == tlevel + 1) {
2347         // AC: we are exiting parallel inside teams, need to increment
2348         // serialization in order to restore it in the next call to
2349         // __kmpc_end_serialized_parallel
2350         team->t.t_serialized++;
2351       }
2352     }
2353     __kmpc_end_serialized_parallel(loc, gtid);
2354 
2355 #if OMPT_SUPPORT
2356     if (ompt_enabled.enabled) {
2357       __kmp_join_restore_state(master_th, parent_team);
2358     }
2359 #endif
2360 
2361     return;
2362   }
2363 
2364   master_active = team->t.t_master_active;
2365 
2366   if (!exit_teams) {
    // AC: No barrier for internal teams at exit from the teams construct.
    //     But there is a barrier for the external team (league).
    __kmp_internal_join(loc, gtid, team);
  } else {
    // AC: no tasking in teams (outside of any parallel)
    master_th->th.th_task_state = 0;
2373   }
2374 
2375   KMP_MB();
2376 
2377 #if OMPT_SUPPORT
2378   ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2379   void *codeptr = team->t.ompt_team_info.master_return_address;
2380 #endif
2381 
2382 #if USE_ITT_BUILD
2383   if (__itt_stack_caller_create_ptr) {
2384     // destroy the stack stitching id after join barrier
2385     __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2386   }
2387   // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2388   if (team->t.t_active_level == 1 &&
2389       (!master_th->th.th_teams_microtask || /* not in teams construct */
2390        master_th->th.th_teams_size.nteams == 1)) {
2391     master_th->th.th_ident = loc;
2392     // only one notification scheme (either "submit" or "forking/joined", not
2393     // both)
2394     if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2395         __kmp_forkjoin_frames_mode == 3)
2396       __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2397                              master_th->th.th_frame_time, 0, loc,
2398                              master_th->th.th_team_nproc, 1);
2399     else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2400              !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2401       __kmp_itt_region_joined(gtid);
2402   } // active_level == 1
2403 #endif /* USE_ITT_BUILD */
2404 
2405   if (master_th->th.th_teams_microtask && !exit_teams &&
2406       team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2407       team->t.t_level == master_th->th.th_teams_level + 1) {
// AC: We need to leave the team structure intact at the end of a parallel
// inside the teams construct, so that at the next parallel the same (hot)
// team works; only adjust nesting levels
2411 #if OMPT_SUPPORT
2412     ompt_data_t ompt_parallel_data = ompt_data_none;
2413     if (ompt_enabled.enabled) {
2414       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2415       if (ompt_enabled.ompt_callback_implicit_task) {
2416         int ompt_team_size = team->t.t_nproc;
2417         ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2418             ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2419             OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2420       }
2421       task_info->frame.exit_frame = ompt_data_none;
2422       task_info->task_data = ompt_data_none;
2423       ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2424       __ompt_lw_taskteam_unlink(master_th);
2425     }
2426 #endif
2427     /* Decrement our nested depth level */
2428     team->t.t_level--;
2429     team->t.t_active_level--;
2430     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2431 
2432     // Restore number of threads in the team if needed. This code relies on
2433     // the proper adjustment of th_teams_size.nth after the fork in
2434     // __kmp_teams_master on each teams master in the case that
2435     // __kmp_reserve_threads reduced it.
2436     if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2437       int old_num = master_th->th.th_team_nproc;
2438       int new_num = master_th->th.th_teams_size.nth;
2439       kmp_info_t **other_threads = team->t.t_threads;
2440       team->t.t_nproc = new_num;
2441       for (int i = 0; i < old_num; ++i) {
2442         other_threads[i]->th.th_team_nproc = new_num;
2443       }
      // Adjust the states of the team's unused threads
2445       for (int i = old_num; i < new_num; ++i) {
2446         // Re-initialize thread's barrier data.
2447         KMP_DEBUG_ASSERT(other_threads[i]);
2448         kmp_balign_t *balign = other_threads[i]->th.th_bar;
2449         for (int b = 0; b < bs_last_barrier; ++b) {
2450           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2451           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2452 #if USE_DEBUGGER
2453           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2454 #endif
2455         }
2456         if (__kmp_tasking_mode != tskm_immediate_exec) {
2457           // Synchronize thread's task state
2458           other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2459         }
2460       }
2461     }
2462 
2463 #if OMPT_SUPPORT
2464     if (ompt_enabled.enabled) {
2465       __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2466                       OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2467     }
2468 #endif
2469 
2470     return;
2471   }
2472 
2473   /* do cleanup and restore the parent team */
2474   master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2475   master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2476 
2477   master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2478 
2479   /* jc: The following lock has instructions with REL and ACQ semantics,
2480      separating the parallel user code called in this parallel region
2481      from the serial user code called after this function returns. */
2482   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2483 
2484   if (!master_th->th.th_teams_microtask ||
2485       team->t.t_level > master_th->th.th_teams_level) {
2486     /* Decrement our nested depth level */
2487     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2488   }
2489   KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2490 
2491 #if OMPT_SUPPORT
2492   if (ompt_enabled.enabled) {
2493     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2494     if (ompt_enabled.ompt_callback_implicit_task) {
2495       int flags = (team_microtask == (void *)__kmp_teams_master)
2496                       ? ompt_task_initial
2497                       : ompt_task_implicit;
2498       int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2499       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2500           ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2501           OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2502     }
2503     task_info->frame.exit_frame = ompt_data_none;
2504     task_info->task_data = ompt_data_none;
2505   }
2506 #endif
2507 
2508   KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2509                 master_th, team));
2510   __kmp_pop_current_task_from_thread(master_th);
2511 
2512 #if KMP_AFFINITY_SUPPORTED
2513   // Restore master thread's partition.
2514   master_th->th.th_first_place = team->t.t_first_place;
2515   master_th->th.th_last_place = team->t.t_last_place;
2516 #endif // KMP_AFFINITY_SUPPORTED
2517   master_th->th.th_def_allocator = team->t.t_def_allocator;
2518 
2519   updateHWFPControl(team);
2520 
2521   if (root->r.r_active != master_active)
2522     root->r.r_active = master_active;
2523 
2524   __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2525                             master_th)); // this will free worker threads
2526 
  /* This race was fun to find. Make sure the following is in the critical
     region; otherwise assertions may fail occasionally since the old team may
     be reallocated and the hierarchy appears inconsistent. It is actually safe
     to run and won't cause any bugs, but will cause those assertion failures.
     It's only one deref & assign, so we might as well put this in the critical
     region. */
2532   master_th->th.th_team = parent_team;
2533   master_th->th.th_team_nproc = parent_team->t.t_nproc;
2534   master_th->th.th_team_master = parent_team->t.t_threads[0];
2535   master_th->th.th_team_serialized = parent_team->t.t_serialized;
2536 
2537   /* restore serialized team, if need be */
2538   if (parent_team->t.t_serialized &&
2539       parent_team != master_th->th.th_serial_team &&
2540       parent_team != root->r.r_root_team) {
2541     __kmp_free_team(root,
2542                     master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2543     master_th->th.th_serial_team = parent_team;
2544   }
2545 
2546   if (__kmp_tasking_mode != tskm_immediate_exec) {
2547     if (master_th->th.th_task_state_top >
2548         0) { // Restore task state from memo stack
2549       KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2550       // Remember master's state if we re-use this nested hot team
2551       master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2552           master_th->th.th_task_state;
2553       --master_th->th.th_task_state_top; // pop
2554       // Now restore state at this level
2555       master_th->th.th_task_state =
2556           master_th->th
2557               .th_task_state_memo_stack[master_th->th.th_task_state_top];
2558     }
2559     // Copy the task team from the parent team to the master thread
2560     master_th->th.th_task_team =
2561         parent_team->t.t_task_team[master_th->th.th_task_state];
2562     KA_TRACE(20,
2563              ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2564               __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2565               parent_team));
2566   }
2567 
2568   // TODO: GEH - cannot do this assertion because root thread not set up as
2569   // executing
2570   // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2571   master_th->th.th_current_task->td_flags.executing = 1;
2572 
2573   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2574 
2575 #if OMPT_SUPPORT
2576   int flags =
2577       OMPT_INVOKER(fork_context) |
2578       ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2579                                                       : ompt_parallel_team);
2580   if (ompt_enabled.enabled) {
2581     __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2582                     codeptr);
2583   }
2584 #endif
2585 
2586   KMP_MB();
2587   KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2588 }
2589 
2590 /* Check whether we should push an internal control record onto the
2591    serial team stack.  If so, do it.  */
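/* A record is pushed at most once per serialization level (note the
   serial_nesting_level check below), capturing the current ICVs so they can
   be restored later. */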
2592 void __kmp_save_internal_controls(kmp_info_t *thread) {
2593 
2594   if (thread->th.th_team != thread->th.th_serial_team) {
2595     return;
2596   }
2597   if (thread->th.th_team->t.t_serialized > 1) {
2598     int push = 0;
2599 
2600     if (thread->th.th_team->t.t_control_stack_top == NULL) {
2601       push = 1;
2602     } else {
2603       if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2604           thread->th.th_team->t.t_serialized) {
2605         push = 1;
2606       }
2607     }
2608     if (push) { /* push a record on the serial team's stack */
2609       kmp_internal_control_t *control =
2610           (kmp_internal_control_t *)__kmp_allocate(
2611               sizeof(kmp_internal_control_t));
2612 
2613       copy_icvs(control, &thread->th.th_current_task->td_icvs);
2614 
2615       control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2616 
2617       control->next = thread->th.th_team->t.t_control_stack_top;
2618       thread->th.th_team->t.t_control_stack_top = control;
2619     }
2620   }
2621 }
2622 
2623 /* Changes set_nproc */
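/* Typically reached via the omp_set_num_threads() API entry point. */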
2624 void __kmp_set_num_threads(int new_nth, int gtid) {
2625   kmp_info_t *thread;
2626   kmp_root_t *root;
2627 
2628   KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2629   KMP_DEBUG_ASSERT(__kmp_init_serial);
2630 
2631   if (new_nth < 1)
2632     new_nth = 1;
2633   else if (new_nth > __kmp_max_nth)
2634     new_nth = __kmp_max_nth;
2635 
2636   KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2637   thread = __kmp_threads[gtid];
2638   if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2639     return; // nothing to do
2640 
2641   __kmp_save_internal_controls(thread);
2642 
2643   set__nproc(thread, new_nth);
2644 
2645   // If this omp_set_num_threads() call will cause the hot team size to be
2646   // reduced (in the absence of a num_threads clause), then reduce it now,
2647   // rather than waiting for the next parallel region.
2648   root = thread->th.th_root;
2649   if (__kmp_init_parallel && (!root->r.r_active) &&
2650       (root->r.r_hot_team->t.t_nproc > new_nth)
2651 #if KMP_NESTED_HOT_TEAMS
2652       && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2653 #endif
2654       ) {
2655     kmp_team_t *hot_team = root->r.r_hot_team;
2656     int f;
2657 
2658     __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2659 
2660     // Release the extra threads we don't need any more.
2661     for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2662       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2663       if (__kmp_tasking_mode != tskm_immediate_exec) {
2664         // When decreasing team size, threads no longer in the team should unref
2665         // task team.
2666         hot_team->t.t_threads[f]->th.th_task_team = NULL;
2667       }
2668       __kmp_free_thread(hot_team->t.t_threads[f]);
2669       hot_team->t.t_threads[f] = NULL;
2670     }
2671     hot_team->t.t_nproc = new_nth;
2672 #if KMP_NESTED_HOT_TEAMS
2673     if (thread->th.th_hot_teams) {
2674       KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2675       thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2676     }
2677 #endif
2678 
2679     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2680 
2681     // Update the t_nproc field in the threads that are still active.
2682     for (f = 0; f < new_nth; f++) {
2683       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2684       hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2685     }
    // Special flag to mark a size change caused by omp_set_num_threads()
2687     hot_team->t.t_size_changed = -1;
2688   }
2689 }
2690 
2691 /* Changes max_active_levels */
2692 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2693   kmp_info_t *thread;
2694 
2695   KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2696                 "%d = (%d)\n",
2697                 gtid, max_active_levels));
2698   KMP_DEBUG_ASSERT(__kmp_init_serial);
2699 
2700   // validate max_active_levels
2701   if (max_active_levels < 0) {
2702     KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2703     // We ignore this call if the user has specified a negative value.
2704     // The current setting won't be changed. The last valid setting will be
2705     // used. A warning will be issued (if warnings are allowed as controlled by
2706     // the KMP_WARNINGS env var).
2707     KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2708                   "max_active_levels for thread %d = (%d)\n",
2709                   gtid, max_active_levels));
2710     return;
2711   }
2712   if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
    // OK: max_active_levels is within the valid range
    // [0; KMP_MAX_ACTIVE_LEVELS_LIMIT].
    // A zero value is allowed. (implementation defined behavior)
2716   } else {
2717     KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2718                 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2719     max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
    // The current upper limit is MAX_INT. (implementation defined behavior)
    // If the input exceeds the upper limit, it is clamped to the upper limit.
    // (implementation defined behavior)
    // As long as the limit is MAX_INT, this branch should never be reached.
2724   }
2725   KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2726                 "max_active_levels for thread %d = (%d)\n",
2727                 gtid, max_active_levels));
2728 
2729   thread = __kmp_threads[gtid];
2730 
2731   __kmp_save_internal_controls(thread);
2732 
2733   set__max_active_levels(thread, max_active_levels);
2734 }
2735 
2736 /* Gets max_active_levels */
2737 int __kmp_get_max_active_levels(int gtid) {
2738   kmp_info_t *thread;
2739 
2740   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2741   KMP_DEBUG_ASSERT(__kmp_init_serial);
2742 
2743   thread = __kmp_threads[gtid];
2744   KMP_DEBUG_ASSERT(thread->th.th_current_task);
2745   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2746                 "curtask_maxaclevel=%d\n",
2747                 gtid, thread->th.th_current_task,
2748                 thread->th.th_current_task->td_icvs.max_active_levels));
2749   return thread->th.th_current_task->td_icvs.max_active_levels;
2750 }
2751 
2752 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2753 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2754 
2755 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2756 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2757   kmp_info_t *thread;
2758   kmp_sched_t orig_kind;
2759   //    kmp_team_t *team;
2760 
2761   KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2762                 gtid, (int)kind, chunk));
2763   KMP_DEBUG_ASSERT(__kmp_init_serial);
2764 
2765   // Check if the kind parameter is valid, correct if needed.
2766   // Valid parameters should fit in one of two intervals - standard or extended:
2767   //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2768   // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
2769   orig_kind = kind;
2770   kind = __kmp_sched_without_mods(kind);
2771 
2772   if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2773       (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2774     // TODO: Hint needs attention in case we change the default schedule.
2775     __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2776               KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2777               __kmp_msg_null);
2778     kind = kmp_sched_default;
2779     chunk = 0; // ignore chunk value in case of bad kind
2780   }
2781 
2782   thread = __kmp_threads[gtid];
2783 
2784   __kmp_save_internal_controls(thread);
2785 
2786   if (kind < kmp_sched_upper_std) {
2787     if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
      // differentiate static chunked vs. unchunked: an invalid chunk value
      // indicates the unchunked schedule (which is the default)
2790       thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2791     } else {
2792       thread->th.th_current_task->td_icvs.sched.r_sched_type =
2793           __kmp_sch_map[kind - kmp_sched_lower - 1];
2794     }
2795   } else {
2796     //    __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2797     //    kmp_sched_lower - 2 ];
2798     thread->th.th_current_task->td_icvs.sched.r_sched_type =
2799         __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2800                       kmp_sched_lower - 2];
2801   }
2802   __kmp_sched_apply_mods_intkind(
2803       orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2804   if (kind == kmp_sched_auto || chunk < 1) {
2805     // ignore parameter chunk for schedule auto
2806     thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2807   } else {
2808     thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2809   }
2810 }
2811 
2812 /* Gets def_sched_var ICV values */
2813 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2814   kmp_info_t *thread;
2815   enum sched_type th_type;
2816 
2817   KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2818   KMP_DEBUG_ASSERT(__kmp_init_serial);
2819 
2820   thread = __kmp_threads[gtid];
2821 
2822   th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2823   switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2824   case kmp_sch_static:
2825   case kmp_sch_static_greedy:
2826   case kmp_sch_static_balanced:
2827     *kind = kmp_sched_static;
2828     __kmp_sched_apply_mods_stdkind(kind, th_type);
    *chunk = 0; // chunk was not set; report this fact via a zero value
2830     return;
2831   case kmp_sch_static_chunked:
2832     *kind = kmp_sched_static;
2833     break;
2834   case kmp_sch_dynamic_chunked:
2835     *kind = kmp_sched_dynamic;
2836     break;
2837   case kmp_sch_guided_chunked:
2838   case kmp_sch_guided_iterative_chunked:
2839   case kmp_sch_guided_analytical_chunked:
2840     *kind = kmp_sched_guided;
2841     break;
2842   case kmp_sch_auto:
2843     *kind = kmp_sched_auto;
2844     break;
2845   case kmp_sch_trapezoidal:
2846     *kind = kmp_sched_trapezoidal;
2847     break;
2848 #if KMP_STATIC_STEAL_ENABLED
2849   case kmp_sch_static_steal:
2850     *kind = kmp_sched_static_steal;
2851     break;
2852 #endif
2853   default:
2854     KMP_FATAL(UnknownSchedulingType, th_type);
2855   }
2856 
2857   __kmp_sched_apply_mods_stdkind(kind, th_type);
2858   *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2859 }
2860 
2861 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2862 
2863   int ii, dd;
2864   kmp_team_t *team;
2865   kmp_info_t *thr;
2866 
2867   KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2868   KMP_DEBUG_ASSERT(__kmp_init_serial);
2869 
2870   // validate level
2871   if (level == 0)
2872     return 0;
2873   if (level < 0)
2874     return -1;
2875   thr = __kmp_threads[gtid];
2876   team = thr->th.th_team;
2877   ii = team->t.t_level;
2878   if (level > ii)
2879     return -1;
2880 
2881   if (thr->th.th_teams_microtask) {
    // AC: we are in a teams region where nested teams share the same level
    int tlevel = thr->th.th_teams_level; // the level of the teams construct
    if (level <=
        tlevel) { // otherwise usual algorithm works (will not touch the teams)
      KMP_DEBUG_ASSERT(ii >= tlevel);
      // AC: since we need to pass through the teams league, we artificially
      // increase ii
2889       if (ii == tlevel) {
2890         ii += 2; // three teams have same level
2891       } else {
2892         ii++; // two teams have same level
2893       }
2894     }
2895   }
2896 
2897   if (ii == level)
2898     return __kmp_tid_from_gtid(gtid);
2899 
2900   dd = team->t.t_serialized;
2901   level++;
2902   while (ii > level) {
2903     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2904     }
2905     if ((team->t.t_serialized) && (!dd)) {
2906       team = team->t.t_parent;
2907       continue;
2908     }
2909     if (ii > level) {
2910       team = team->t.t_parent;
2911       dd = team->t.t_serialized;
2912       ii--;
2913     }
2914   }
2915 
2916   return (dd > 1) ? (0) : (team->t.t_master_tid);
2917 }
2918 
2919 int __kmp_get_team_size(int gtid, int level) {
2920 
2921   int ii, dd;
2922   kmp_team_t *team;
2923   kmp_info_t *thr;
2924 
2925   KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2926   KMP_DEBUG_ASSERT(__kmp_init_serial);
2927 
2928   // validate level
2929   if (level == 0)
2930     return 1;
2931   if (level < 0)
2932     return -1;
2933   thr = __kmp_threads[gtid];
2934   team = thr->th.th_team;
2935   ii = team->t.t_level;
2936   if (level > ii)
2937     return -1;
2938 
2939   if (thr->th.th_teams_microtask) {
    // AC: we are in a teams region where nested teams share the same level
    int tlevel = thr->th.th_teams_level; // the level of the teams construct
    if (level <=
        tlevel) { // otherwise usual algorithm works (will not touch the teams)
      KMP_DEBUG_ASSERT(ii >= tlevel);
      // AC: since we need to pass through the teams league, we artificially
      // increase ii
2947       if (ii == tlevel) {
2948         ii += 2; // three teams have same level
2949       } else {
2950         ii++; // two teams have same level
2951       }
2952     }
2953   }
2954 
2955   while (ii > level) {
2956     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2957     }
2958     if (team->t.t_serialized && (!dd)) {
2959       team = team->t.t_parent;
2960       continue;
2961     }
2962     if (ii > level) {
2963       team = team->t.t_parent;
2964       ii--;
2965     }
2966   }
2967 
2968   return team->t.t_nproc;
2969 }
2970 
2971 kmp_r_sched_t __kmp_get_schedule_global() {
  // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
  // independently, so the up-to-date schedule can be obtained here.
2975 
2976   kmp_r_sched_t r_sched;
2977 
  // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
  // __kmp_guided. __kmp_sched should keep its original value, so that the user
  // can set KMP_SCHEDULE multiple times and thus have different run-time
  // schedules in different roots (even in OMP 2.5)
2982   enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
2983   enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
2984   if (s == kmp_sch_static) {
2985     // replace STATIC with more detailed schedule (balanced or greedy)
2986     r_sched.r_sched_type = __kmp_static;
2987   } else if (s == kmp_sch_guided_chunked) {
2988     // replace GUIDED with more detailed schedule (iterative or analytical)
2989     r_sched.r_sched_type = __kmp_guided;
2990   } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2991     r_sched.r_sched_type = __kmp_sched;
2992   }
2993   SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
2994 
2995   if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
2996     // __kmp_chunk may be wrong here (if it was not ever set)
2997     r_sched.chunk = KMP_DEFAULT_CHUNK;
2998   } else {
2999     r_sched.chunk = __kmp_chunk;
3000   }
3001 
3002   return r_sched;
3003 }
3004 
/* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
   at least argc number of *t_argv entries for the requested team. */
3007 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3008 
3009   KMP_DEBUG_ASSERT(team);
3010   if (!realloc || argc > team->t.t_max_argc) {
3011 
3012     KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3013                    "current entries=%d\n",
3014                    team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3015     /* if previously allocated heap space for args, free them */
3016     if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3017       __kmp_free((void *)team->t.t_argv);
3018 
3019     if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3020       /* use unused space in the cache line for arguments */
3021       team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3022       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3023                      "argv entries\n",
3024                      team->t.t_id, team->t.t_max_argc));
3025       team->t.t_argv = &team->t.t_inline_argv[0];
3026       if (__kmp_storage_map) {
3027         __kmp_print_storage_map_gtid(
3028             -1, &team->t.t_inline_argv[0],
3029             &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3030             (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3031             team->t.t_id);
3032       }
3033     } else {
3034       /* allocate space for arguments in the heap */
3035       team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3036                                ? KMP_MIN_MALLOC_ARGV_ENTRIES
3037                                : 2 * argc;
3038       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3039                      "argv entries\n",
3040                      team->t.t_id, team->t.t_max_argc));
3041       team->t.t_argv =
3042           (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3043       if (__kmp_storage_map) {
3044         __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3045                                      &team->t.t_argv[team->t.t_max_argc],
3046                                      sizeof(void *) * team->t.t_max_argc,
3047                                      "team_%d.t_argv", team->t.t_id);
3048       }
3049     }
3050   }
3051 }
3052 
3053 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3054   int i;
3055   int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3056   team->t.t_threads =
3057       (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3058   team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3059       sizeof(dispatch_shared_info_t) * num_disp_buff);
3060   team->t.t_dispatch =
3061       (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3062   team->t.t_implicit_task_taskdata =
3063       (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3064   team->t.t_max_nproc = max_nth;
3065 
3066   /* setup dispatch buffers */
3067   for (i = 0; i < num_disp_buff; ++i) {
3068     team->t.t_disp_buffer[i].buffer_index = i;
3069     team->t.t_disp_buffer[i].doacross_buf_idx = i;
3070   }
3071 }
3072 
3073 static void __kmp_free_team_arrays(kmp_team_t *team) {
3074   /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3075   int i;
3076   for (i = 0; i < team->t.t_max_nproc; ++i) {
3077     if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3078       __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3079       team->t.t_dispatch[i].th_disp_buffer = NULL;
3080     }
3081   }
3082 #if KMP_USE_HIER_SCHED
3083   __kmp_dispatch_free_hierarchies(team);
3084 #endif
3085   __kmp_free(team->t.t_threads);
3086   __kmp_free(team->t.t_disp_buffer);
3087   __kmp_free(team->t.t_dispatch);
3088   __kmp_free(team->t.t_implicit_task_taskdata);
3089   team->t.t_threads = NULL;
3090   team->t.t_disp_buffer = NULL;
3091   team->t.t_dispatch = NULL;
3092   team->t.t_implicit_task_taskdata = 0;
3093 }
3094 
3095 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3096   kmp_info_t **oldThreads = team->t.t_threads;
3097 
3098   __kmp_free(team->t.t_disp_buffer);
3099   __kmp_free(team->t.t_dispatch);
3100   __kmp_free(team->t.t_implicit_task_taskdata);
3101   __kmp_allocate_team_arrays(team, max_nth);
3102 
3103   KMP_MEMCPY(team->t.t_threads, oldThreads,
3104              team->t.t_nproc * sizeof(kmp_info_t *));
3105 
3106   __kmp_free(oldThreads);
3107 }
3108 
3109 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3110 
3111   kmp_r_sched_t r_sched =
3112       __kmp_get_schedule_global(); // get current state of scheduling globals
3113 
3114   KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3115 
3116   kmp_internal_control_t g_icvs = {
3117     0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3118     (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3119     // adjustment of threads (per thread)
3120     (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3121     // whether blocktime is explicitly set
3122     __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3123 #if KMP_USE_MONITOR
3124     __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3125 // intervals
3126 #endif
3127     __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3128     // next parallel region (per thread)
3129     // (use a max ub on value if __kmp_parallel_initialize not called yet)
3130     __kmp_cg_max_nth, // int thread_limit;
3131     __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3132     // for max_active_levels
3133     r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3134     // {sched,chunk} pair
3135     __kmp_nested_proc_bind.bind_types[0],
3136     __kmp_default_device,
3137     NULL // struct kmp_internal_control *next;
3138   };
3139 
3140   return g_icvs;
3141 }
3142 
3143 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3144 
3145   kmp_internal_control_t gx_icvs;
3146   gx_icvs.serial_nesting_level =
      0; // probably = team->t.t_serialized, as in save_internal_controls
3148   copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3149   gx_icvs.next = NULL;
3150 
3151   return gx_icvs;
3152 }
3153 
3154 static void __kmp_initialize_root(kmp_root_t *root) {
3155   int f;
3156   kmp_team_t *root_team;
3157   kmp_team_t *hot_team;
3158   int hot_team_max_nth;
3159   kmp_r_sched_t r_sched =
3160       __kmp_get_schedule_global(); // get current state of scheduling globals
3161   kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3162   KMP_DEBUG_ASSERT(root);
3163   KMP_ASSERT(!root->r.r_begin);
3164 
3165   /* setup the root state structure */
3166   __kmp_init_lock(&root->r.r_begin_lock);
3167   root->r.r_begin = FALSE;
3168   root->r.r_active = FALSE;
3169   root->r.r_in_parallel = 0;
3170   root->r.r_blocktime = __kmp_dflt_blocktime;
3171 
3172   /* setup the root team for this task */
3173   /* allocate the root team structure */
3174   KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3175 
3176   root_team =
3177       __kmp_allocate_team(root,
3178                           1, // new_nproc
3179                           1, // max_nproc
3180 #if OMPT_SUPPORT
3181                           ompt_data_none, // root parallel id
3182 #endif
3183                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3184                           0 // argc
3185                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3186                           );
3187 #if USE_DEBUGGER
3188   // Non-NULL value should be assigned to make the debugger display the root
3189   // team.
3190   TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3191 #endif
3192 
3193   KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3194 
3195   root->r.r_root_team = root_team;
3196   root_team->t.t_control_stack_top = NULL;
3197 
3198   /* initialize root team */
3199   root_team->t.t_threads[0] = NULL;
3200   root_team->t.t_nproc = 1;
3201   root_team->t.t_serialized = 1;
3202   // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3203   root_team->t.t_sched.sched = r_sched.sched;
3204   KA_TRACE(
3205       20,
3206       ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3207        root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3208 
3209   /* setup the  hot team for this task */
3210   /* allocate the hot team structure */
3211   KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3212 
3213   hot_team =
3214       __kmp_allocate_team(root,
3215                           1, // new_nproc
3216                           __kmp_dflt_team_nth_ub * 2, // max_nproc
3217 #if OMPT_SUPPORT
3218                           ompt_data_none, // root parallel id
3219 #endif
3220                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3221                           0 // argc
3222                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3223                           );
3224   KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3225 
3226   root->r.r_hot_team = hot_team;
3227   root_team->t.t_control_stack_top = NULL;
3228 
3229   /* first-time initialization */
3230   hot_team->t.t_parent = root_team;
3231 
3232   /* initialize hot team */
3233   hot_team_max_nth = hot_team->t.t_max_nproc;
3234   for (f = 0; f < hot_team_max_nth; ++f) {
3235     hot_team->t.t_threads[f] = NULL;
3236   }
3237   hot_team->t.t_nproc = 1;
3238   // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3239   hot_team->t.t_sched.sched = r_sched.sched;
3240   hot_team->t.t_size_changed = 0;
3241 }
3242 
3243 #ifdef KMP_DEBUG
3244 
3245 typedef struct kmp_team_list_item {
3246   kmp_team_p const *entry;
3247   struct kmp_team_list_item *next;
3248 } kmp_team_list_item_t;
3249 typedef kmp_team_list_item_t *kmp_team_list_t;
3250 
3251 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3252     kmp_team_list_t list, // List of teams.
3253     kmp_team_p const *team // Team to add.
3254     ) {
3255 
3256   // List must terminate with item where both entry and next are NULL.
3257   // Team is added to the list only once.
3258   // List is sorted in ascending order by team id.
3259   // Team id is *not* a key.
3260 
3261   kmp_team_list_t l;
3262 
3263   KMP_DEBUG_ASSERT(list != NULL);
3264   if (team == NULL) {
3265     return;
3266   }
3267 
3268   __kmp_print_structure_team_accum(list, team->t.t_parent);
3269   __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3270 
3271   // Search list for the team.
3272   l = list;
3273   while (l->next != NULL && l->entry != team) {
3274     l = l->next;
3275   }
3276   if (l->next != NULL) {
3277     return; // Team has been added before, exit.
3278   }
3279 
3280   // Team is not found. Search list again for insertion point.
3281   l = list;
3282   while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3283     l = l->next;
3284   }
3285 
3286   // Insert team.
3287   {
3288     kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3289         sizeof(kmp_team_list_item_t));
3290     *item = *l;
3291     l->entry = team;
3292     l->next = item;
3293   }
3294 }
3295 
static void __kmp_print_structure_team(char const *title,
                                       kmp_team_p const *team) {
3299   __kmp_printf("%s", title);
3300   if (team != NULL) {
3301     __kmp_printf("%2x %p\n", team->t.t_id, team);
3302   } else {
3303     __kmp_printf(" - (nil)\n");
3304   }
3305 }
3306 
3307 static void __kmp_print_structure_thread(char const *title,
3308                                          kmp_info_p const *thread) {
3309   __kmp_printf("%s", title);
3310   if (thread != NULL) {
3311     __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3312   } else {
3313     __kmp_printf(" - (nil)\n");
3314   }
3315 }
3316 
3317 void __kmp_print_structure(void) {
3318 
3319   kmp_team_list_t list;
3320 
3321   // Initialize list of teams.
3322   list =
3323       (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3324   list->entry = NULL;
3325   list->next = NULL;
3326 
3327   __kmp_printf("\n------------------------------\nGlobal Thread "
3328                "Table\n------------------------------\n");
3329   {
3330     int gtid;
3331     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3332       __kmp_printf("%2d", gtid);
3333       if (__kmp_threads != NULL) {
3334         __kmp_printf(" %p", __kmp_threads[gtid]);
3335       }
3336       if (__kmp_root != NULL) {
3337         __kmp_printf(" %p", __kmp_root[gtid]);
3338       }
3339       __kmp_printf("\n");
3340     }
3341   }
3342 
3343   // Print out __kmp_threads array.
3344   __kmp_printf("\n------------------------------\nThreads\n--------------------"
3345                "----------\n");
3346   if (__kmp_threads != NULL) {
3347     int gtid;
3348     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3349       kmp_info_t const *thread = __kmp_threads[gtid];
3350       if (thread != NULL) {
3351         __kmp_printf("GTID %2d %p:\n", gtid, thread);
3352         __kmp_printf("    Our Root:        %p\n", thread->th.th_root);
3353         __kmp_print_structure_team("    Our Team:     ", thread->th.th_team);
3354         __kmp_print_structure_team("    Serial Team:  ",
3355                                    thread->th.th_serial_team);
3356         __kmp_printf("    Threads:      %2d\n", thread->th.th_team_nproc);
3357         __kmp_print_structure_thread("    Master:       ",
3358                                      thread->th.th_team_master);
3359         __kmp_printf("    Serialized?:  %2d\n", thread->th.th_team_serialized);
3360         __kmp_printf("    Set NProc:    %2d\n", thread->th.th_set_nproc);
3361         __kmp_printf("    Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3362         __kmp_print_structure_thread("    Next in pool: ",
3363                                      thread->th.th_next_pool);
3364         __kmp_printf("\n");
3365         __kmp_print_structure_team_accum(list, thread->th.th_team);
3366         __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3367       }
3368     }
3369   } else {
3370     __kmp_printf("Threads array is not allocated.\n");
3371   }
3372 
3373   // Print out __kmp_root array.
3374   __kmp_printf("\n------------------------------\nUbers\n----------------------"
3375                "--------\n");
3376   if (__kmp_root != NULL) {
3377     int gtid;
3378     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3379       kmp_root_t const *root = __kmp_root[gtid];
3380       if (root != NULL) {
3381         __kmp_printf("GTID %2d %p:\n", gtid, root);
3382         __kmp_print_structure_team("    Root Team:    ", root->r.r_root_team);
3383         __kmp_print_structure_team("    Hot Team:     ", root->r.r_hot_team);
3384         __kmp_print_structure_thread("    Uber Thread:  ",
3385                                      root->r.r_uber_thread);
3386         __kmp_printf("    Active?:      %2d\n", root->r.r_active);
3387         __kmp_printf("    In Parallel:  %2d\n",
3388                      KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3389         __kmp_printf("\n");
3390         __kmp_print_structure_team_accum(list, root->r.r_root_team);
3391         __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3392       }
3393     }
3394   } else {
3395     __kmp_printf("Ubers array is not allocated.\n");
3396   }
3397 
3398   __kmp_printf("\n------------------------------\nTeams\n----------------------"
3399                "--------\n");
3400   while (list->next != NULL) {
3401     kmp_team_p const *team = list->entry;
3402     int i;
3403     __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3404     __kmp_print_structure_team("    Parent Team:      ", team->t.t_parent);
3405     __kmp_printf("    Master TID:       %2d\n", team->t.t_master_tid);
3406     __kmp_printf("    Max threads:      %2d\n", team->t.t_max_nproc);
3407     __kmp_printf("    Levels of serial: %2d\n", team->t.t_serialized);
3408     __kmp_printf("    Number threads:   %2d\n", team->t.t_nproc);
3409     for (i = 0; i < team->t.t_nproc; ++i) {
3410       __kmp_printf("    Thread %2d:      ", i);
3411       __kmp_print_structure_thread("", team->t.t_threads[i]);
3412     }
3413     __kmp_print_structure_team("    Next in pool:     ", team->t.t_next_pool);
3414     __kmp_printf("\n");
3415     list = list->next;
3416   }
3417 
3418   // Print out __kmp_thread_pool and __kmp_team_pool.
3419   __kmp_printf("\n------------------------------\nPools\n----------------------"
3420                "--------\n");
3421   __kmp_print_structure_thread("Thread pool:          ",
3422                                CCAST(kmp_info_t *, __kmp_thread_pool));
3423   __kmp_print_structure_team("Team pool:            ",
3424                              CCAST(kmp_team_t *, __kmp_team_pool));
3425   __kmp_printf("\n");
3426 
3427   // Free team list.
3428   while (list != NULL) {
3429     kmp_team_list_item_t *item = list;
3430     list = list->next;
3431     KMP_INTERNAL_FREE(item);
3432   }
3433 }
3434 
3435 #endif
3436 
3437 //---------------------------------------------------------------------------
3438 //  Stuff for per-thread fast random number generator
3439 //  Table of primes
3440 static const unsigned __kmp_primes[] = {
3441     0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3442     0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3443     0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3444     0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3445     0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3446     0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3447     0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3448     0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3449     0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3450     0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3451     0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3452 
3453 //---------------------------------------------------------------------------
3454 //  __kmp_get_random: Get a random number using a linear congruential method.
3455 unsigned short __kmp_get_random(kmp_info_t *thread) {
3456   unsigned x = thread->th.th_x;
3457   unsigned short r = x >> 16;
3458 
3459   thread->th.th_x = x * thread->th.th_a + 1;
3460 
3461   KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3462                 thread->th.th_info.ds.ds_tid, r));
3463 
3464   return r;
3465 }
3466 //--------------------------------------------------------
3467 // __kmp_init_random: Initialize a random number generator
3468 void __kmp_init_random(kmp_info_t *thread) {
3469   unsigned seed = thread->th.th_info.ds.ds_tid;
3470 
3471   thread->th.th_a =
3472       __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3473   thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3474   KA_TRACE(30,
3475            ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3476 }
3477 
3478 #if KMP_OS_WINDOWS
3479 /* reclaim array entries for root threads that are already dead, returns number
3480  * reclaimed */
3481 static int __kmp_reclaim_dead_roots(void) {
3482   int i, r = 0;
3483 
3484   for (i = 0; i < __kmp_threads_capacity; ++i) {
3485     if (KMP_UBER_GTID(i) &&
3486         !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3487         !__kmp_root[i]
3488              ->r.r_active) { // AC: reclaim only roots died in non-active state
3489       r += __kmp_unregister_root_other_thread(i);
3490     }
3491   }
3492   return r;
3493 }
3494 #endif
3495 
3496 /* This function attempts to create free entries in __kmp_threads and
3497    __kmp_root, and returns the number of free entries generated.
3498 
3499    For Windows* OS static library, the first mechanism used is to reclaim array
3500    entries for root threads that are already dead.
3501 
   On all platforms, expansion is attempted on the arrays __kmp_threads and
   __kmp_root, with an appropriate update to __kmp_threads_capacity. Array
3504    capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3505    threadprivate cache array has been created. Synchronization with
3506    __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3507 
3508    After any dead root reclamation, if the clipping value allows array expansion
3509    to result in the generation of a total of nNeed free slots, the function does
3510    that expansion. If not, nothing is done beyond the possible initial root
3511    thread reclamation.
3512 
3513    If any argument is negative, the behavior is undefined. */
3514 static int __kmp_expand_threads(int nNeed) {
3515   int added = 0;
3516   int minimumRequiredCapacity;
3517   int newCapacity;
3518   kmp_info_t **newThreads;
3519   kmp_root_t **newRoot;
3520 
3521 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3522 // resizing __kmp_threads does not need additional protection if foreign
3523 // threads are present
3524 
3525 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3526   /* only for Windows static library */
3527   /* reclaim array entries for root threads that are already dead */
3528   added = __kmp_reclaim_dead_roots();
3529 
3530   if (nNeed) {
3531     nNeed -= added;
3532     if (nNeed < 0)
3533       nNeed = 0;
3534   }
3535 #endif
3536   if (nNeed <= 0)
3537     return added;
3538 
3539   // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3540   // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3541   // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3542   // > __kmp_max_nth in one of two ways:
3543   //
3544   // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
3545   //    may not be reused by another thread, so we may need to increase
3546   //    __kmp_threads_capacity to __kmp_max_nth + 1.
3547   //
3548   // 2) New foreign root(s) are encountered.  We always register new foreign
3549   //    roots. This may cause a smaller # of threads to be allocated at
3550   //    subsequent parallel regions, but the worker threads hang around (and
3551   //    eventually go to sleep) and need slots in the __kmp_threads[] array.
3552   //
3553   // Anyway, that is the reason for moving the check to see if
3554   // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3555   // instead of having it performed here. -BB
3556 
3557   KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3558 
3559   /* compute expansion headroom to check if we can expand */
3560   if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3561     /* possible expansion too small -- give up */
3562     return added;
3563   }
3564   minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3565 
3566   newCapacity = __kmp_threads_capacity;
3567   do {
3568     newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3569                                                           : __kmp_sys_max_nth;
3570   } while (newCapacity < minimumRequiredCapacity);
3571   newThreads = (kmp_info_t **)__kmp_allocate(
3572       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3573   newRoot =
3574       (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3575   KMP_MEMCPY(newThreads, __kmp_threads,
3576              __kmp_threads_capacity * sizeof(kmp_info_t *));
3577   KMP_MEMCPY(newRoot, __kmp_root,
3578              __kmp_threads_capacity * sizeof(kmp_root_t *));
3579 
3580   kmp_info_t **temp_threads = __kmp_threads;
3581   *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3582   *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3583   __kmp_free(temp_threads);
3584   added += newCapacity - __kmp_threads_capacity;
3585   *(volatile int *)&__kmp_threads_capacity = newCapacity;
3586 
3587   if (newCapacity > __kmp_tp_capacity) {
3588     __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3589     if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3590       __kmp_threadprivate_resize_cache(newCapacity);
3591     } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3592       *(volatile int *)&__kmp_tp_capacity = newCapacity;
3593     }
3594     __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3595   }
3596 
3597   return added;
3598 }
3599 
3600 /* Register the current thread as a root thread and obtain our gtid. We must
   have the __kmp_initz_lock held at this point. Argument TRUE only if we are
   the thread that calls from __kmp_do_serial_initialize() */
3603 int __kmp_register_root(int initial_thread) {
3604   kmp_info_t *root_thread;
3605   kmp_root_t *root;
3606   int gtid;
3607   int capacity;
3608   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3609   KA_TRACE(20, ("__kmp_register_root: entered\n"));
3610   KMP_MB();
3611 
3612   /* 2007-03-02:
     If the initial thread has not invoked the OpenMP RTL yet, and this thread
     is not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity"
     condition does not work as expected -- it may return false (meaning there
     is at least one empty slot in the __kmp_threads array), but it is possible
     that the only free slot is #0, which is reserved for the initial thread
     and so cannot be used for this one. The following code works around this
     bug.

     However, the right solution seems to be not reserving slot #0 for the
     initial thread, because:
     (1) there is no magic in slot #0,
     (2) we cannot detect the initial thread reliably (the first thread that
        does serial initialization may not be a real initial thread).
3625   */
3626   capacity = __kmp_threads_capacity;
3627   if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3628     --capacity;
3629   }
3630 
3631   /* see if there are too many threads */
3632   if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3633     if (__kmp_tp_cached) {
3634       __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3635                   KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3636                   KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3637     } else {
3638       __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3639                   __kmp_msg_null);
3640     }
3641   }
3642 
3643   /* find an available thread slot */
3644   /* Don't reassign the zero slot since we need that to only be used by initial
3645      thread */
3646   for (gtid = (initial_thread ? 0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL;
3647        gtid++)
3648     ;
3649   KA_TRACE(1,
3650            ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3651   KMP_ASSERT(gtid < __kmp_threads_capacity);
3652 
3653   /* update global accounting */
3654   __kmp_all_nth++;
3655   TCW_4(__kmp_nth, __kmp_nth + 1);
3656 
3657   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3658   // numbers of procs, and method #2 (keyed API call) for higher numbers.
3659   if (__kmp_adjust_gtid_mode) {
3660     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3661       if (TCR_4(__kmp_gtid_mode) != 2) {
3662         TCW_4(__kmp_gtid_mode, 2);
3663       }
3664     } else {
3665       if (TCR_4(__kmp_gtid_mode) != 1) {
3666         TCW_4(__kmp_gtid_mode, 1);
3667       }
3668     }
3669   }
3670 
3671 #ifdef KMP_ADJUST_BLOCKTIME
3672   /* Adjust blocktime to zero if necessary            */
3673   /* Middle initialization might not have occurred yet */
3674   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3675     if (__kmp_nth > __kmp_avail_proc) {
3676       __kmp_zero_bt = TRUE;
3677     }
3678   }
3679 #endif /* KMP_ADJUST_BLOCKTIME */
3680 
3681   /* setup this new hierarchy */
3682   if (!(root = __kmp_root[gtid])) {
3683     root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3684     KMP_DEBUG_ASSERT(!root->r.r_root_team);
3685   }
3686 
3687 #if KMP_STATS_ENABLED
3688   // Initialize stats as soon as possible (right after gtid assignment).
3689   __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3690   __kmp_stats_thread_ptr->startLife();
3691   KMP_SET_THREAD_STATE(SERIAL_REGION);
3692   KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3693 #endif
3694   __kmp_initialize_root(root);
3695 
3696   /* setup new root thread structure */
3697   if (root->r.r_uber_thread) {
3698     root_thread = root->r.r_uber_thread;
3699   } else {
3700     root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3701     if (__kmp_storage_map) {
3702       __kmp_print_thread_storage_map(root_thread, gtid);
3703     }
3704     root_thread->th.th_info.ds.ds_gtid = gtid;
3705 #if OMPT_SUPPORT
3706     root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3707 #endif
3708     root_thread->th.th_root = root;
3709     if (__kmp_env_consistency_check) {
3710       root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3711     }
3712 #if USE_FAST_MEMORY
3713     __kmp_initialize_fast_memory(root_thread);
3714 #endif /* USE_FAST_MEMORY */
3715 
3716 #if KMP_USE_BGET
3717     KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3718     __kmp_initialize_bget(root_thread);
3719 #endif
3720     __kmp_init_random(root_thread); // Initialize random number generator
3721   }
3722 
3723   /* setup the serial team held in reserve by the root thread */
3724   if (!root_thread->th.th_serial_team) {
3725     kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3726     KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3727     root_thread->th.th_serial_team = __kmp_allocate_team(
3728         root, 1, 1,
3729 #if OMPT_SUPPORT
3730         ompt_data_none, // root parallel id
3731 #endif
3732         proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3733   }
3734   KMP_ASSERT(root_thread->th.th_serial_team);
3735   KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3736                 root_thread->th.th_serial_team));
3737 
3738   /* drop root_thread into place */
3739   TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3740 
3741   root->r.r_root_team->t.t_threads[0] = root_thread;
3742   root->r.r_hot_team->t.t_threads[0] = root_thread;
3743   root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
  // AC: the team is created in reserve, not for execution (it is unused now).
3745   root_thread->th.th_serial_team->t.t_serialized = 0;
3746   root->r.r_uber_thread = root_thread;
3747 
3748   /* initialize the thread, get it ready to go */
3749   __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3750   TCW_4(__kmp_init_gtid, TRUE);
3751 
3752   /* prepare the master thread for get_gtid() */
3753   __kmp_gtid_set_specific(gtid);
3754 
3755 #if USE_ITT_BUILD
3756   __kmp_itt_thread_name(gtid);
3757 #endif /* USE_ITT_BUILD */
3758 
3759 #ifdef KMP_TDATA_GTID
3760   __kmp_gtid = gtid;
3761 #endif
3762   __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3763   KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3764 
3765   KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3766                 "plain=%u\n",
3767                 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3768                 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3769                 KMP_INIT_BARRIER_STATE));
3770   { // Initialize barrier data.
3771     int b;
3772     for (b = 0; b < bs_last_barrier; ++b) {
3773       root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3774 #if USE_DEBUGGER
3775       root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3776 #endif
3777     }
3778   }
3779   KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3780                    KMP_INIT_BARRIER_STATE);
3781 
3782 #if KMP_AFFINITY_SUPPORTED
3783   root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3784   root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3785   root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3786   root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3787   if (TCR_4(__kmp_init_middle)) {
3788     __kmp_affinity_set_init_mask(gtid, TRUE);
3789   }
3790 #endif /* KMP_AFFINITY_SUPPORTED */
3791   root_thread->th.th_def_allocator = __kmp_def_allocator;
3792   root_thread->th.th_prev_level = 0;
3793   root_thread->th.th_prev_num_threads = 1;
3794 
3795   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3796   tmp->cg_root = root_thread;
3797   tmp->cg_thread_limit = __kmp_cg_max_nth;
3798   tmp->cg_nthreads = 1;
3799   KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3800                  " cg_nthreads init to 1\n",
3801                  root_thread, tmp));
3802   tmp->up = NULL;
3803   root_thread->th.th_cg_roots = tmp;
3804 
3805   __kmp_root_counter++;
3806 
3807 #if OMPT_SUPPORT
3808   if (!initial_thread && ompt_enabled.enabled) {
3809 
3810     kmp_info_t *root_thread = ompt_get_thread();
3811 
3812     ompt_set_thread_state(root_thread, ompt_state_overhead);
3813 
3814     if (ompt_enabled.ompt_callback_thread_begin) {
3815       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3816           ompt_thread_initial, __ompt_get_thread_data_internal());
3817     }
3818     ompt_data_t *task_data;
3819     ompt_data_t *parallel_data;
3820     __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
3821     if (ompt_enabled.ompt_callback_implicit_task) {
3822       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3823           ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
3824     }
3825 
3826     ompt_set_thread_state(root_thread, ompt_state_work_serial);
3827   }
3828 #endif
3829 
3830   KMP_MB();
3831   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3832 
3833   return gtid;
3834 }
3835 
3836 #if KMP_NESTED_HOT_TEAMS
3837 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3838                                 const int max_level) {
3839   int i, n, nth;
3840   kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3841   if (!hot_teams || !hot_teams[level].hot_team) {
3842     return 0;
3843   }
3844   KMP_DEBUG_ASSERT(level < max_level);
3845   kmp_team_t *team = hot_teams[level].hot_team;
3846   nth = hot_teams[level].hot_team_nth;
3847   n = nth - 1; // master is not freed
3848   if (level < max_level - 1) {
3849     for (i = 0; i < nth; ++i) {
3850       kmp_info_t *th = team->t.t_threads[i];
3851       n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3852       if (i > 0 && th->th.th_hot_teams) {
3853         __kmp_free(th->th.th_hot_teams);
3854         th->th.th_hot_teams = NULL;
3855       }
3856     }
3857   }
3858   __kmp_free_team(root, team, NULL);
3859   return n;
3860 }
3861 #endif
3862 
// Resets a root thread and clears its root and hot teams.
3864 // Returns the number of __kmp_threads entries directly and indirectly freed.
3865 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3866   kmp_team_t *root_team = root->r.r_root_team;
3867   kmp_team_t *hot_team = root->r.r_hot_team;
3868   int n = hot_team->t.t_nproc;
3869   int i;
3870 
3871   KMP_DEBUG_ASSERT(!root->r.r_active);
3872 
3873   root->r.r_root_team = NULL;
3874   root->r.r_hot_team = NULL;
3875   // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3876   // before call to __kmp_free_team().
3877   __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3878 #if KMP_NESTED_HOT_TEAMS
3879   if (__kmp_hot_teams_max_level >
3880       0) { // need to free nested hot teams and their threads if any
3881     for (i = 0; i < hot_team->t.t_nproc; ++i) {
3882       kmp_info_t *th = hot_team->t.t_threads[i];
3883       if (__kmp_hot_teams_max_level > 1) {
3884         n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3885       }
3886       if (th->th.th_hot_teams) {
3887         __kmp_free(th->th.th_hot_teams);
3888         th->th.th_hot_teams = NULL;
3889       }
3890     }
3891   }
3892 #endif
3893   __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3894 
3895   // Before we can reap the thread, we need to make certain that all other
3896   // threads in the teams that had this root as ancestor have stopped trying to
3897   // steal tasks.
3898   if (__kmp_tasking_mode != tskm_immediate_exec) {
3899     __kmp_wait_to_unref_task_teams();
3900   }
3901 
3902 #if KMP_OS_WINDOWS
3903   /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3904   KA_TRACE(
3905       10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3906            "\n",
3907            (LPVOID) & (root->r.r_uber_thread->th),
3908            root->r.r_uber_thread->th.th_info.ds.ds_thread));
3909   __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3910 #endif /* KMP_OS_WINDOWS */
3911 
3912 #if OMPT_SUPPORT
3913   ompt_data_t *task_data;
3914   ompt_data_t *parallel_data;
3915   __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
3916   if (ompt_enabled.ompt_callback_implicit_task) {
3917     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3918         ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
3919   }
3920   if (ompt_enabled.ompt_callback_thread_end) {
3921     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3922         &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3923   }
3924 #endif
3925 
3926   TCW_4(__kmp_nth,
3927         __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3928   i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
3929   KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
3930                  " to %d\n",
3931                  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
3932                  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
3933   if (i == 1) {
3934     // need to free contention group structure
3935     KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
3936                      root->r.r_uber_thread->th.th_cg_roots->cg_root);
3937     KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
3938     __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
3939     root->r.r_uber_thread->th.th_cg_roots = NULL;
3940   }
3941   __kmp_reap_thread(root->r.r_uber_thread, 1);
3942 
  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
  // it instead of freeing it.
3945   root->r.r_uber_thread = NULL;
3946   /* mark root as no longer in use */
3947   root->r.r_begin = FALSE;
3948 
3949   return n;
3950 }
3951 
3952 void __kmp_unregister_root_current_thread(int gtid) {
3953   KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
3954   /* this lock should be ok, since unregister_root_current_thread is never
3955      called during an abort, only during a normal close. furthermore, if you
3956      have the forkjoin lock, you should never try to get the initz lock */
3957   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3958   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
3959     KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
3960                   "exiting T#%d\n",
3961                   gtid));
3962     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3963     return;
3964   }
3965   kmp_root_t *root = __kmp_root[gtid];
3966 
3967   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3968   KMP_ASSERT(KMP_UBER_GTID(gtid));
3969   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3970   KMP_ASSERT(root->r.r_active == FALSE);
3971 
3972   KMP_MB();
3973 
3974   kmp_info_t *thread = __kmp_threads[gtid];
3975   kmp_team_t *team = thread->th.th_team;
3976   kmp_task_team_t *task_team = thread->th.th_task_team;
3977 
3978   // we need to wait for the proxy tasks before finishing the thread
3979   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
3980 #if OMPT_SUPPORT
3981     // the runtime is shutting down so we won't report any events
3982     thread->th.ompt_thread_info.state = ompt_state_undefined;
3983 #endif
3984     __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
3985   }
3986 
3987   __kmp_reset_root(gtid, root);
3988 
3989   KMP_MB();
3990   KC_TRACE(10,
3991            ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
3992 
3993   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3994 }
3995 
3996 #if KMP_OS_WINDOWS
3997 /* __kmp_forkjoin_lock must be already held
3998    Unregisters a root thread that is not the current thread.  Returns the number
3999    of __kmp_threads entries freed as a result. */
4000 static int __kmp_unregister_root_other_thread(int gtid) {
4001   kmp_root_t *root = __kmp_root[gtid];
4002   int r;
4003 
4004   KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4005   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4006   KMP_ASSERT(KMP_UBER_GTID(gtid));
4007   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4008   KMP_ASSERT(root->r.r_active == FALSE);
4009 
4010   r = __kmp_reset_root(gtid, root);
4011   KC_TRACE(10,
4012            ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4013   return r;
4014 }
4015 #endif
4016 
4017 #if KMP_DEBUG
4018 void __kmp_task_info() {
4019 
4020   kmp_int32 gtid = __kmp_entry_gtid();
4021   kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4022   kmp_info_t *this_thr = __kmp_threads[gtid];
4023   kmp_team_t *steam = this_thr->th.th_serial_team;
4024   kmp_team_t *team = this_thr->th.th_team;
4025 
4026   __kmp_printf(
4027       "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4028       "ptask=%p\n",
4029       gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4030       team->t.t_implicit_task_taskdata[tid].td_parent);
4031 }
4032 #endif // KMP_DEBUG
4033 
4034 /* TODO optimize with one big memclr, take out what isn't needed, split
4035    responsibility to workers as much as possible, and delay initialization of
4036    features as much as possible  */
4037 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4038                                   int tid, int gtid) {
4039   /* this_thr->th.th_info.ds.ds_gtid is setup in
4040      kmp_allocate_thread/create_worker.
4041      this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4042   kmp_info_t *master = team->t.t_threads[0];
4043   KMP_DEBUG_ASSERT(this_thr != NULL);
4044   KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4045   KMP_DEBUG_ASSERT(team);
4046   KMP_DEBUG_ASSERT(team->t.t_threads);
4047   KMP_DEBUG_ASSERT(team->t.t_dispatch);
4048   KMP_DEBUG_ASSERT(master);
4049   KMP_DEBUG_ASSERT(master->th.th_root);
4050 
4051   KMP_MB();
4052 
4053   TCW_SYNC_PTR(this_thr->th.th_team, team);
4054 
4055   this_thr->th.th_info.ds.ds_tid = tid;
4056   this_thr->th.th_set_nproc = 0;
4057   if (__kmp_tasking_mode != tskm_immediate_exec)
4058     // When tasking is possible, threads are not safe to reap until they are
4059     // done tasking; this will be set when tasking code is exited in wait
4060     this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4061   else // no tasking --> always safe to reap
4062     this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4063   this_thr->th.th_set_proc_bind = proc_bind_default;
4064 #if KMP_AFFINITY_SUPPORTED
4065   this_thr->th.th_new_place = this_thr->th.th_current_place;
4066 #endif
4067   this_thr->th.th_root = master->th.th_root;
4068 
4069   /* setup the thread's cache of the team structure */
4070   this_thr->th.th_team_nproc = team->t.t_nproc;
4071   this_thr->th.th_team_master = master;
4072   this_thr->th.th_team_serialized = team->t.t_serialized;
4073   TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4074 
4075   KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4076 
4077   KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4078                 tid, gtid, this_thr, this_thr->th.th_current_task));
4079 
4080   __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4081                            team, tid, TRUE);
4082 
4083   KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4084                 tid, gtid, this_thr, this_thr->th.th_current_task));
4085   // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4086   // __kmp_initialize_team()?
4087 
4088   /* TODO no worksharing in speculative threads */
4089   this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4090 
4091   this_thr->th.th_local.this_construct = 0;
4092 
4093   if (!this_thr->th.th_pri_common) {
4094     this_thr->th.th_pri_common =
4095         (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4096     if (__kmp_storage_map) {
4097       __kmp_print_storage_map_gtid(
4098           gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4099           sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4100     }
4101     this_thr->th.th_pri_head = NULL;
4102   }
4103 
4104   if (this_thr != master && // Master's CG root is initialized elsewhere
4105       this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4106     // Make new thread's CG root same as master's
4107     KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4108     kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4109     if (tmp) {
4110       // worker changes CG, need to check if old CG should be freed
4111       int i = tmp->cg_nthreads--;
4112       KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4113                      " on node %p of thread %p to %d\n",
4114                      this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4115       if (i == 1) {
4116         __kmp_free(tmp); // last thread left CG --> free it
4117       }
4118     }
4119     this_thr->th.th_cg_roots = master->th.th_cg_roots;
4120     // Increment new thread's CG root's counter to add the new thread
4121     this_thr->th.th_cg_roots->cg_nthreads++;
4122     KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4123                    " node %p of thread %p to %d\n",
4124                    this_thr, this_thr->th.th_cg_roots,
4125                    this_thr->th.th_cg_roots->cg_root,
4126                    this_thr->th.th_cg_roots->cg_nthreads));
4127     this_thr->th.th_current_task->td_icvs.thread_limit =
4128         this_thr->th.th_cg_roots->cg_thread_limit;
4129   }
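  // Descriptive note (hedged sketch, not runtime code): the block above is
  // plain reference counting on kmp_cg_root_t.  With 'old' and 'new_root' as
  // stand-ins for 'tmp' and 'master->th.th_cg_roots', it amounts to:
  //
  //   if (old && old->cg_nthreads-- == 1) // old count was 1: last member left
  //     __kmp_free(old);                  //   so the old descriptor is freed
  //   this_thr->th.th_cg_roots = new_root;
  //   new_root->cg_nthreads++;            // join the master's contention group
  //
  // after which thread_limit is refreshed from the adopted CG root.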
4130 
4131   /* Initialize dynamic dispatch */
4132   {
4133     volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4134     // Use team max_nproc since this will never change for the team.
4135     size_t disp_size =
4136         sizeof(dispatch_private_info_t) *
4137         (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4138     KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4139                   team->t.t_max_nproc));
4140     KMP_ASSERT(dispatch);
4141     KMP_DEBUG_ASSERT(team->t.t_dispatch);
4142     KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4143 
4144     dispatch->th_disp_index = 0;
4145     dispatch->th_doacross_buf_idx = 0;
4146     if (!dispatch->th_disp_buffer) {
4147       dispatch->th_disp_buffer =
4148           (dispatch_private_info_t *)__kmp_allocate(disp_size);
4149 
4150       if (__kmp_storage_map) {
4151         __kmp_print_storage_map_gtid(
4152             gtid, &dispatch->th_disp_buffer[0],
4153             &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4154                                           ? 1
4155                                           : __kmp_dispatch_num_buffers],
4156             disp_size, "th_%d.th_dispatch.th_disp_buffer "
4157                        "(team_%d.t_dispatch[%d].th_disp_buffer)",
4158             gtid, team->t.t_id, gtid);
4159       }
4160     } else {
4161       memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4162     }
4163 
4164     dispatch->th_dispatch_pr_current = 0;
4165     dispatch->th_dispatch_sh_current = 0;
4166 
4167     dispatch->th_deo_fcn = 0; /* ORDERED     */
4168     dispatch->th_dxo_fcn = 0; /* END ORDERED */
4169   }
4170 
4171   this_thr->th.th_next_pool = NULL;
4172 
4173   if (!this_thr->th.th_task_state_memo_stack) {
4174     size_t i;
4175     this_thr->th.th_task_state_memo_stack =
4176         (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4177     this_thr->th.th_task_state_top = 0;
4178     this_thr->th.th_task_state_stack_sz = 4;
4179     for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4180          ++i) // zero init the stack
4181       this_thr->th.th_task_state_memo_stack[i] = 0;
4182   }
4183 
4184   KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4185   KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4186 
4187   KMP_MB();
4188 }
4189 
/* allocate a new thread for the requesting team. this is only called from
   within a forkjoin critical section. we will first try to get an available
   thread from the thread pool. if none is available, we will fork a new one,
   assuming we are able to create one; this should be assured, as the caller
   is expected to have checked for that first. */
4195 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4196                                   int new_tid) {
4197   kmp_team_t *serial_team;
4198   kmp_info_t *new_thr;
4199   int new_gtid;
4200 
4201   KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4202   KMP_DEBUG_ASSERT(root && team);
4203 #if !KMP_NESTED_HOT_TEAMS
4204   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4205 #endif
4206   KMP_MB();
4207 
4208   /* first, try to get one from the thread pool */
4209   if (__kmp_thread_pool) {
4210     new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4211     __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4212     if (new_thr == __kmp_thread_pool_insert_pt) {
4213       __kmp_thread_pool_insert_pt = NULL;
4214     }
4215     TCW_4(new_thr->th.th_in_pool, FALSE);
4216     __kmp_suspend_initialize_thread(new_thr);
4217     __kmp_lock_suspend_mx(new_thr);
4218     if (new_thr->th.th_active_in_pool == TRUE) {
4219       KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4220       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4221       new_thr->th.th_active_in_pool = FALSE;
4222     }
4223     __kmp_unlock_suspend_mx(new_thr);
4224 
4225     KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4226                   __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4227     KMP_ASSERT(!new_thr->th.th_team);
4228     KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4229 
4230     /* setup the thread structure */
4231     __kmp_initialize_info(new_thr, team, new_tid,
4232                           new_thr->th.th_info.ds.ds_gtid);
4233     KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4234 
4235     TCW_4(__kmp_nth, __kmp_nth + 1);
4236 
4237     new_thr->th.th_task_state = 0;
4238     new_thr->th.th_task_state_top = 0;
4239     new_thr->th.th_task_state_stack_sz = 4;
4240 
4241 #ifdef KMP_ADJUST_BLOCKTIME
4242     /* Adjust blocktime back to zero if necessary */
4243     /* Middle initialization might not have occurred yet */
4244     if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4245       if (__kmp_nth > __kmp_avail_proc) {
4246         __kmp_zero_bt = TRUE;
4247       }
4248     }
4249 #endif /* KMP_ADJUST_BLOCKTIME */
4250 
4251 #if KMP_DEBUG
4252     // If thread entered pool via __kmp_free_thread, wait_flag should !=
4253     // KMP_BARRIER_PARENT_FLAG.
4254     int b;
4255     kmp_balign_t *balign = new_thr->th.th_bar;
4256     for (b = 0; b < bs_last_barrier; ++b)
4257       KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4258 #endif
4259 
4260     KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4261                   __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4262 
4263     KMP_MB();
4264     return new_thr;
4265   }
4266 
  /* no, we'll fork a new one */
4268   KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4269   KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4270 
4271 #if KMP_USE_MONITOR
4272   // If this is the first worker thread the RTL is creating, then also
4273   // launch the monitor thread.  We try to do this as early as possible.
4274   if (!TCR_4(__kmp_init_monitor)) {
4275     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4276     if (!TCR_4(__kmp_init_monitor)) {
4277       KF_TRACE(10, ("before __kmp_create_monitor\n"));
4278       TCW_4(__kmp_init_monitor, 1);
4279       __kmp_create_monitor(&__kmp_monitor);
4280       KF_TRACE(10, ("after __kmp_create_monitor\n"));
4281 #if KMP_OS_WINDOWS
      // AC: wait until the monitor has started. This is a fix for CQ232808.
      // The reason is that if the library is loaded/unloaded in a loop with
      // small (parallel) work in between, then there is a high probability
      // that the monitor thread starts only after the library shutdown. At
      // shutdown it is too late to cope with the problem, because when the
      // master is in DllMain (process detach) the monitor has no chance to
      // start (it is blocked), and the master has no means to inform the
      // monitor that the library has gone, because all the memory which the
      // monitor can access is going to be released/reset.
4291       while (TCR_4(__kmp_init_monitor) < 2) {
4292         KMP_YIELD(TRUE);
4293       }
4294       KF_TRACE(10, ("after monitor thread has started\n"));
4295 #endif
4296     }
4297     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4298   }
4299 #endif
4300 
4301   KMP_MB();
4302   for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) {
4303     KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4304   }
4305 
4306   /* allocate space for it. */
4307   new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4308 
4309   TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4310 
4311 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4312   // suppress race conditions detection on synchronization flags in debug mode
4313   // this helps to analyze library internals eliminating false positives
4314   __itt_suppress_mark_range(
4315       __itt_suppress_range, __itt_suppress_threading_errors,
4316       &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4317   __itt_suppress_mark_range(
4318       __itt_suppress_range, __itt_suppress_threading_errors,
4319       &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4320 #if KMP_OS_WINDOWS
4321   __itt_suppress_mark_range(
4322       __itt_suppress_range, __itt_suppress_threading_errors,
4323       &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4324 #else
4325   __itt_suppress_mark_range(__itt_suppress_range,
4326                             __itt_suppress_threading_errors,
4327                             &new_thr->th.th_suspend_init_count,
4328                             sizeof(new_thr->th.th_suspend_init_count));
4329 #endif
4330   // TODO: check if we need to also suppress b_arrived flags
4331   __itt_suppress_mark_range(__itt_suppress_range,
4332                             __itt_suppress_threading_errors,
4333                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4334                             sizeof(new_thr->th.th_bar[0].bb.b_go));
4335   __itt_suppress_mark_range(__itt_suppress_range,
4336                             __itt_suppress_threading_errors,
4337                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4338                             sizeof(new_thr->th.th_bar[1].bb.b_go));
4339   __itt_suppress_mark_range(__itt_suppress_range,
4340                             __itt_suppress_threading_errors,
4341                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4342                             sizeof(new_thr->th.th_bar[2].bb.b_go));
4343 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4344   if (__kmp_storage_map) {
4345     __kmp_print_thread_storage_map(new_thr, new_gtid);
4346   }
4347 
4348   // add the reserve serialized team, initialized from the team's master thread
4349   {
4350     kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4351     KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4352     new_thr->th.th_serial_team = serial_team =
4353         (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4354 #if OMPT_SUPPORT
4355                                           ompt_data_none, // root parallel id
4356 #endif
4357                                           proc_bind_default, &r_icvs,
4358                                           0 USE_NESTED_HOT_ARG(NULL));
4359   }
4360   KMP_ASSERT(serial_team);
  // AC: the team is created in reserve, not for execution (it is unused for
  // now).
  serial_team->t.t_serialized = 0;
4363   serial_team->t.t_threads[0] = new_thr;
4364   KF_TRACE(10,
4365            ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4366             new_thr));
4367 
4368   /* setup the thread structures */
4369   __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4370 
4371 #if USE_FAST_MEMORY
4372   __kmp_initialize_fast_memory(new_thr);
4373 #endif /* USE_FAST_MEMORY */
4374 
4375 #if KMP_USE_BGET
4376   KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4377   __kmp_initialize_bget(new_thr);
4378 #endif
4379 
4380   __kmp_init_random(new_thr); // Initialize random number generator
4381 
4382   /* Initialize these only once when thread is grabbed for a team allocation */
4383   KA_TRACE(20,
4384            ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4385             __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4386 
4387   int b;
4388   kmp_balign_t *balign = new_thr->th.th_bar;
4389   for (b = 0; b < bs_last_barrier; ++b) {
4390     balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4391     balign[b].bb.team = NULL;
4392     balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4393     balign[b].bb.use_oncore_barrier = 0;
4394   }
4395 
4396   new_thr->th.th_spin_here = FALSE;
4397   new_thr->th.th_next_waiting = 0;
4398 #if KMP_OS_UNIX
4399   new_thr->th.th_blocking = false;
4400 #endif
4401 
4402 #if KMP_AFFINITY_SUPPORTED
4403   new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4404   new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4405   new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4406   new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4407 #endif
4408   new_thr->th.th_def_allocator = __kmp_def_allocator;
4409   new_thr->th.th_prev_level = 0;
4410   new_thr->th.th_prev_num_threads = 1;
4411 
4412   TCW_4(new_thr->th.th_in_pool, FALSE);
4413   new_thr->th.th_active_in_pool = FALSE;
4414   TCW_4(new_thr->th.th_active, TRUE);
4415 
4416   /* adjust the global counters */
4417   __kmp_all_nth++;
4418   __kmp_nth++;
4419 
4420   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4421   // numbers of procs, and method #2 (keyed API call) for higher numbers.
4422   if (__kmp_adjust_gtid_mode) {
4423     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4424       if (TCR_4(__kmp_gtid_mode) != 2) {
4425         TCW_4(__kmp_gtid_mode, 2);
4426       }
4427     } else {
4428       if (TCR_4(__kmp_gtid_mode) != 1) {
4429         TCW_4(__kmp_gtid_mode, 1);
4430       }
4431     }
4432   }
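  // Illustrative example (hedged, hypothetical numbers): if __kmp_tls_gtid_min
  // were 20, creating the 20th thread would flip __kmp_gtid_mode from 1
  // (stack-pointer search) to 2 (keyed/TLS lookup), while smaller thread
  // counts keep the cheaper stack-pointer search.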
4433 
4434 #ifdef KMP_ADJUST_BLOCKTIME
4435   /* Adjust blocktime back to zero if necessary       */
4436   /* Middle initialization might not have occurred yet */
4437   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4438     if (__kmp_nth > __kmp_avail_proc) {
4439       __kmp_zero_bt = TRUE;
4440     }
4441   }
4442 #endif /* KMP_ADJUST_BLOCKTIME */
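  // Worked example (hedged): with __kmp_avail_proc == 8 and KMP_BLOCKTIME not
  // set in the environment, registering a 9th thread oversubscribes the
  // machine, so __kmp_zero_bt is set and idle threads stop spin-waiting
  // right away instead of spinning for the default blocktime.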
4443 
4444   /* actually fork it and create the new worker thread */
4445   KF_TRACE(
4446       10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4447   __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4448   KF_TRACE(10,
4449            ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4450 
4451   KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4452                 new_gtid));
4453   KMP_MB();
4454   return new_thr;
4455 }
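/* Usage sketch (hedged, user-level view): the pool-vs-fork choice above is
   what keeps thread creation off the critical path once the runtime has warmed
   up.  Roughly, for a program like

     #pragma omp parallel num_threads(8)   // may fork fresh workers
     { work(); }
     #pragma omp parallel num_threads(4)   // smaller team: extra threads may
     { work(); }                           //   be returned to the pool
     #pragma omp parallel num_threads(8)   // pooled threads are reused here
     { work(); }                           //   rather than forked again

   work() is a placeholder; the precise reuse pattern also depends on the
   hot-team logic in __kmp_allocate_team() below. */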
4456 
/* Reinitialize team for reuse.
   The hot-team code calls this routine at every fork barrier, so EPCC barrier
   tests are extremely sensitive to changes in it, especially writes to the
   team struct, which cause a cache invalidation in all threads.
   IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4462 static void __kmp_reinitialize_team(kmp_team_t *team,
4463                                     kmp_internal_control_t *new_icvs,
4464                                     ident_t *loc) {
4465   KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4466                 team->t.t_threads[0], team));
4467   KMP_DEBUG_ASSERT(team && new_icvs);
4468   KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4469   KMP_CHECK_UPDATE(team->t.t_ident, loc);
4470 
4471   KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4472   // Copy ICVs to the master thread's implicit taskdata
4473   __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4474   copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4475 
4476   KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4477                 team->t.t_threads[0], team));
4478 }
4479 
4480 /* Initialize the team data structure.
4481    This assumes the t_threads and t_max_nproc are already set.
4482    Also, we don't touch the arguments */
4483 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4484                                   kmp_internal_control_t *new_icvs,
4485                                   ident_t *loc) {
4486   KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4487 
4488   /* verify */
4489   KMP_DEBUG_ASSERT(team);
4490   KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4491   KMP_DEBUG_ASSERT(team->t.t_threads);
4492   KMP_MB();
4493 
4494   team->t.t_master_tid = 0; /* not needed */
4495   /* team->t.t_master_bar;        not needed */
4496   team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4497   team->t.t_nproc = new_nproc;
4498 
4499   /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
4500   team->t.t_next_pool = NULL;
4501   /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4502    * up hot team */
4503 
4504   TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4505   team->t.t_invoke = NULL; /* not needed */
4506 
4507   // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
4508   team->t.t_sched.sched = new_icvs->sched.sched;
4509 
4510 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4511   team->t.t_fp_control_saved = FALSE; /* not needed */
4512   team->t.t_x87_fpu_control_word = 0; /* not needed */
4513   team->t.t_mxcsr = 0; /* not needed */
4514 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4515 
4516   team->t.t_construct = 0;
4517 
4518   team->t.t_ordered.dt.t_value = 0;
4519   team->t.t_master_active = FALSE;
4520 
4521 #ifdef KMP_DEBUG
4522   team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4523 #endif
4524 #if KMP_OS_WINDOWS
4525   team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4526 #endif
4527 
4528   team->t.t_control_stack_top = NULL;
4529 
4530   __kmp_reinitialize_team(team, new_icvs, loc);
4531 
4532   KMP_MB();
4533   KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4534 }
4535 
4536 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
/* Saves the thread's current affinity mask into *old_mask (if non-NULL) and
   sets the full mask for the thread; no changes to affinity structures. */
4538 static void
4539 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4540   if (KMP_AFFINITY_CAPABLE()) {
4541     int status;
4542     if (old_mask != NULL) {
4543       status = __kmp_get_system_affinity(old_mask, TRUE);
4544       int error = errno;
4545       if (status != 0) {
4546         __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4547                     __kmp_msg_null);
4548       }
4549     }
4550     __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4551   }
4552 }
4553 #endif
4554 
4555 #if KMP_AFFINITY_SUPPORTED
4556 
4557 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
// It calculates the worker + master thread's partition based upon the parent
// thread's partition, and binds each worker to a place in its partition.
4560 // The master thread's partition should already include its current binding.
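// Usage sketch (hedged): the partition split here is the master's own place
// partition, which ultimately comes from the user's affinity settings, e.g.
//
//   OMP_PLACES=cores OMP_PROC_BIND=spread ./app
//
// or a per-region clause such as
//
//   #pragma omp parallel proc_bind(close) num_threads(8)
//
// Each policy below (master/close/spread) only narrows th_first_place /
// th_last_place / th_new_place; the actual rebinding is applied elsewhere,
// once a thread sees th_new_place differ from th_current_place.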
4561 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4562   // Copy the master thread's place partition to the team struct
4563   kmp_info_t *master_th = team->t.t_threads[0];
4564   KMP_DEBUG_ASSERT(master_th != NULL);
4565   kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4566   int first_place = master_th->th.th_first_place;
4567   int last_place = master_th->th.th_last_place;
4568   int masters_place = master_th->th.th_current_place;
4569   team->t.t_first_place = first_place;
4570   team->t.t_last_place = last_place;
4571 
4572   KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4573                 "bound to place %d partition = [%d,%d]\n",
4574                 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4575                 team->t.t_id, masters_place, first_place, last_place));
4576 
4577   switch (proc_bind) {
4578 
4579   case proc_bind_default:
4580     // serial teams might have the proc_bind policy set to proc_bind_default. It
4581     // doesn't matter, as we don't rebind master thread for any proc_bind policy
4582     KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4583     break;
4584 
4585   case proc_bind_master: {
4586     int f;
4587     int n_th = team->t.t_nproc;
4588     for (f = 1; f < n_th; f++) {
4589       kmp_info_t *th = team->t.t_threads[f];
4590       KMP_DEBUG_ASSERT(th != NULL);
4591       th->th.th_first_place = first_place;
4592       th->th.th_last_place = last_place;
4593       th->th.th_new_place = masters_place;
4594       if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4595           team->t.t_display_affinity != 1) {
4596         team->t.t_display_affinity = 1;
4597       }
4598 
4599       KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
4600                      "partition = [%d,%d]\n",
4601                      __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4602                      f, masters_place, first_place, last_place));
4603     }
4604   } break;
4605 
4606   case proc_bind_close: {
4607     int f;
4608     int n_th = team->t.t_nproc;
4609     int n_places;
4610     if (first_place <= last_place) {
4611       n_places = last_place - first_place + 1;
4612     } else {
4613       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4614     }
4615     if (n_th <= n_places) {
4616       int place = masters_place;
4617       for (f = 1; f < n_th; f++) {
4618         kmp_info_t *th = team->t.t_threads[f];
4619         KMP_DEBUG_ASSERT(th != NULL);
4620 
4621         if (place == last_place) {
4622           place = first_place;
4623         } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4624           place = 0;
4625         } else {
4626           place++;
4627         }
4628         th->th.th_first_place = first_place;
4629         th->th.th_last_place = last_place;
4630         th->th.th_new_place = place;
4631         if (__kmp_display_affinity && place != th->th.th_current_place &&
4632             team->t.t_display_affinity != 1) {
4633           team->t.t_display_affinity = 1;
4634         }
4635 
4636         KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4637                        "partition = [%d,%d]\n",
4638                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4639                        team->t.t_id, f, place, first_place, last_place));
4640       }
4641     } else {
4642       int S, rem, gap, s_count;
4643       S = n_th / n_places;
4644       s_count = 0;
4645       rem = n_th - (S * n_places);
4646       gap = rem > 0 ? n_places / rem : n_places;
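      // Worked example (hedged): with n_th == 10 and n_places == 4 this gives
      // S == 2, rem == 2, gap == 2, and the loop below lands 3,2,3,2 threads
      // on consecutive places starting from the master's place; the two
      // "extra" threads go to every gap-th place.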
4647       int place = masters_place;
4648       int gap_ct = gap;
4649       for (f = 0; f < n_th; f++) {
4650         kmp_info_t *th = team->t.t_threads[f];
4651         KMP_DEBUG_ASSERT(th != NULL);
4652 
4653         th->th.th_first_place = first_place;
4654         th->th.th_last_place = last_place;
4655         th->th.th_new_place = place;
4656         if (__kmp_display_affinity && place != th->th.th_current_place &&
4657             team->t.t_display_affinity != 1) {
4658           team->t.t_display_affinity = 1;
4659         }
4660         s_count++;
4661 
4662         if ((s_count == S) && rem && (gap_ct == gap)) {
4663           // do nothing, add an extra thread to place on next iteration
4664         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4665           // we added an extra thread to this place; move to next place
4666           if (place == last_place) {
4667             place = first_place;
4668           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4669             place = 0;
4670           } else {
4671             place++;
4672           }
4673           s_count = 0;
4674           gap_ct = 1;
4675           rem--;
4676         } else if (s_count == S) { // place full; don't add extra
4677           if (place == last_place) {
4678             place = first_place;
4679           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4680             place = 0;
4681           } else {
4682             place++;
4683           }
4684           gap_ct++;
4685           s_count = 0;
4686         }
4687 
4688         KA_TRACE(100,
4689                  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4690                   "partition = [%d,%d]\n",
4691                   __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4692                   th->th.th_new_place, first_place, last_place));
4693       }
4694       KMP_DEBUG_ASSERT(place == masters_place);
4695     }
4696   } break;
4697 
4698   case proc_bind_spread: {
4699     int f;
4700     int n_th = team->t.t_nproc;
4701     int n_places;
4702     int thidx;
4703     if (first_place <= last_place) {
4704       n_places = last_place - first_place + 1;
4705     } else {
4706       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4707     }
4708     if (n_th <= n_places) {
4709       int place = -1;
4710 
4711       if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4712         int S = n_places / n_th;
4713         int s_count, rem, gap, gap_ct;
4714 
4715         place = masters_place;
4716         rem = n_places - n_th * S;
4717         gap = rem ? n_th / rem : 1;
4718         gap_ct = gap;
4719         thidx = n_th;
4720         if (update_master_only == 1)
4721           thidx = 1;
4722         for (f = 0; f < thidx; f++) {
4723           kmp_info_t *th = team->t.t_threads[f];
4724           KMP_DEBUG_ASSERT(th != NULL);
4725 
4726           th->th.th_first_place = place;
4727           th->th.th_new_place = place;
4728           if (__kmp_display_affinity && place != th->th.th_current_place &&
4729               team->t.t_display_affinity != 1) {
4730             team->t.t_display_affinity = 1;
4731           }
4732           s_count = 1;
4733           while (s_count < S) {
4734             if (place == last_place) {
4735               place = first_place;
4736             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4737               place = 0;
4738             } else {
4739               place++;
4740             }
4741             s_count++;
4742           }
4743           if (rem && (gap_ct == gap)) {
4744             if (place == last_place) {
4745               place = first_place;
4746             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4747               place = 0;
4748             } else {
4749               place++;
4750             }
4751             rem--;
4752             gap_ct = 0;
4753           }
4754           th->th.th_last_place = place;
4755           gap_ct++;
4756 
4757           if (place == last_place) {
4758             place = first_place;
4759           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4760             place = 0;
4761           } else {
4762             place++;
4763           }
4764 
4765           KA_TRACE(100,
4766                    ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4767                     "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4768                     __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4769                     f, th->th.th_new_place, th->th.th_first_place,
4770                     th->th.th_last_place, __kmp_affinity_num_masks));
4771         }
4772       } else {
        /* Given a uniform space of available computation places, we can
           create T partitions of round(P/T) size and put threads into the
           first place of each partition. */
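        /* Worked example (hedged): with n_places == 8, n_th == 3 and the
           master on place 0, spacing == 3.0 and the loop below produces the
           partitions [0,2], [3,5] and [6,7] (the last one clamped to
           n_places - 1), with each thread bound to the first place of its
           partition. */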
4776         double current = static_cast<double>(masters_place);
4777         double spacing =
4778             (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
4779         int first, last;
4780         kmp_info_t *th;
4781 
4782         thidx = n_th + 1;
4783         if (update_master_only == 1)
4784           thidx = 1;
4785         for (f = 0; f < thidx; f++) {
4786           first = static_cast<int>(current);
4787           last = static_cast<int>(current + spacing) - 1;
4788           KMP_DEBUG_ASSERT(last >= first);
4789           if (first >= n_places) {
4790             if (masters_place) {
4791               first -= n_places;
4792               last -= n_places;
4793               if (first == (masters_place + 1)) {
4794                 KMP_DEBUG_ASSERT(f == n_th);
4795                 first--;
4796               }
4797               if (last == masters_place) {
4798                 KMP_DEBUG_ASSERT(f == (n_th - 1));
4799                 last--;
4800               }
4801             } else {
4802               KMP_DEBUG_ASSERT(f == n_th);
4803               first = 0;
4804               last = 0;
4805             }
4806           }
4807           if (last >= n_places) {
4808             last = (n_places - 1);
4809           }
4810           place = first;
4811           current += spacing;
4812           if (f < n_th) {
4813             KMP_DEBUG_ASSERT(0 <= first);
4814             KMP_DEBUG_ASSERT(n_places > first);
4815             KMP_DEBUG_ASSERT(0 <= last);
4816             KMP_DEBUG_ASSERT(n_places > last);
4817             KMP_DEBUG_ASSERT(last_place >= first_place);
4818             th = team->t.t_threads[f];
4819             KMP_DEBUG_ASSERT(th);
4820             th->th.th_first_place = first;
4821             th->th.th_new_place = place;
4822             th->th.th_last_place = last;
4823             if (__kmp_display_affinity && place != th->th.th_current_place &&
4824                 team->t.t_display_affinity != 1) {
4825               team->t.t_display_affinity = 1;
4826             }
4827             KA_TRACE(100,
4828                      ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4829                       "partition = [%d,%d], spacing = %.4f\n",
4830                       __kmp_gtid_from_thread(team->t.t_threads[f]),
4831                       team->t.t_id, f, th->th.th_new_place,
4832                       th->th.th_first_place, th->th.th_last_place, spacing));
4833           }
4834         }
4835       }
4836       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4837     } else {
4838       int S, rem, gap, s_count;
4839       S = n_th / n_places;
4840       s_count = 0;
4841       rem = n_th - (S * n_places);
4842       gap = rem > 0 ? n_places / rem : n_places;
4843       int place = masters_place;
4844       int gap_ct = gap;
4845       thidx = n_th;
4846       if (update_master_only == 1)
4847         thidx = 1;
4848       for (f = 0; f < thidx; f++) {
4849         kmp_info_t *th = team->t.t_threads[f];
4850         KMP_DEBUG_ASSERT(th != NULL);
4851 
4852         th->th.th_first_place = place;
4853         th->th.th_last_place = place;
4854         th->th.th_new_place = place;
4855         if (__kmp_display_affinity && place != th->th.th_current_place &&
4856             team->t.t_display_affinity != 1) {
4857           team->t.t_display_affinity = 1;
4858         }
4859         s_count++;
4860 
4861         if ((s_count == S) && rem && (gap_ct == gap)) {
4862           // do nothing, add an extra thread to place on next iteration
4863         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4864           // we added an extra thread to this place; move on to next place
4865           if (place == last_place) {
4866             place = first_place;
4867           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4868             place = 0;
4869           } else {
4870             place++;
4871           }
4872           s_count = 0;
4873           gap_ct = 1;
4874           rem--;
4875         } else if (s_count == S) { // place is full; don't add extra thread
4876           if (place == last_place) {
4877             place = first_place;
4878           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4879             place = 0;
4880           } else {
4881             place++;
4882           }
4883           gap_ct++;
4884           s_count = 0;
4885         }
4886 
4887         KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4888                        "partition = [%d,%d]\n",
4889                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4890                        team->t.t_id, f, th->th.th_new_place,
4891                        th->th.th_first_place, th->th.th_last_place));
4892       }
4893       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4894     }
4895   } break;
4896 
4897   default:
4898     break;
4899   }
4900 
4901   KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4902 }
4903 
4904 #endif // KMP_AFFINITY_SUPPORTED
4905 
4906 /* allocate a new team data structure to use.  take one off of the free pool if
4907    available */
4908 kmp_team_t *
4909 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4910 #if OMPT_SUPPORT
4911                     ompt_data_t ompt_parallel_data,
4912 #endif
4913                     kmp_proc_bind_t new_proc_bind,
4914                     kmp_internal_control_t *new_icvs,
4915                     int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4916   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4917   int f;
4918   kmp_team_t *team;
4919   int use_hot_team = !root->r.r_active;
4920   int level = 0;
4921 
4922   KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4923   KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4924   KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4925   KMP_MB();
4926 
4927 #if KMP_NESTED_HOT_TEAMS
4928   kmp_hot_team_ptr_t *hot_teams;
4929   if (master) {
4930     team = master->th.th_team;
4931     level = team->t.t_active_level;
4932     if (master->th.th_teams_microtask) { // in teams construct?
4933       if (master->th.th_teams_size.nteams > 1 &&
4934           ( // #teams > 1
4935               team->t.t_pkfn ==
4936                   (microtask_t)__kmp_teams_master || // inner fork of the teams
4937               master->th.th_teams_level <
4938                   team->t.t_level)) { // or nested parallel inside the teams
4939         ++level; // not increment if #teams==1, or for outer fork of the teams;
4940         // increment otherwise
4941       }
4942     }
4943     hot_teams = master->th.th_hot_teams;
4944     if (level < __kmp_hot_teams_max_level && hot_teams &&
4945         hot_teams[level].hot_team) {
4946       // hot team has already been allocated for given level
4947       use_hot_team = 1;
4948     } else {
4949       use_hot_team = 0;
4950     }
4951   } else {
4952     // check we won't access uninitialized hot_teams, just in case
4953     KMP_DEBUG_ASSERT(new_nproc == 1);
4954   }
4955 #endif
4956   // Optimization to use a "hot" team
4957   if (use_hot_team && new_nproc > 1) {
4958     KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
4959 #if KMP_NESTED_HOT_TEAMS
4960     team = hot_teams[level].hot_team;
4961 #else
4962     team = root->r.r_hot_team;
4963 #endif
4964 #if KMP_DEBUG
4965     if (__kmp_tasking_mode != tskm_immediate_exec) {
4966       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
4967                     "task_team[1] = %p before reinit\n",
4968                     team->t.t_task_team[0], team->t.t_task_team[1]));
4969     }
4970 #endif
4971 
4972     // Has the number of threads changed?
4973     /* Let's assume the most common case is that the number of threads is
4974        unchanged, and put that case first. */
4975     if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
4976       KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
4977       // This case can mean that omp_set_num_threads() was called and the hot
4978       // team size was already reduced, so we check the special flag
4979       if (team->t.t_size_changed == -1) {
4980         team->t.t_size_changed = 1;
4981       } else {
4982         KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
4983       }
4984 
4985       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4986       kmp_r_sched_t new_sched = new_icvs->sched;
4987       // set master's schedule as new run-time schedule
4988       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
4989 
4990       __kmp_reinitialize_team(team, new_icvs,
4991                               root->r.r_uber_thread->th.th_ident);
4992 
4993       KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
4994                     team->t.t_threads[0], team));
4995       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
4996 
4997 #if KMP_AFFINITY_SUPPORTED
4998       if ((team->t.t_size_changed == 0) &&
4999           (team->t.t_proc_bind == new_proc_bind)) {
5000         if (new_proc_bind == proc_bind_spread) {
5001           __kmp_partition_places(
5002               team, 1); // add flag to update only master for spread
5003         }
5004         KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5005                        "proc_bind = %d, partition = [%d,%d]\n",
5006                        team->t.t_id, new_proc_bind, team->t.t_first_place,
5007                        team->t.t_last_place));
5008       } else {
5009         KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5010         __kmp_partition_places(team);
5011       }
5012 #else
5013       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5014 #endif /* KMP_AFFINITY_SUPPORTED */
5015     } else if (team->t.t_nproc > new_nproc) {
5016       KA_TRACE(20,
5017                ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5018                 new_nproc));
5019 
5020       team->t.t_size_changed = 1;
5021 #if KMP_NESTED_HOT_TEAMS
5022       if (__kmp_hot_teams_mode == 0) {
        // AC: the saved number of threads should correspond to the team's
        // value in this mode; it can be bigger in mode 1, when the hot team
        // has threads in reserve.
5025         KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5026         hot_teams[level].hot_team_nth = new_nproc;
5027 #endif // KMP_NESTED_HOT_TEAMS
5028         /* release the extra threads we don't need any more */
5029         for (f = new_nproc; f < team->t.t_nproc; f++) {
5030           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5031           if (__kmp_tasking_mode != tskm_immediate_exec) {
5032             // When decreasing team size, threads no longer in the team should
5033             // unref task team.
5034             team->t.t_threads[f]->th.th_task_team = NULL;
5035           }
5036           __kmp_free_thread(team->t.t_threads[f]);
5037           team->t.t_threads[f] = NULL;
5038         }
5039 #if KMP_NESTED_HOT_TEAMS
5040       } // (__kmp_hot_teams_mode == 0)
5041       else {
5042         // When keeping extra threads in team, switch threads to wait on own
5043         // b_go flag
5044         for (f = new_nproc; f < team->t.t_nproc; ++f) {
5045           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5046           kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5047           for (int b = 0; b < bs_last_barrier; ++b) {
5048             if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5049               balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5050             }
5051             KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5052           }
5053         }
5054       }
5055 #endif // KMP_NESTED_HOT_TEAMS
5056       team->t.t_nproc = new_nproc;
5057       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5058       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5059       __kmp_reinitialize_team(team, new_icvs,
5060                               root->r.r_uber_thread->th.th_ident);
5061 
5062       // Update remaining threads
5063       for (f = 0; f < new_nproc; ++f) {
5064         team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5065       }
5066 
5067       // restore the current task state of the master thread: should be the
5068       // implicit task
5069       KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5070                     team->t.t_threads[0], team));
5071 
5072       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5073 
5074 #ifdef KMP_DEBUG
5075       for (f = 0; f < team->t.t_nproc; f++) {
5076         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5077                          team->t.t_threads[f]->th.th_team_nproc ==
5078                              team->t.t_nproc);
5079       }
5080 #endif
5081 
5082       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5083 #if KMP_AFFINITY_SUPPORTED
5084       __kmp_partition_places(team);
5085 #endif
5086     } else { // team->t.t_nproc < new_nproc
5087 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5088       kmp_affin_mask_t *old_mask;
5089       if (KMP_AFFINITY_CAPABLE()) {
5090         KMP_CPU_ALLOC(old_mask);
5091       }
5092 #endif
5093 
5094       KA_TRACE(20,
5095                ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5096                 new_nproc));
5097 
5098       team->t.t_size_changed = 1;
5099 
5100 #if KMP_NESTED_HOT_TEAMS
5101       int avail_threads = hot_teams[level].hot_team_nth;
5102       if (new_nproc < avail_threads)
5103         avail_threads = new_nproc;
5104       kmp_info_t **other_threads = team->t.t_threads;
5105       for (f = team->t.t_nproc; f < avail_threads; ++f) {
5106         // Adjust barrier data of reserved threads (if any) of the team
5107         // Other data will be set in __kmp_initialize_info() below.
5108         int b;
5109         kmp_balign_t *balign = other_threads[f]->th.th_bar;
5110         for (b = 0; b < bs_last_barrier; ++b) {
5111           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5112           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5113 #if USE_DEBUGGER
5114           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5115 #endif
5116         }
5117       }
5118       if (hot_teams[level].hot_team_nth >= new_nproc) {
        // we have all needed threads in reserve, no need to allocate any more;
        // this is only possible in mode 1, as there cannot be reserved threads
        // in mode 0
5121         KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5122         team->t.t_nproc = new_nproc; // just get reserved threads involved
5123       } else {
5124         // we may have some threads in reserve, but not enough
5125         team->t.t_nproc =
5126             hot_teams[level]
5127                 .hot_team_nth; // get reserved threads involved if any
5128         hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5129 #endif // KMP_NESTED_HOT_TEAMS
5130         if (team->t.t_max_nproc < new_nproc) {
5131           /* reallocate larger arrays */
5132           __kmp_reallocate_team_arrays(team, new_nproc);
5133           __kmp_reinitialize_team(team, new_icvs, NULL);
5134         }
5135 
5136 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
        /* Temporarily set full mask for master thread before creation of
           workers. The reason is that workers inherit the affinity from the
           master, so if a lot of workers are created on a single core quickly,
           they don't get a chance to set their own affinity for a long time. */
5141         __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5142 #endif
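        /* Sketch of the surrounding pattern (hedged): the caller allocated
           old_mask with KMP_CPU_ALLOC above, this call saved the master's mask
           into it and widened the binding to the full mask, and once the loop
           below has created the workers the original mask is restored via
           __kmp_set_system_affinity() and freed with KMP_CPU_FREE. */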
5143 
5144         /* allocate new threads for the hot team */
5145         for (f = team->t.t_nproc; f < new_nproc; f++) {
5146           kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5147           KMP_DEBUG_ASSERT(new_worker);
5148           team->t.t_threads[f] = new_worker;
5149 
5150           KA_TRACE(20,
                   ("__kmp_allocate_team: team %d init T#%d(%d:%d) arrived: "
5152                     "join=%llu, plain=%llu\n",
5153                     team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5154                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5155                     team->t.t_bar[bs_plain_barrier].b_arrived));
5156 
5157           { // Initialize barrier data for new threads.
5158             int b;
5159             kmp_balign_t *balign = new_worker->th.th_bar;
5160             for (b = 0; b < bs_last_barrier; ++b) {
5161               balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5162               KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5163                                KMP_BARRIER_PARENT_FLAG);
5164 #if USE_DEBUGGER
5165               balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5166 #endif
5167             }
5168           }
5169         }
5170 
5171 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5172         if (KMP_AFFINITY_CAPABLE()) {
5173           /* Restore initial master thread's affinity mask */
5174           __kmp_set_system_affinity(old_mask, TRUE);
5175           KMP_CPU_FREE(old_mask);
5176         }
5177 #endif
5178 #if KMP_NESTED_HOT_TEAMS
5179       } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5180 #endif // KMP_NESTED_HOT_TEAMS
      /* make sure everyone is synchronized */
5182       int old_nproc = team->t.t_nproc; // save old value and use to update only
5183       // new threads below
5184       __kmp_initialize_team(team, new_nproc, new_icvs,
5185                             root->r.r_uber_thread->th.th_ident);
5186 
5187       /* reinitialize the threads */
5188       KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5189       for (f = 0; f < team->t.t_nproc; ++f)
5190         __kmp_initialize_info(team->t.t_threads[f], team, f,
5191                               __kmp_gtid_from_tid(f, team));
5192 
5193       if (level) { // set th_task_state for new threads in nested hot team
5194         // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5195         // only need to set the th_task_state for the new threads. th_task_state
5196         // for master thread will not be accurate until after this in
5197         // __kmp_fork_call(), so we look to the master's memo_stack to get the
5198         // correct value.
5199         for (f = old_nproc; f < team->t.t_nproc; ++f)
5200           team->t.t_threads[f]->th.th_task_state =
5201               team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5202       } else { // set th_task_state for new threads in non-nested hot team
5203         int old_state =
5204             team->t.t_threads[0]->th.th_task_state; // copy master's state
5205         for (f = old_nproc; f < team->t.t_nproc; ++f)
5206           team->t.t_threads[f]->th.th_task_state = old_state;
5207       }
5208 
5209 #ifdef KMP_DEBUG
5210       for (f = 0; f < team->t.t_nproc; ++f) {
5211         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5212                          team->t.t_threads[f]->th.th_team_nproc ==
5213                              team->t.t_nproc);
5214       }
5215 #endif
5216 
5217       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5218 #if KMP_AFFINITY_SUPPORTED
5219       __kmp_partition_places(team);
5220 #endif
5221     } // Check changes in number of threads
5222 
5223     kmp_info_t *master = team->t.t_threads[0];
5224     if (master->th.th_teams_microtask) {
5225       for (f = 1; f < new_nproc; ++f) {
5226         // propagate teams construct specific info to workers
5227         kmp_info_t *thr = team->t.t_threads[f];
5228         thr->th.th_teams_microtask = master->th.th_teams_microtask;
5229         thr->th.th_teams_level = master->th.th_teams_level;
5230         thr->th.th_teams_size = master->th.th_teams_size;
5231       }
5232     }
5233 #if KMP_NESTED_HOT_TEAMS
5234     if (level) {
5235       // Sync barrier state for nested hot teams, not needed for outermost hot
5236       // team.
5237       for (f = 1; f < new_nproc; ++f) {
5238         kmp_info_t *thr = team->t.t_threads[f];
5239         int b;
5240         kmp_balign_t *balign = thr->th.th_bar;
5241         for (b = 0; b < bs_last_barrier; ++b) {
5242           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5243           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5244 #if USE_DEBUGGER
5245           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5246 #endif
5247         }
5248       }
5249     }
5250 #endif // KMP_NESTED_HOT_TEAMS
5251 
5252     /* reallocate space for arguments if necessary */
5253     __kmp_alloc_argv_entries(argc, team, TRUE);
5254     KMP_CHECK_UPDATE(team->t.t_argc, argc);
5255     // The hot team re-uses the previous task team,
5256     // if untouched during the previous release->gather phase.
5257 
5258     KF_TRACE(10, (" hot_team = %p\n", team));
5259 
5260 #if KMP_DEBUG
5261     if (__kmp_tasking_mode != tskm_immediate_exec) {
5262       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5263                     "task_team[1] = %p after reinit\n",
5264                     team->t.t_task_team[0], team->t.t_task_team[1]));
5265     }
5266 #endif
5267 
5268 #if OMPT_SUPPORT
5269     __ompt_team_assign_id(team, ompt_parallel_data);
5270 #endif
5271 
5272     KMP_MB();
5273 
5274     return team;
5275   }
5276 
5277   /* next, let's try to take one from the team pool */
5278   KMP_MB();
5279   for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5280     /* TODO: consider resizing undersized teams instead of reaping them, now
5281        that we have a resizing mechanism */
5282     if (team->t.t_max_nproc >= max_nproc) {
5283       /* take this team from the team pool */
5284       __kmp_team_pool = team->t.t_next_pool;
5285 
5286       /* setup the team for fresh use */
5287       __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5288 
5289       KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5290                     "task_team[1] %p to NULL\n",
5291                     &team->t.t_task_team[0], &team->t.t_task_team[1]));
5292       team->t.t_task_team[0] = NULL;
5293       team->t.t_task_team[1] = NULL;
5294 
5295       /* reallocate space for arguments if necessary */
5296       __kmp_alloc_argv_entries(argc, team, TRUE);
5297       KMP_CHECK_UPDATE(team->t.t_argc, argc);
5298 
5299       KA_TRACE(
5300           20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5301                team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5302       { // Initialize barrier data.
5303         int b;
5304         for (b = 0; b < bs_last_barrier; ++b) {
5305           team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5306 #if USE_DEBUGGER
5307           team->t.t_bar[b].b_master_arrived = 0;
5308           team->t.t_bar[b].b_team_arrived = 0;
5309 #endif
5310         }
5311       }
5312 
5313       team->t.t_proc_bind = new_proc_bind;
5314 
5315       KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5316                     team->t.t_id));
5317 
5318 #if OMPT_SUPPORT
5319       __ompt_team_assign_id(team, ompt_parallel_data);
5320 #endif
5321 
5322       KMP_MB();
5323 
5324       return team;
5325     }
5326 
5327     /* reap team if it is too small, then loop back and check the next one */
    // not sure if this is wise, but it will be redone during the hot-teams
    // rewrite.
5330     /* TODO: Use technique to find the right size hot-team, don't reap them */
5331     team = __kmp_reap_team(team);
5332     __kmp_team_pool = team;
5333   }
5334 
5335   /* nothing available in the pool, no matter, make a new team! */
5336   KMP_MB();
5337   team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5338 
5339   /* and set it up */
5340   team->t.t_max_nproc = max_nproc;
  /* NOTE: for some reason, allocating one big buffer and dividing it up
     seems to really hurt performance on the P4, so let's not use this. */
5343   __kmp_allocate_team_arrays(team, max_nproc);
5344 
5345   KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5346   __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5347 
5348   KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5349                 "%p to NULL\n",
5350                 &team->t.t_task_team[0], &team->t.t_task_team[1]));
  // To be removed: __kmp_allocate zeroes memory, so these NULL stores are
  // redundant.
  team->t.t_task_team[0] = NULL;
  team->t.t_task_team[1] = NULL;
5355 
5356   if (__kmp_storage_map) {
5357     __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5358   }
5359 
5360   /* allocate space for arguments */
5361   __kmp_alloc_argv_entries(argc, team, FALSE);
5362   team->t.t_argc = argc;
5363 
5364   KA_TRACE(20,
5365            ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5366             team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5367   { // Initialize barrier data.
5368     int b;
5369     for (b = 0; b < bs_last_barrier; ++b) {
5370       team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5371 #if USE_DEBUGGER
5372       team->t.t_bar[b].b_master_arrived = 0;
5373       team->t.t_bar[b].b_team_arrived = 0;
5374 #endif
5375     }
5376   }
5377 
5378   team->t.t_proc_bind = new_proc_bind;
5379 
5380 #if OMPT_SUPPORT
5381   __ompt_team_assign_id(team, ompt_parallel_data);
5382   team->t.ompt_serialized_team_info = NULL;
5383 #endif
5384 
5385   KMP_MB();
5386 
5387   KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5388                 team->t.t_id));
5389 
5390   return team;
5391 }
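/* Usage sketch (hedged, user-level view): the fast path at the top of
   __kmp_allocate_team() is the "hot team" optimization.  For a loop such as

     for (int i = 0; i < n; ++i) {
       #pragma omp parallel num_threads(8)
       { work(); }                        // work() is a placeholder
     }

   only the first iteration pays for full team/thread setup; later iterations
   hit the t_nproc == new_nproc branch and mostly just refresh ICVs and the
   schedule.  Calling omp_set_num_threads() with a different value between
   iterations takes the shrink/grow branches instead. */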
5392 
5393 /* TODO implement hot-teams at all levels */
5394 /* TODO implement lazy thread release on demand (disband request) */
5395 
5396 /* free the team.  return it to the team pool.  release all the threads
5397  * associated with it */
5398 void __kmp_free_team(kmp_root_t *root,
5399                      kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5400   int f;
5401   KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5402                 team->t.t_id));
5403 
5404   /* verify state */
5405   KMP_DEBUG_ASSERT(root);
5406   KMP_DEBUG_ASSERT(team);
5407   KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5408   KMP_DEBUG_ASSERT(team->t.t_threads);
5409 
5410   int use_hot_team = team == root->r.r_hot_team;
5411 #if KMP_NESTED_HOT_TEAMS
5412   int level;
5413   kmp_hot_team_ptr_t *hot_teams;
5414   if (master) {
5415     level = team->t.t_active_level - 1;
5416     if (master->th.th_teams_microtask) { // in teams construct?
5417       if (master->th.th_teams_size.nteams > 1) {
5418         ++level; // level was not increased in teams construct for
5419         // team_of_masters
5420       }
5421       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5422           master->th.th_teams_level == team->t.t_level) {
5423         ++level; // level was not increased in teams construct for
5424         // team_of_workers before the parallel
5425       } // team->t.t_level will be increased inside parallel
5426     }
5427     hot_teams = master->th.th_hot_teams;
5428     if (level < __kmp_hot_teams_max_level) {
5429       KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5430       use_hot_team = 1;
5431     }
5432   }
5433 #endif // KMP_NESTED_HOT_TEAMS
5434 
5435   /* team is done working */
5436   TCW_SYNC_PTR(team->t.t_pkfn,
5437                NULL); // Important for Debugging Support Library.
5438 #if KMP_OS_WINDOWS
5439   team->t.t_copyin_counter = 0; // init counter for possible reuse
5440 #endif
5441   // Do not reset pointer to parent team to NULL for hot teams.
5442 
5443   /* if we are non-hot team, release our threads */
5444   if (!use_hot_team) {
5445     if (__kmp_tasking_mode != tskm_immediate_exec) {
5446       // Wait for threads to reach reapable state
5447       for (f = 1; f < team->t.t_nproc; ++f) {
5448         KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5449         kmp_info_t *th = team->t.t_threads[f];
5450         volatile kmp_uint32 *state = &th->th.th_reap_state;
5451         while (*state != KMP_SAFE_TO_REAP) {
5452 #if KMP_OS_WINDOWS
5453           // On Windows a thread can be killed at any time, check this
5454           DWORD ecode;
5455           if (!__kmp_is_thread_alive(th, &ecode)) {
5456             *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5457             break;
5458           }
5459 #endif
5460           // first check if thread is sleeping
5461           kmp_flag_64 fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5462           if (fl.is_sleeping())
5463             fl.resume(__kmp_gtid_from_thread(th));
5464           KMP_CPU_PAUSE();
5465         }
5466       }
5467 
5468       // Delete task teams
5469       int tt_idx;
5470       for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5471         kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5472         if (task_team != NULL) {
5473           for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5474             KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5475             team->t.t_threads[f]->th.th_task_team = NULL;
5476           }
5477           KA_TRACE(
5478               20,
5479               ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5480                __kmp_get_gtid(), task_team, team->t.t_id));
5481 #if KMP_NESTED_HOT_TEAMS
5482           __kmp_free_task_team(master, task_team);
5483 #endif
5484           team->t.t_task_team[tt_idx] = NULL;
5485         }
5486       }
5487     }
5488 
5489     // Reset pointer to parent team only for non-hot teams.
5490     team->t.t_parent = NULL;
5491     team->t.t_level = 0;
5492     team->t.t_active_level = 0;
5493 
5494     /* free the worker threads */
5495     for (f = 1; f < team->t.t_nproc; ++f) {
5496       KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5497       __kmp_free_thread(team->t.t_threads[f]);
5498       team->t.t_threads[f] = NULL;
5499     }
5500 
5501     /* put the team back in the team pool */
5502     /* TODO limit size of team pool, call reap_team if pool too large */
5503     team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5504     __kmp_team_pool = (volatile kmp_team_t *)team;
5505   } else { // Check if team was created for the masters in a teams construct
5506     // See if first worker is a CG root
5507     KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5508                      team->t.t_threads[1]->th.th_cg_roots);
5509     if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5510       // Clean up the CG root nodes on workers so that this team can be re-used
5511       for (f = 1; f < team->t.t_nproc; ++f) {
5512         kmp_info_t *thr = team->t.t_threads[f];
5513         KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5514                          thr->th.th_cg_roots->cg_root == thr);
5515         // Pop current CG root off list
5516         kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5517         thr->th.th_cg_roots = tmp->up;
5518         KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5519                        " up to node %p. cg_nthreads was %d\n",
5520                        thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5521         int i = tmp->cg_nthreads--;
5522         if (i == 1) {
5523           __kmp_free(tmp); // free CG if we are the last thread in it
5524         }
5525         // Restore current task's thread_limit from CG root
5526         if (thr->th.th_cg_roots)
5527           thr->th.th_current_task->td_icvs.thread_limit =
5528               thr->th.th_cg_roots->cg_thread_limit;
5529       }
5530     }
5531   }
5532 
5533   KMP_MB();
5534 }
5535 
5536 /* reap the team.  destroy it, reclaim all its resources and free its memory */
5537 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5538   kmp_team_t *next_pool = team->t.t_next_pool;
5539 
5540   KMP_DEBUG_ASSERT(team);
5541   KMP_DEBUG_ASSERT(team->t.t_dispatch);
5542   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5543   KMP_DEBUG_ASSERT(team->t.t_threads);
5544   KMP_DEBUG_ASSERT(team->t.t_argv);
5545 
5546   /* TODO clean the threads that are a part of this? */
5547 
5548   /* free stuff */
5549   __kmp_free_team_arrays(team);
5550   if (team->t.t_argv != &team->t.t_inline_argv[0])
5551     __kmp_free((void *)team->t.t_argv);
5552   __kmp_free(team);
5553 
5554   KMP_MB();
5555   return next_pool;
5556 }
5557 
5558 // Free the thread.  Don't reap it, just place it on the pool of available
5559 // threads.
5560 //
5561 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5562 // binding for the affinity mechanism to be useful.
5563 //
5564 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5565 // However, we want to avoid a potential performance problem by always
5566 // scanning through the list to find the correct point at which to insert
5567 // the thread (potential N**2 behavior).  To do this we keep track of the
5568 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5569 // With single-level parallelism, threads will always be added to the tail
5570 // of the list, kept track of by __kmp_thread_pool_insert_pt.  With nested
5571 // parallelism, all bets are off and we may need to scan through the entire
5572 // free list.
5573 //
5574 // This change also has a potentially large performance benefit, for some
5575 // applications.  Previously, as threads were freed from the hot team, they
5576 // would be placed back on the free list in inverse order.  If the hot team
5577 // grew back to its original size, then the freed threads would be placed
5578 // back on the hot team in reverse order.  This could cause bad cache
5579 // locality problems in programs where the size of the hot team regularly
5580 // grew and shrank.
5581 //
5582 // Now, for single-level parallelism, the OMP tid is always == gtid.
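// A minimal, compiled-out sketch of the hint-assisted sorted insertion
// described above (illustrative only; node_t, key and insert_sorted are
// hypothetical stand-ins for kmp_info_t, the gtid and the walk over
// th.th_next_pool starting at __kmp_thread_pool_insert_pt).
#if 0
struct node_t {
  int key;      // plays the role of the gtid
  node_t *next; // plays the role of th.th_next_pool
};

// Insert n into the ascending list *head, starting the scan at the remembered
// hint when the hint has not already moved past the new key.
static void insert_sorted(node_t **head, node_t **hint, node_t *n) {
  node_t **scan = (*hint && (*hint)->key <= n->key) ? &(*hint)->next : head;
  while (*scan != NULL && (*scan)->key < n->key)
    scan = &(*scan)->next; // skip smaller keys
  n->next = *scan; // splice in front of the first larger key
  *scan = n;
  *hint = n; // remember the insertion point for the next call
}
#endif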
5583 void __kmp_free_thread(kmp_info_t *this_th) {
5584   int gtid;
5585   kmp_info_t **scan;
5586 
5587   KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5588                 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5589 
5590   KMP_DEBUG_ASSERT(this_th);
5591 
5592   // When moving a thread to the pool, switch it to wait on its own b_go flag
5593   // and reset its team to uninitialized (NULL).
5594   int b;
5595   kmp_balign_t *balign = this_th->th.th_bar;
5596   for (b = 0; b < bs_last_barrier; ++b) {
5597     if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5598       balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5599     balign[b].bb.team = NULL;
5600     balign[b].bb.leaf_kids = 0;
5601   }
5602   this_th->th.th_task_state = 0;
5603   this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5604 
5605   /* put thread back on the free pool */
5606   TCW_PTR(this_th->th.th_team, NULL);
5607   TCW_PTR(this_th->th.th_root, NULL);
5608   TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5609 
5610   while (this_th->th.th_cg_roots) {
5611     this_th->th.th_cg_roots->cg_nthreads--;
5612     KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5613                    " %p of thread  %p to %d\n",
5614                    this_th, this_th->th.th_cg_roots,
5615                    this_th->th.th_cg_roots->cg_root,
5616                    this_th->th.th_cg_roots->cg_nthreads));
5617     kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5618     if (tmp->cg_root == this_th) { // Thread is a cg_root
5619       KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5620       KA_TRACE(
5621           5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5622       this_th->th.th_cg_roots = tmp->up;
5623       __kmp_free(tmp);
5624     } else { // Worker thread
5625       if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5626         __kmp_free(tmp);
5627       }
5628       this_th->th.th_cg_roots = NULL;
5629       break;
5630     }
5631   }
5632 
5633   /* If the implicit task assigned to this thread can be used by other threads,
5634    * multiple threads may share its data and try to free the task in
5635    * __kmp_reap_thread at exit. This duplicate use of the task data can happen
5636    * with higher probability when the hot team is disabled, but it can occur
5637    * even when the hot team is enabled. */
5638   __kmp_free_implicit_task(this_th);
5639   this_th->th.th_current_task = NULL;
5640 
5641   // If the __kmp_thread_pool_insert_pt is already past the new insert
5642   // point, then we need to re-scan the entire list.
5643   gtid = this_th->th.th_info.ds.ds_gtid;
5644   if (__kmp_thread_pool_insert_pt != NULL) {
5645     KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5646     if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5647       __kmp_thread_pool_insert_pt = NULL;
5648     }
5649   }
5650 
5651   // Scan down the list to find the place to insert the thread.
5652   // scan is the address of a link in the list, possibly the address of
5653   // __kmp_thread_pool itself.
5654   //
5655   // In the absence of nested parallelism, the for loop will have 0 iterations.
5656   if (__kmp_thread_pool_insert_pt != NULL) {
5657     scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5658   } else {
5659     scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5660   }
5661   for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5662        scan = &((*scan)->th.th_next_pool))
5663     ;
5664 
5665   // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5666   // to its address.
5667   TCW_PTR(this_th->th.th_next_pool, *scan);
5668   __kmp_thread_pool_insert_pt = *scan = this_th;
5669   KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5670                    (this_th->th.th_info.ds.ds_gtid <
5671                     this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5672   TCW_4(this_th->th.th_in_pool, TRUE);
5673   __kmp_suspend_initialize_thread(this_th);
5674   __kmp_lock_suspend_mx(this_th);
5675   if (this_th->th.th_active == TRUE) {
5676     KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5677     this_th->th.th_active_in_pool = TRUE;
5678   }
5679 #if KMP_DEBUG
5680   else {
5681     KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5682   }
5683 #endif
5684   __kmp_unlock_suspend_mx(this_th);
5685 
5686   TCW_4(__kmp_nth, __kmp_nth - 1);
5687 
5688 #ifdef KMP_ADJUST_BLOCKTIME
5689   /* Adjust blocktime back to user setting or default if necessary */
5690   /* Middle initialization might never have occurred                */
5691   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5692     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5693     if (__kmp_nth <= __kmp_avail_proc) {
5694       __kmp_zero_bt = FALSE;
5695     }
5696   }
5697 #endif /* KMP_ADJUST_BLOCKTIME */
5698 
5699   KMP_MB();
5700 }
5701 
5702 /* ------------------------------------------------------------------------ */
5703 
5704 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5705   int gtid = this_thr->th.th_info.ds.ds_gtid;
5706   /*    void                 *stack_data;*/
5707   kmp_team_t **volatile pteam;
5708 
5709   KMP_MB();
5710   KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5711 
5712   if (__kmp_env_consistency_check) {
5713     this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5714   }
5715 
5716 #if OMPT_SUPPORT
5717   ompt_data_t *thread_data;
5718   if (ompt_enabled.enabled) {
5719     thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5720     *thread_data = ompt_data_none;
5721 
5722     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5723     this_thr->th.ompt_thread_info.wait_id = 0;
5724     this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5725     this_thr->th.ompt_thread_info.parallel_flags = 0;
5726     if (ompt_enabled.ompt_callback_thread_begin) {
5727       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5728           ompt_thread_worker, thread_data);
5729     }
5730     this_thr->th.ompt_thread_info.state = ompt_state_idle;
5731   }
5732 #endif
5733 
5734   /* This is the place where threads wait for work */
5735   while (!TCR_4(__kmp_global.g.g_done)) {
5736     KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5737     KMP_MB();
5738 
5739     /* wait for work to do */
5740     KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5741 
5742     /* No tid yet since not part of a team */
5743     __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5744 
5745 #if OMPT_SUPPORT
5746     if (ompt_enabled.enabled) {
5747       this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5748     }
5749 #endif
5750 
5751     pteam = &this_thr->th.th_team;
5752 
5753     /* have we been allocated? */
5754     if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5755       /* we were just woken up, so run our new task */
5756       if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5757         int rc;
5758         KA_TRACE(20,
5759                  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5760                   gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5761                   (*pteam)->t.t_pkfn));
5762 
5763         updateHWFPControl(*pteam);
5764 
5765 #if OMPT_SUPPORT
5766         if (ompt_enabled.enabled) {
5767           this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5768         }
5769 #endif
5770 
5771         rc = (*pteam)->t.t_invoke(gtid);
5772         KMP_ASSERT(rc);
5773 
5774         KMP_MB();
5775         KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5776                       gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5777                       (*pteam)->t.t_pkfn));
5778       }
5779 #if OMPT_SUPPORT
5780       if (ompt_enabled.enabled) {
5781         /* no frame set while outside task */
5782         __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
5783 
5784         this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5785       }
5786 #endif
5787       /* join barrier after parallel region */
5788       __kmp_join_barrier(gtid);
5789     }
5790   }
5791   TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5792 
5793 #if OMPT_SUPPORT
5794   if (ompt_enabled.ompt_callback_thread_end) {
5795     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5796   }
5797 #endif
5798 
5799   this_thr->th.th_task_team = NULL;
5800   /* run the destructors for the threadprivate data for this thread */
5801   __kmp_common_destroy_gtid(gtid);
5802 
5803   KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5804   KMP_MB();
5805   return this_thr;
5806 }
5807 
5808 /* ------------------------------------------------------------------------ */
5809 
5810 void __kmp_internal_end_dest(void *specific_gtid) {
5811 #if KMP_COMPILER_ICC
5812 #pragma warning(push)
5813 #pragma warning(disable : 810) // conversion from "void *" to "int" may lose
5814 // significant bits
5815 #endif
5816   // Make sure no significant bits are lost
5817   int gtid = (kmp_intptr_t)specific_gtid - 1;
5818 #if KMP_COMPILER_ICC
5819 #pragma warning(pop)
5820 #endif
5821 
5822   KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5823   /* NOTE: the gtid is stored as gtid+1 in the thread-local storage,
5824    * because 0 is reserved for the nothing-stored case */
5825 
5826   __kmp_internal_end_thread(gtid);
5827 }
5828 
5829 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5830 
5831 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5832   __kmp_internal_end_atexit();
5833 }
5834 
5835 #endif
5836 
5837 /* [Windows] josh: when the atexit handler is called, there may still be more
5838    than one thread alive */
5839 void __kmp_internal_end_atexit(void) {
5840   KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5841   /* [Windows]
5842      josh: ideally, we want to completely shut down the library in this atexit
5843      handler, but stat code that depends on thread specific data for gtid fails
5844      because that data becomes unavailable at some point during the shutdown, so
5845      we call __kmp_internal_end_thread instead. We should eventually remove the
5846      dependency on __kmp_get_specific_gtid in the stat code and use
5847      __kmp_internal_end_library to cleanly shutdown the library.
5848 
5849      // TODO: Can some of this comment about GVS be removed?
5850      I suspect that the offending stat code is executed when the calling thread
5851      tries to clean up a dead root thread's data structures, resulting in GVS
5852      code trying to close the GVS structures for that thread, but since the stat
5853      code uses __kmp_get_specific_gtid to get the gtid with the assumption that
5854      the calling thread is cleaning up itself instead of another thread, it gets
5855      confused. This happens because allowing a thread to unregister and clean up
5856      another thread is a recent modification for addressing an issue.
5857      Based on the current design (20050722), a thread may end up
5858      trying to unregister another thread only if thread death does not trigger
5859      the calling of __kmp_internal_end_thread.  For Linux* OS, there is the
5860      thread specific data destructor function to detect thread death. For
5861      Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
5862      is nothing.  Thus, the workaround is applicable only for Windows static
5863      stat library. */
5864   __kmp_internal_end_library(-1);
5865 #if KMP_OS_WINDOWS
5866   __kmp_close_console();
5867 #endif
5868 }
5869 
5870 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5871   // It is assumed __kmp_forkjoin_lock is acquired.
5872 
5873   int gtid;
5874 
5875   KMP_DEBUG_ASSERT(thread != NULL);
5876 
5877   gtid = thread->th.th_info.ds.ds_gtid;
5878 
5879   if (!is_root) {
5880     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5881       /* Assume the threads are at the fork barrier here */
5882       KA_TRACE(
5883           20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5884                gtid));
5885       /* Need release fence here to prevent seg faults for tree forkjoin barrier
5886        * (GEH) */
5887       ANNOTATE_HAPPENS_BEFORE(thread);
5888       kmp_flag_64 flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
5889       __kmp_release_64(&flag);
5890     }
5891 
5892     // Terminate OS thread.
5893     __kmp_reap_worker(thread);
5894 
5895     // The thread was killed asynchronously.  If it was actively
5896     // spinning in the thread pool, decrement the global count.
5897     //
5898     // There is a small timing hole here - if the worker thread was just waking
5899     // up after sleeping in the pool, had reset its th_active_in_pool flag but
5900     // not decremented the global counter __kmp_thread_pool_active_nth yet, then
5901     // the global counter might not get updated.
5902     //
5903     // Currently, this can only happen as the library is unloaded,
5904     // so there are no harmful side effects.
5905     if (thread->th.th_active_in_pool) {
5906       thread->th.th_active_in_pool = FALSE;
5907       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
5908       KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
5909     }
5910   }
5911 
5912   __kmp_free_implicit_task(thread);
5913 
5914 // Free the fast memory for tasking
5915 #if USE_FAST_MEMORY
5916   __kmp_free_fast_memory(thread);
5917 #endif /* USE_FAST_MEMORY */
5918 
5919   __kmp_suspend_uninitialize_thread(thread);
5920 
5921   KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
5922   TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5923 
5924   --__kmp_all_nth;
5925 // __kmp_nth was decremented when thread is added to the pool.
5926 
5927 #ifdef KMP_ADJUST_BLOCKTIME
5928   /* Adjust blocktime back to user setting or default if necessary */
5929   /* Middle initialization might never have occurred                */
5930   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5931     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5932     if (__kmp_nth <= __kmp_avail_proc) {
5933       __kmp_zero_bt = FALSE;
5934     }
5935   }
5936 #endif /* KMP_ADJUST_BLOCKTIME */
5937 
5938   /* free the memory being used */
5939   if (__kmp_env_consistency_check) {
5940     if (thread->th.th_cons) {
5941       __kmp_free_cons_stack(thread->th.th_cons);
5942       thread->th.th_cons = NULL;
5943     }
5944   }
5945 
5946   if (thread->th.th_pri_common != NULL) {
5947     __kmp_free(thread->th.th_pri_common);
5948     thread->th.th_pri_common = NULL;
5949   }
5950 
5951   if (thread->th.th_task_state_memo_stack != NULL) {
5952     __kmp_free(thread->th.th_task_state_memo_stack);
5953     thread->th.th_task_state_memo_stack = NULL;
5954   }
5955 
5956 #if KMP_USE_BGET
5957   if (thread->th.th_local.bget_data != NULL) {
5958     __kmp_finalize_bget(thread);
5959   }
5960 #endif
5961 
5962 #if KMP_AFFINITY_SUPPORTED
5963   if (thread->th.th_affin_mask != NULL) {
5964     KMP_CPU_FREE(thread->th.th_affin_mask);
5965     thread->th.th_affin_mask = NULL;
5966   }
5967 #endif /* KMP_AFFINITY_SUPPORTED */
5968 
5969 #if KMP_USE_HIER_SCHED
5970   if (thread->th.th_hier_bar_data != NULL) {
5971     __kmp_free(thread->th.th_hier_bar_data);
5972     thread->th.th_hier_bar_data = NULL;
5973   }
5974 #endif
5975 
5976   __kmp_reap_team(thread->th.th_serial_team);
5977   thread->th.th_serial_team = NULL;
5978   __kmp_free(thread);
5979 
5980   KMP_MB();
5981 
5982 } // __kmp_reap_thread
5983 
5984 static void __kmp_internal_end(void) {
5985   int i;
5986 
5987   /* First, unregister the library */
5988   __kmp_unregister_library();
5989 
5990 #if KMP_OS_WINDOWS
5991   /* In Win static library, we can't tell when a root actually dies, so we
5992      reclaim the data structures for any root threads that have died but not
5993      unregistered themselves, in order to shut down cleanly.
5994      In Win dynamic library we also can't tell when a thread dies.  */
5995   __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
5996 // dead roots
5997 #endif
5998 
5999   for (i = 0; i < __kmp_threads_capacity; i++)
6000     if (__kmp_root[i])
6001       if (__kmp_root[i]->r.r_active)
6002         break;
6003   KMP_MB(); /* Flush all pending memory write invalidates.  */
6004   TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6005 
6006   if (i < __kmp_threads_capacity) {
6007 #if KMP_USE_MONITOR
6008     // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6009     KMP_MB(); /* Flush all pending memory write invalidates.  */
6010 
6011     // Need to check that monitor was initialized before reaping it. If we are
6012     // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6013     // __kmp_monitor will appear to contain valid data, but it is only valid in
6014     // the parent process, not the child.
6015     // New behavior (201008): instead of keying off of the flag
6016     // __kmp_init_parallel, the monitor thread creation is keyed off
6017     // of the new flag __kmp_init_monitor.
6018     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6019     if (TCR_4(__kmp_init_monitor)) {
6020       __kmp_reap_monitor(&__kmp_monitor);
6021       TCW_4(__kmp_init_monitor, 0);
6022     }
6023     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6024     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6025 #endif // KMP_USE_MONITOR
6026   } else {
6027 /* TODO move this to cleanup code */
6028 #ifdef KMP_DEBUG
6029     /* make sure that everything has properly ended */
6030     for (i = 0; i < __kmp_threads_capacity; i++) {
6031       if (__kmp_root[i]) {
6032         //                    KMP_ASSERT( ! KMP_UBER_GTID( i ) );         // AC:
6033         //                    there can be uber threads alive here
6034         KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6035       }
6036     }
6037 #endif
6038 
6039     KMP_MB();
6040 
6041     // Reap the worker threads.
6042     // This is valid for now, but be careful if threads are reaped sooner.
6043     while (__kmp_thread_pool != NULL) { // Loop through all threads in the pool.
6044       // Get the next thread from the pool.
6045       kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6046       __kmp_thread_pool = thread->th.th_next_pool;
6047       // Reap it.
6048       KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6049       thread->th.th_next_pool = NULL;
6050       thread->th.th_in_pool = FALSE;
6051       __kmp_reap_thread(thread, 0);
6052     }
6053     __kmp_thread_pool_insert_pt = NULL;
6054 
6055     // Reap teams.
6056     while (__kmp_team_pool != NULL) { // Loop through all teams in the pool.
6057       // Get the next team from the pool.
6058       kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6059       __kmp_team_pool = team->t.t_next_pool;
6060       // Reap it.
6061       team->t.t_next_pool = NULL;
6062       __kmp_reap_team(team);
6063     }
6064 
6065     __kmp_reap_task_teams();
6066 
6067 #if KMP_OS_UNIX
6068     // Threads that are not reaped should not access any resources since they
6069     // are going to be deallocated soon, so the shutdown sequence should wait
6070     // until all threads either exit the final spin-waiting loop or begin
6071     // sleeping after the given blocktime.
6072     for (i = 0; i < __kmp_threads_capacity; i++) {
6073       kmp_info_t *thr = __kmp_threads[i];
6074       while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6075         KMP_CPU_PAUSE();
6076     }
6077 #endif
6078 
6079     for (i = 0; i < __kmp_threads_capacity; ++i) {
6080       // TBD: Add some checking...
6081       // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6082     }
6083 
6084     /* Make sure all threadprivate destructors get run by joining with all
6085        worker threads before resetting this flag */
6086     TCW_SYNC_4(__kmp_init_common, FALSE);
6087 
6088     KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6089     KMP_MB();
6090 
6091 #if KMP_USE_MONITOR
6092     // See note above: One of the possible fixes for CQ138434 / CQ140126
6093     //
6094     // FIXME: push both code fragments down and CSE them?
6095     // push them into __kmp_cleanup() ?
6096     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6097     if (TCR_4(__kmp_init_monitor)) {
6098       __kmp_reap_monitor(&__kmp_monitor);
6099       TCW_4(__kmp_init_monitor, 0);
6100     }
6101     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6102     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6103 #endif
6104   } /* else !__kmp_global.t_active */
6105   TCW_4(__kmp_init_gtid, FALSE);
6106   KMP_MB(); /* Flush all pending memory write invalidates.  */
6107 
6108   __kmp_cleanup();
6109 #if OMPT_SUPPORT
6110   ompt_fini();
6111 #endif
6112 }
6113 
6114 void __kmp_internal_end_library(int gtid_req) {
6115   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6116   /* this shouldn't be a race condition because __kmp_internal_end() is the
6117      only place to clear __kmp_serial_init */
6118   /* we'll check this later too, after we get the lock */
6119   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6120   // redundant, because the next check will work in any case.
6121   if (__kmp_global.g.g_abort) {
6122     KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6123     /* TODO abort? */
6124     return;
6125   }
6126   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6127     KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6128     return;
6129   }
6130 
6131   KMP_MB(); /* Flush all pending memory write invalidates.  */
6132   /* find out who we are and what we should do */
6133   {
6134     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6135     KA_TRACE(
6136         10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req));
6137     if (gtid == KMP_GTID_SHUTDOWN) {
6138       KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6139                     "already shutdown\n"));
6140       return;
6141     } else if (gtid == KMP_GTID_MONITOR) {
6142       KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6143                     "registered, or system shutdown\n"));
6144       return;
6145     } else if (gtid == KMP_GTID_DNE) {
6146       KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6147                     "shutdown\n"));
6148       /* we don't know who we are, but we may still shut down the library */
6149     } else if (KMP_UBER_GTID(gtid)) {
6150       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6151       if (__kmp_root[gtid]->r.r_active) {
6152         __kmp_global.g.g_abort = -1;
6153         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6154         __kmp_unregister_library();
6155         KA_TRACE(10,
6156                  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6157                   gtid));
6158         return;
6159       } else {
6160         KA_TRACE(
6161             10,
6162             ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6163         __kmp_unregister_root_current_thread(gtid);
6164       }
6165     } else {
6166 /* worker threads may call this function through the atexit handler, if they
6167  * call exit() */
6168 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6169    TODO: do a thorough shutdown instead */
6170 #ifdef DUMP_DEBUG_ON_EXIT
6171       if (__kmp_debug_buf)
6172         __kmp_dump_debug_buffer();
6173 #endif
6174       // An unregister-library call was added here for the shared-memory (Linux)
6175       // path; without it, lots of files would be left behind in /dev/shm.
6176       // Clean up the shared memory file before exiting.
6177       __kmp_unregister_library();
6178       return;
6179     }
6180   }
6181   /* synchronize the termination process */
6182   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6183 
6184   /* have we already finished */
6185   if (__kmp_global.g.g_abort) {
6186     KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6187     /* TODO abort? */
6188     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6189     return;
6190   }
6191   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6192     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6193     return;
6194   }
6195 
6196   /* We need this lock to enforce mutex between this reading of
6197      __kmp_threads_capacity and the writing by __kmp_register_root.
6198      Alternatively, we can use a counter of roots that is atomically updated by
6199      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6200      __kmp_internal_end_*.  */
6201   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6202 
6203   /* now we can safely conduct the actual termination */
6204   __kmp_internal_end();
6205 
6206   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6207   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6208 
6209   KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6210 
6211 #ifdef DUMP_DEBUG_ON_EXIT
6212   if (__kmp_debug_buf)
6213     __kmp_dump_debug_buffer();
6214 #endif
6215 
6216 #if KMP_OS_WINDOWS
6217   __kmp_close_console();
6218 #endif
6219 
6220   __kmp_fini_allocator();
6221 
6222 } // __kmp_internal_end_library
6223 
6224 void __kmp_internal_end_thread(int gtid_req) {
6225   int i;
6226 
6227   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6228   /* this shouldn't be a race condition because __kmp_internal_end() is the
6229    * only place to clear __kmp_serial_init */
6230   /* we'll check this later too, after we get the lock */
6231   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6232   // redundant, because the next check will work in any case.
6233   if (__kmp_global.g.g_abort) {
6234     KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6235     /* TODO abort? */
6236     return;
6237   }
6238   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6239     KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6240     return;
6241   }
6242 
6243   KMP_MB(); /* Flush all pending memory write invalidates.  */
6244 
6245   /* find out who we are and what we should do */
6246   {
6247     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6248     KA_TRACE(10,
6249              ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req));
6250     if (gtid == KMP_GTID_SHUTDOWN) {
6251       KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6252                     "already shutdown\n"));
6253       return;
6254     } else if (gtid == KMP_GTID_MONITOR) {
6255       KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6256                     "registered, or system shutdown\n"));
6257       return;
6258     } else if (gtid == KMP_GTID_DNE) {
6259       KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6260                     "shutdown\n"));
6261       return;
6262       /* we don't know who we are */
6263     } else if (KMP_UBER_GTID(gtid)) {
6264       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6265       if (__kmp_root[gtid]->r.r_active) {
6266         __kmp_global.g.g_abort = -1;
6267         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6268         KA_TRACE(10,
6269                  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6270                   gtid));
6271         return;
6272       } else {
6273         KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6274                       gtid));
6275         __kmp_unregister_root_current_thread(gtid);
6276       }
6277     } else {
6278       /* just a worker thread, let's leave */
6279       KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6280 
6281       if (gtid >= 0) {
6282         __kmp_threads[gtid]->th.th_task_team = NULL;
6283       }
6284 
6285       KA_TRACE(10,
6286                ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6287                 gtid));
6288       return;
6289     }
6290   }
6291 #if KMP_DYNAMIC_LIB
6292   if (__kmp_pause_status != kmp_hard_paused)
6293   // AC: let's not shut down the dynamic library at the exit of an uber thread,
6294   // because it is better to shut down later, in the library destructor.
6295   {
6296     KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6297     return;
6298   }
6299 #endif
6300   /* synchronize the termination process */
6301   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6302 
6303   /* have we already finished */
6304   if (__kmp_global.g.g_abort) {
6305     KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6306     /* TODO abort? */
6307     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6308     return;
6309   }
6310   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6311     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6312     return;
6313   }
6314 
6315   /* We need this lock to enforce mutex between this reading of
6316      __kmp_threads_capacity and the writing by __kmp_register_root.
6317      Alternatively, we can use a counter of roots that is atomically updated by
6318      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6319      __kmp_internal_end_*.  */
6320 
6321   /* should we finish the run-time?  are all siblings done? */
6322   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6323 
6324   for (i = 0; i < __kmp_threads_capacity; ++i) {
6325     if (KMP_UBER_GTID(i)) {
6326       KA_TRACE(
6327           10,
6328           ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6329       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6330       __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6331       return;
6332     }
6333   }
6334 
6335   /* now we can safely conduct the actual termination */
6336 
6337   __kmp_internal_end();
6338 
6339   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6340   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6341 
6342   KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6343 
6344 #ifdef DUMP_DEBUG_ON_EXIT
6345   if (__kmp_debug_buf)
6346     __kmp_dump_debug_buffer();
6347 #endif
6348 } // __kmp_internal_end_thread
6349 
6350 // -----------------------------------------------------------------------------
6351 // Library registration stuff.
6352 
6353 static long __kmp_registration_flag = 0;
6354 // Random value used to indicate library initialization.
6355 static char *__kmp_registration_str = NULL;
6356 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6357 
6358 static inline char *__kmp_reg_status_name() {
6359   /* On RHEL 3u5, if linked statically, getpid() returns different values in
6360      each thread. If registration and unregistration happen in different threads
6361      (omp_misc_other_root_exit.cpp test case), the registered_lib_env env var
6362      cannot be found, because its name will contain a different pid. */
6363   return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6364 } // __kmp_reg_status_name
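// For illustration (hypothetical pid and addresses): for pid 12345 the name
// above is "__KMP_REGISTERED_LIB_12345", and __kmp_register_library_startup()
// below publishes a value of the form "<flag address>-<flag value>-<library>",
// e.g. "0x7f3a2c001040-cafe1234-libomp.so".  A liveness probe can recover the
// two leading fields with the same "%p"/"%lx" conversions used later in this
// file; a compiled-out sketch:
#if 0
static int probe_registration_value(const char *value, long expected_flag) {
  void *addr = NULL;
  long flag = 0;
  // same "<%p>-<%lx>-..." layout as __kmp_registration_str
  if (sscanf(value, "%p-%lx-", &addr, &flag) != 2)
    return 0; // unrecognized format
  return flag == expected_flag; // caller must still verify the mapping
}
#endif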
6365 
6366 void __kmp_register_library_startup(void) {
6367 
6368   char *name = __kmp_reg_status_name(); // Name of the environment variable.
6369   int done = 0;
6370   union {
6371     double dtime;
6372     long ltime;
6373   } time;
6374 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6375   __kmp_initialize_system_tick();
6376 #endif
6377   __kmp_read_system_time(&time.dtime);
6378   __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6379   __kmp_registration_str =
6380       __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6381                        __kmp_registration_flag, KMP_LIBRARY_FILE);
6382 
6383   KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6384                 __kmp_registration_str));
6385 
6386   while (!done) {
6387 
6388     char *value = NULL; // Actual value of the environment variable.
6389 
6390 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is used with the dynamic library
6391     char *shm_name = __kmp_str_format("/%s", name);
6392     int shm_preexist = 0;
6393     char *data1;
6394     int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6395     if ((fd1 == -1) && (errno == EEXIST)) {
6396       // file didn't open because it already exists.
6397       // try opening existing file
6398       fd1 = shm_open(shm_name, O_RDWR, 0666);
6399       if (fd1 == -1) { // file didn't open
6400         // error out here
6401         __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0),
6402                     __kmp_msg_null);
6403       } else {
6404         // able to open existing file
6405         shm_preexist = 1;
6406       }
6407     } else if (fd1 == -1) { // SHM didn't open due to an error other than
6408       // "already exists".
6409       // error out here.
6410       __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM2"), KMP_ERR(errno),
6411                   __kmp_msg_null);
6412     }
6413     if (shm_preexist == 0) {
6414       // we created the SHM; now set its size
6415       if (ftruncate(fd1, SHM_SIZE) == -1) {
6416         // error occurred setting size
6417         __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"),
6418                     KMP_ERR(errno), __kmp_msg_null);
6419       }
6420     }
6421     data1 =
6422         (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
6423     if (data1 == MAP_FAILED) {
6424       // failed to map shared memory
6425       __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno),
6426                   __kmp_msg_null);
6427     }
6428     if (shm_preexist == 0) { // set data to SHM, set value
6429       KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6430     }
6431     // Read value from either what we just wrote or existing file.
6432     value = __kmp_str_format("%s", data1); // read value from SHM
6433     munmap(data1, SHM_SIZE);
6434     close(fd1);
6435 #else // Windows and unix with static library
6436     // Set environment variable, but do not overwrite it if it already exists.
6437     __kmp_env_set(name, __kmp_registration_str, 0);
6438     // read value to see if it got set
6439     value = __kmp_env_get(name);
6440 #endif
6441 
6442     if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6443       done = 1; // Ok, environment variable set successfully, exit the loop.
6444     } else {
6445       // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6446       // Check whether it is alive or dead.
6447       int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6448       char *tail = value;
6449       char *flag_addr_str = NULL;
6450       char *flag_val_str = NULL;
6451       char const *file_name = NULL;
6452       __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6453       __kmp_str_split(tail, '-', &flag_val_str, &tail);
6454       file_name = tail;
6455       if (tail != NULL) {
6456         long *flag_addr = 0;
6457         long flag_val = 0;
6458         KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
6459         KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6460         if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6461           // First, check whether environment-encoded address is mapped into
6462           // addr space.
6463           // If so, dereference it to see if it still has the right value.
6464           if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6465             neighbor = 1;
6466           } else {
6467             // If not, then we know the other copy of the library is no longer
6468             // running.
6469             neighbor = 2;
6470           }
6471         }
6472       }
6473       switch (neighbor) {
6474       case 0: // Cannot parse environment variable -- neighbor status unknown.
6475         // Assume it is the incompatible format of a future version of the
6476         // library, and assume the other library is alive.
6477         // WARN( ... ); // TODO: Issue a warning.
6478         file_name = "unknown library";
6479         KMP_FALLTHROUGH();
6480       // Attention! Falling through to the next case is intentional.
6481       case 1: { // Neighbor is alive.
6482         // Check it is allowed.
6483         char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6484         if (!__kmp_str_match_true(duplicate_ok)) {
6485           // That's not allowed. Issue fatal error.
6486           __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6487                       KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6488         }
6489         KMP_INTERNAL_FREE(duplicate_ok);
6490         __kmp_duplicate_library_ok = 1;
6491         done = 1; // Exit the loop.
6492       } break;
6493       case 2: { // Neighbor is dead.
6494 
6495 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is used with the dynamic library
6496         // close shared memory.
6497         shm_unlink(shm_name); // this removes file in /dev/shm
6498 #else
6499         // Clear the variable and try to register library again.
6500         __kmp_env_unset(name);
6501 #endif
6502       } break;
6503       default: { KMP_DEBUG_ASSERT(0); } break;
6504       }
6505     }
6506     KMP_INTERNAL_FREE((void *)value);
6507 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is used with the dynamic library
6508     KMP_INTERNAL_FREE((void *)shm_name);
6509 #endif
6510   } // while
6511   KMP_INTERNAL_FREE((void *)name);
6512 
6513 } // func __kmp_register_library_startup
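// A compiled-out sketch of the POSIX shared-memory publish-or-read pattern
// used by __kmp_register_library_startup() above (function and parameter
// names here are hypothetical; SHM_SIZE is reused only for symmetry): create
// the segment exclusively, fall back to opening an existing one, size it only
// on first creation, map it, then either write our string or read the
// incumbent's string.
#if 0
#include <cerrno>
#include <cstring>
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

// Returns 1 if we created the segment, 0 if it already existed, -1 on error.
static int publish_or_read(const char *seg, const char *mine, char *out,
                           size_t out_sz) {
  int created = 1;
  int fd = shm_open(seg, O_CREAT | O_EXCL | O_RDWR, 0666);
  if (fd == -1 && errno == EEXIST) { // somebody registered before us
    created = 0;
    fd = shm_open(seg, O_RDWR, 0666);
  }
  if (fd == -1)
    return -1;
  if (created && ftruncate(fd, SHM_SIZE) == -1) { // size only once
    close(fd);
    return -1;
  }
  char *p =
      (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
  if (p == MAP_FAILED) {
    close(fd);
    return -1;
  }
  if (created) {
    strncpy(p, mine, SHM_SIZE - 1); // publish our registration string
    p[SHM_SIZE - 1] = '\0';
  }
  strncpy(out, p, out_sz - 1); // read back whatever the segment holds
  out[out_sz - 1] = '\0';
  munmap(p, SHM_SIZE);
  close(fd);
  return created;
}
#endif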
6514 
6515 void __kmp_unregister_library(void) {
6516 
6517   char *name = __kmp_reg_status_name();
6518   char *value = NULL;
6519 
6520 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is used with the dynamic library
6521   char *shm_name = __kmp_str_format("/%s", name);
6522   int fd1 = shm_open(shm_name, O_RDONLY, 0666);
6523   if (fd1 == -1) {
6524     // file did not open. return.
6525     return;
6526   }
6527   char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6528   if (data1 != MAP_FAILED) {
6529     value = __kmp_str_format("%s", data1); // read value from SHM
6530     munmap(data1, SHM_SIZE);
6531   }
6532   close(fd1);
6533 #else
6534   value = __kmp_env_get(name);
6535 #endif
6536 
6537   KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6538   KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6539   if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6540 //  Ok, this is our variable. Delete it.
6541 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is used with the dynamic library
6542     shm_unlink(shm_name); // this removes file in /dev/shm
6543 #else
6544     __kmp_env_unset(name);
6545 #endif
6546   }
6547 
6548 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is used with the dynamic library
6549   KMP_INTERNAL_FREE(shm_name);
6550 #endif
6551 
6552   KMP_INTERNAL_FREE(__kmp_registration_str);
6553   KMP_INTERNAL_FREE(value);
6554   KMP_INTERNAL_FREE(name);
6555 
6556   __kmp_registration_flag = 0;
6557   __kmp_registration_str = NULL;
6558 
6559 } // __kmp_unregister_library
6560 
6561 // End of Library registration stuff.
6562 // -----------------------------------------------------------------------------
6563 
6564 #if KMP_MIC_SUPPORTED
6565 
6566 static void __kmp_check_mic_type() {
6567   kmp_cpuid_t cpuid_state = {0};
6568   kmp_cpuid_t *cs_p = &cpuid_state;
6569   __kmp_x86_cpuid(1, 0, cs_p);
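  // As we read the masks below (CPUID leaf 1, EAX: stepping in bits 3:0,
  // model in 7:4, family in 11:8, extended model in 19:16): 0xB10 selects
  // family 0x0B, model 1 (KNC-generation parts), and 0x50670 selects family 6
  // with extended model 5 and model 7, i.e. display model 0x57 (KNL).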
6570   // We don't support mic1 at the moment
6571   if ((cs_p->eax & 0xff0) == 0xB10) {
6572     __kmp_mic_type = mic2;
6573   } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6574     __kmp_mic_type = mic3;
6575   } else {
6576     __kmp_mic_type = non_mic;
6577   }
6578 }
6579 
6580 #endif /* KMP_MIC_SUPPORTED */
6581 
6582 static void __kmp_do_serial_initialize(void) {
6583   int i, gtid;
6584   int size;
6585 
6586   KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6587 
6588   KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6589   KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6590   KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6591   KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6592   KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6593 
6594 #if OMPT_SUPPORT
6595   ompt_pre_init();
6596 #endif
6597 
6598   __kmp_validate_locks();
6599 
6600   /* Initialize internal memory allocator */
6601   __kmp_init_allocator();
6602 
6603   /* Register the library startup via an environment variable and check to see
6604      whether another copy of the library is already registered. */
6605 
6606   __kmp_register_library_startup();
6607 
6608   /* TODO reinitialization of library */
6609   if (TCR_4(__kmp_global.g.g_done)) {
6610     KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6611   }
6612 
6613   __kmp_global.g.g_abort = 0;
6614   TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6615 
6616 /* initialize the locks */
6617 #if KMP_USE_ADAPTIVE_LOCKS
6618 #if KMP_DEBUG_ADAPTIVE_LOCKS
6619   __kmp_init_speculative_stats();
6620 #endif
6621 #endif
6622 #if KMP_STATS_ENABLED
6623   __kmp_stats_init();
6624 #endif
6625   __kmp_init_lock(&__kmp_global_lock);
6626   __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6627   __kmp_init_lock(&__kmp_debug_lock);
6628   __kmp_init_atomic_lock(&__kmp_atomic_lock);
6629   __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6630   __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6631   __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6632   __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6633   __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6634   __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6635   __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6636   __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6637   __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6638   __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6639   __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6640   __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6641   __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6642   __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6643 #if KMP_USE_MONITOR
6644   __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6645 #endif
6646   __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6647 
6648   /* conduct initialization and initial setup of configuration */
6649 
6650   __kmp_runtime_initialize();
6651 
6652 #if KMP_MIC_SUPPORTED
6653   __kmp_check_mic_type();
6654 #endif
6655 
6656 // Some global variable initialization moved here from kmp_env_initialize()
6657 #ifdef KMP_DEBUG
6658   kmp_diag = 0;
6659 #endif
6660   __kmp_abort_delay = 0;
6661 
6662   // From __kmp_init_dflt_team_nth()
6663   /* assume the entire machine will be used */
6664   __kmp_dflt_team_nth_ub = __kmp_xproc;
6665   if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6666     __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6667   }
6668   if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6669     __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6670   }
6671   __kmp_max_nth = __kmp_sys_max_nth;
6672   __kmp_cg_max_nth = __kmp_sys_max_nth;
6673   __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6674   if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6675     __kmp_teams_max_nth = __kmp_sys_max_nth;
6676   }
6677 
6678   // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6679   // part
6680   __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6681 #if KMP_USE_MONITOR
6682   __kmp_monitor_wakeups =
6683       KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6684   __kmp_bt_intervals =
6685       KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6686 #endif
6687   // From "KMP_LIBRARY" part of __kmp_env_initialize()
6688   __kmp_library = library_throughput;
6689   // From KMP_SCHEDULE initialization
6690   __kmp_static = kmp_sch_static_balanced;
6691 // AC: do not use analytical here, because it is non-monotonous
6692 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6693 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6694 // need to repeat assignment
6695 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6696 // bit control and barrier method control parts
6697 #if KMP_FAST_REDUCTION_BARRIER
6698 #define kmp_reduction_barrier_gather_bb ((int)1)
6699 #define kmp_reduction_barrier_release_bb ((int)1)
6700 #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6701 #define kmp_reduction_barrier_release_pat bp_hyper_bar
6702 #endif // KMP_FAST_REDUCTION_BARRIER
6703   for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6704     __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6705     __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6706     __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6707     __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6708 #if KMP_FAST_REDUCTION_BARRIER
6709     if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
6710       // lin_64 ): hyper,1
6711       __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6712       __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6713       __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6714       __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6715     }
6716 #endif // KMP_FAST_REDUCTION_BARRIER
6717   }
6718 #if KMP_FAST_REDUCTION_BARRIER
6719 #undef kmp_reduction_barrier_release_pat
6720 #undef kmp_reduction_barrier_gather_pat
6721 #undef kmp_reduction_barrier_release_bb
6722 #undef kmp_reduction_barrier_gather_bb
6723 #endif // KMP_FAST_REDUCTION_BARRIER
6724 #if KMP_MIC_SUPPORTED
6725   if (__kmp_mic_type == mic2) { // KNC
6726     // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6727     __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6728     __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6729         1; // forkjoin release
6730     __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6731     __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6732   }
6733 #if KMP_FAST_REDUCTION_BARRIER
6734   if (__kmp_mic_type == mic2) { // KNC
6735     __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6736     __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6737   }
6738 #endif // KMP_FAST_REDUCTION_BARRIER
6739 #endif // KMP_MIC_SUPPORTED
6740 
6741 // From KMP_CHECKS initialization
6742 #ifdef KMP_DEBUG
6743   __kmp_env_checks = TRUE; /* development versions have the extra checks */
6744 #else
6745   __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6746 #endif
6747 
6748   // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6749   __kmp_foreign_tp = TRUE;
6750 
6751   __kmp_global.g.g_dynamic = FALSE;
6752   __kmp_global.g.g_dynamic_mode = dynamic_default;
6753 
6754   __kmp_env_initialize(NULL);
6755 
6756 // Print all messages in message catalog for testing purposes.
6757 #ifdef KMP_DEBUG
6758   char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6759   if (__kmp_str_match_true(val)) {
6760     kmp_str_buf_t buffer;
6761     __kmp_str_buf_init(&buffer);
6762     __kmp_i18n_dump_catalog(&buffer);
6763     __kmp_printf("%s", buffer.str);
6764     __kmp_str_buf_free(&buffer);
6765   }
6766   __kmp_env_free(&val);
6767 #endif
6768 
6769   __kmp_threads_capacity =
6770       __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6771   // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6772   __kmp_tp_capacity = __kmp_default_tp_capacity(
6773       __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6774 
6775   // If the library is shut down properly, both pools must be NULL. Just in
6776   // case, set them to NULL -- some memory may leak, but subsequent code will
6777   // work even if pools are not freed.
6778   KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6779   KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6780   KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6781   __kmp_thread_pool = NULL;
6782   __kmp_thread_pool_insert_pt = NULL;
6783   __kmp_team_pool = NULL;
6784 
6785   /* Allocate all of the variable sized records */
6786   /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6787    * expandable */
6788   /* Since allocation is cache-aligned, just add extra padding at the end */
6789   size =
6790       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6791       CACHE_LINE;
6792   __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6793   __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6794                                sizeof(kmp_info_t *) * __kmp_threads_capacity);
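  // Illustrative layout of the single cache-aligned block allocated above:
  //   [ kmp_info_t * x __kmp_threads_capacity | kmp_root_t * x __kmp_threads_capacity | CACHE_LINE pad ]
  //     ^ __kmp_threads                         ^ __kmp_root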
6795 
6796   /* init thread counts */
6797   KMP_DEBUG_ASSERT(__kmp_all_nth ==
6798                    0); // Asserts fail if the library is reinitializing and
6799   KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
6800   __kmp_all_nth = 0;
6801   __kmp_nth = 0;
6802 
6803   /* setup the uber master thread and hierarchy */
6804   gtid = __kmp_register_root(TRUE);
6805   KA_TRACE(10, ("__kmp_do_serial_initialize  T#%d\n", gtid));
6806   KMP_ASSERT(KMP_UBER_GTID(gtid));
6807   KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6808 
6809   KMP_MB(); /* Flush all pending memory write invalidates.  */
6810 
6811   __kmp_common_initialize();
6812 
6813 #if KMP_OS_UNIX
6814   /* invoke the child fork handler */
6815   __kmp_register_atfork();
6816 #endif
6817 
6818 #if !KMP_DYNAMIC_LIB
6819   {
6820     /* Invoke the exit handler when the program finishes, only for static
6821        library. For dynamic library, we already have _fini and DllMain. */
6822     int rc = atexit(__kmp_internal_end_atexit);
6823     if (rc != 0) {
6824       __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6825                   __kmp_msg_null);
6826     }
6827   }
6828 #endif
6829 
6830 #if KMP_HANDLE_SIGNALS
6831 #if KMP_OS_UNIX
  /* NOTE: make sure that this is called before the user installs their own
     signal handlers so that the user handlers are called first. This way they
     can return false, not call our handler, avoid terminating the library, and
     continue execution where they left off. */
6836   __kmp_install_signals(FALSE);
6837 #endif /* KMP_OS_UNIX */
6838 #if KMP_OS_WINDOWS
6839   __kmp_install_signals(TRUE);
6840 #endif /* KMP_OS_WINDOWS */
6841 #endif
6842 
6843   /* we have finished the serial initialization */
6844   __kmp_init_counter++;
6845 
6846   __kmp_init_serial = TRUE;
6847 
6848   if (__kmp_settings) {
6849     __kmp_env_print();
6850   }
6851 
6852   if (__kmp_display_env || __kmp_display_env_verbose) {
6853     __kmp_env_print_2();
6854   }
6855 
6856 #if OMPT_SUPPORT
6857   ompt_post_init();
6858 #endif
6859 
6860   KMP_MB();
6861 
6862   KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
6863 }
6864 
6865 void __kmp_serial_initialize(void) {
6866   if (__kmp_init_serial) {
6867     return;
6868   }
6869   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6870   if (__kmp_init_serial) {
6871     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6872     return;
6873   }
6874   __kmp_do_serial_initialize();
6875   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6876 }
6877 
6878 static void __kmp_do_middle_initialize(void) {
6879   int i, j;
6880   int prev_dflt_team_nth;
6881 
6882   if (!__kmp_init_serial) {
6883     __kmp_do_serial_initialize();
6884   }
6885 
6886   KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
6887 
6888   // Save the previous value for the __kmp_dflt_team_nth so that
6889   // we can avoid some reinitialization if it hasn't changed.
6890   prev_dflt_team_nth = __kmp_dflt_team_nth;
6891 
6892 #if KMP_AFFINITY_SUPPORTED
6893   // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6894   // number of cores on the machine.
6895   __kmp_affinity_initialize();
6896 
6897   // Run through the __kmp_threads array and set the affinity mask
6898   // for each root thread that is currently registered with the RTL.
6899   for (i = 0; i < __kmp_threads_capacity; i++) {
6900     if (TCR_PTR(__kmp_threads[i]) != NULL) {
6901       __kmp_affinity_set_init_mask(i, TRUE);
6902     }
6903   }
6904 #endif /* KMP_AFFINITY_SUPPORTED */
6905 
6906   KMP_ASSERT(__kmp_xproc > 0);
6907   if (__kmp_avail_proc == 0) {
6908     __kmp_avail_proc = __kmp_xproc;
6909   }
6910 
6911   // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
6912   // correct them now
6913   j = 0;
6914   while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
6915     __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
6916         __kmp_avail_proc;
6917     j++;
6918   }
6919 
6920   if (__kmp_dflt_team_nth == 0) {
6921 #ifdef KMP_DFLT_NTH_CORES
6922     // Default #threads = #cores
6923     __kmp_dflt_team_nth = __kmp_ncores;
6924     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6925                   "__kmp_ncores (%d)\n",
6926                   __kmp_dflt_team_nth));
6927 #else
6928     // Default #threads = #available OS procs
6929     __kmp_dflt_team_nth = __kmp_avail_proc;
6930     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6931                   "__kmp_avail_proc(%d)\n",
6932                   __kmp_dflt_team_nth));
6933 #endif /* KMP_DFLT_NTH_CORES */
6934   }
6935 
6936   if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
6937     __kmp_dflt_team_nth = KMP_MIN_NTH;
6938   }
6939   if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
6940     __kmp_dflt_team_nth = __kmp_sys_max_nth;
6941   }
6942 
6943   // There's no harm in continuing if the following check fails,
6944   // but it indicates an error in the previous logic.
6945   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
6946 
6947   if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
6948     // Run through the __kmp_threads array and set the num threads icv for each
6949     // root thread that is currently registered with the RTL (which has not
6950     // already explicitly set its nthreads-var with a call to
6951     // omp_set_num_threads()).
6952     for (i = 0; i < __kmp_threads_capacity; i++) {
6953       kmp_info_t *thread = __kmp_threads[i];
6954       if (thread == NULL)
6955         continue;
6956       if (thread->th.th_current_task->td_icvs.nproc != 0)
6957         continue;
6958 
6959       set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
6960     }
6961   }
6962   KA_TRACE(
6963       20,
6964       ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
6965        __kmp_dflt_team_nth));
6966 
6967 #ifdef KMP_ADJUST_BLOCKTIME
6968   /* Adjust blocktime to zero if necessary  now that __kmp_avail_proc is set */
6969   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6970     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6971     if (__kmp_nth > __kmp_avail_proc) {
6972       __kmp_zero_bt = TRUE;
6973     }
6974   }
6975 #endif /* KMP_ADJUST_BLOCKTIME */
6976 
6977   /* we have finished middle initialization */
6978   TCW_SYNC_4(__kmp_init_middle, TRUE);
6979 
6980   KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
6981 }
6982 
6983 void __kmp_middle_initialize(void) {
6984   if (__kmp_init_middle) {
6985     return;
6986   }
6987   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6988   if (__kmp_init_middle) {
6989     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6990     return;
6991   }
6992   __kmp_do_middle_initialize();
6993   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6994 }
6995 
6996 void __kmp_parallel_initialize(void) {
6997   int gtid = __kmp_entry_gtid(); // this might be a new root
6998 
6999   /* synchronize parallel initialization (for sibling) */
7000   if (TCR_4(__kmp_init_parallel))
7001     return;
7002   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7003   if (TCR_4(__kmp_init_parallel)) {
7004     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7005     return;
7006   }
7007 
7008   /* TODO reinitialization after we have already shut down */
7009   if (TCR_4(__kmp_global.g.g_done)) {
7010     KA_TRACE(
7011         10,
7012         ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7013     __kmp_infinite_loop();
7014   }
7015 
  /* jc: The lock __kmp_initz_lock is already held, so calling
     __kmp_middle_initialize (which would reacquire it) would cause a deadlock.
     So we call __kmp_do_middle_initialize directly. */
7019   if (!__kmp_init_middle) {
7020     __kmp_do_middle_initialize();
7021   }
7022   __kmp_resume_if_hard_paused();
7023 
7024   /* begin initialization */
7025   KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7026   KMP_ASSERT(KMP_UBER_GTID(gtid));
7027 
7028 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7029   // Save the FP control regs.
7030   // Worker threads will set theirs to these values at thread startup.
7031   __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7032   __kmp_store_mxcsr(&__kmp_init_mxcsr);
7033   __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7034 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7035 
7036 #if KMP_OS_UNIX
7037 #if KMP_HANDLE_SIGNALS
7038   /*  must be after __kmp_serial_initialize  */
7039   __kmp_install_signals(TRUE);
7040 #endif
7041 #endif
7042 
7043   __kmp_suspend_initialize();
7044 
7045 #if defined(USE_LOAD_BALANCE)
7046   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7047     __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7048   }
7049 #else
7050   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7051     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7052   }
7053 #endif
7054 
7055   if (__kmp_version) {
7056     __kmp_print_version_2();
7057   }
7058 
7059   /* we have finished parallel initialization */
7060   TCW_SYNC_4(__kmp_init_parallel, TRUE);
7061 
7062   KMP_MB();
7063   KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7064 
7065   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7066 }
7067 
7068 /* ------------------------------------------------------------------------ */
7069 
7070 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7071                                    kmp_team_t *team) {
7072   kmp_disp_t *dispatch;
7073 
7074   KMP_MB();
7075 
7076   /* none of the threads have encountered any constructs, yet. */
7077   this_thr->th.th_local.this_construct = 0;
7078 #if KMP_CACHE_MANAGE
7079   KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7080 #endif /* KMP_CACHE_MANAGE */
7081   dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7082   KMP_DEBUG_ASSERT(dispatch);
7083   KMP_DEBUG_ASSERT(team->t.t_dispatch);
7084   // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7085   // this_thr->th.th_info.ds.ds_tid ] );
7086 
7087   dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7088   dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7089   if (__kmp_env_consistency_check)
7090     __kmp_push_parallel(gtid, team->t.t_ident);
7091 
7092   KMP_MB(); /* Flush all pending memory write invalidates.  */
7093 }
7094 
7095 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7096                                   kmp_team_t *team) {
7097   if (__kmp_env_consistency_check)
7098     __kmp_pop_parallel(gtid, team->t.t_ident);
7099 
7100   __kmp_finish_implicit_task(this_thr);
7101 }
7102 
7103 int __kmp_invoke_task_func(int gtid) {
7104   int rc;
7105   int tid = __kmp_tid_from_gtid(gtid);
7106   kmp_info_t *this_thr = __kmp_threads[gtid];
7107   kmp_team_t *team = this_thr->th.th_team;
7108 
7109   __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7110 #if USE_ITT_BUILD
7111   if (__itt_stack_caller_create_ptr) {
7112     __kmp_itt_stack_callee_enter(
7113         (__itt_caller)
7114             team->t.t_stack_id); // inform ittnotify about entering user's code
7115   }
7116 #endif /* USE_ITT_BUILD */
7117 #if INCLUDE_SSC_MARKS
7118   SSC_MARK_INVOKING();
7119 #endif
7120 
7121 #if OMPT_SUPPORT
7122   void *dummy;
7123   void **exit_frame_p;
7124   ompt_data_t *my_task_data;
7125   ompt_data_t *my_parallel_data;
7126   int ompt_team_size;
7127 
7128   if (ompt_enabled.enabled) {
7129     exit_frame_p = &(
7130         team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_frame.ptr);
7131   } else {
7132     exit_frame_p = &dummy;
7133   }
7134 
7135   my_task_data =
7136       &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7137   my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7138   if (ompt_enabled.ompt_callback_implicit_task) {
7139     ompt_team_size = team->t.t_nproc;
7140     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7141         ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7142         __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7143     OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7144   }
7145 #endif
7146 
7147 #if KMP_STATS_ENABLED
7148   stats_state_e previous_state = KMP_GET_THREAD_STATE();
7149   if (previous_state == stats_state_e::TEAMS_REGION) {
7150     KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7151   } else {
7152     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7153   }
7154   KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7155 #endif
7156 
7157   rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7158                               tid, (int)team->t.t_argc, (void **)team->t.t_argv
7159 #if OMPT_SUPPORT
7160                               ,
7161                               exit_frame_p
7162 #endif
7163                               );
7164 #if OMPT_SUPPORT
7165   *exit_frame_p = NULL;
  this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7167 #endif
7168 
7169 #if KMP_STATS_ENABLED
7170   if (previous_state == stats_state_e::TEAMS_REGION) {
7171     KMP_SET_THREAD_STATE(previous_state);
7172   }
7173   KMP_POP_PARTITIONED_TIMER();
7174 #endif
7175 
7176 #if USE_ITT_BUILD
7177   if (__itt_stack_caller_create_ptr) {
7178     __kmp_itt_stack_callee_leave(
7179         (__itt_caller)
7180             team->t.t_stack_id); // inform ittnotify about leaving user's code
7181   }
7182 #endif /* USE_ITT_BUILD */
7183   __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7184 
7185   return rc;
7186 }
7187 
7188 void __kmp_teams_master(int gtid) {
  // This routine is called by all master threads in a teams construct
7190   kmp_info_t *thr = __kmp_threads[gtid];
7191   kmp_team_t *team = thr->th.th_team;
7192   ident_t *loc = team->t.t_ident;
7193   thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7194   KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7195   KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7196   KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7197                 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7198 
7199   // This thread is a new CG root.  Set up the proper variables.
7200   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7201   tmp->cg_root = thr; // Make thr the CG root
7202   // Init to thread limit that was stored when league masters were forked
7203   tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7204   tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7205   KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7206                  " cg_nthreads to 1\n",
7207                  thr, tmp));
7208   tmp->up = thr->th.th_cg_roots;
7209   thr->th.th_cg_roots = tmp;
7210 
// Launch the league of teams now, but do not let workers execute
// (they hang on the fork barrier until the next parallel region)
7213 #if INCLUDE_SSC_MARKS
7214   SSC_MARK_FORKING();
7215 #endif
7216   __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7217                   (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7218                   VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7219 #if INCLUDE_SSC_MARKS
7220   SSC_MARK_JOINING();
7221 #endif
7222   // If the team size was reduced from the limit, set it to the new size
7223   if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7224     thr->th.th_teams_size.nth = thr->th.th_team_nproc;
  // AC: the last parameter "1" eliminates the join barrier, which won't work
  // because worker threads are in a fork barrier waiting for more parallel
  // regions
7227   __kmp_join_call(loc, gtid
7228 #if OMPT_SUPPORT
7229                   ,
7230                   fork_context_intel
7231 #endif
7232                   ,
7233                   1);
7234 }
7235 
7236 int __kmp_invoke_teams_master(int gtid) {
7237   kmp_info_t *this_thr = __kmp_threads[gtid];
7238   kmp_team_t *team = this_thr->th.th_team;
7239 #if KMP_DEBUG
7240   if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7241     KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7242                      (void *)__kmp_teams_master);
7243 #endif
7244   __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7245 #if OMPT_SUPPORT
7246   int tid = __kmp_tid_from_gtid(gtid);
7247   ompt_data_t *task_data =
7248       &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7249   ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7250   if (ompt_enabled.ompt_callback_implicit_task) {
7251     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7252         ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7253         ompt_task_initial);
7254     OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7255   }
7256 #endif
7257   __kmp_teams_master(gtid);
7258 #if OMPT_SUPPORT
7259   this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7260 #endif
7261   __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7262   return 1;
7263 }
7264 
/* This sets the requested number of threads for the next parallel region
   encountered by this team. Since this should be enclosed in the fork/join
   critical section, it should avoid race conditions with asymmetrical nested
   parallelism. */
7269 
7270 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7271   kmp_info_t *thr = __kmp_threads[gtid];
7272 
7273   if (num_threads > 0)
7274     thr->th.th_set_nproc = num_threads;
7275 }
7276 
7277 /* this sets the requested number of teams for the teams region and/or
7278    the number of threads for the next parallel region encountered  */
7279 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7280                           int num_threads) {
7281   kmp_info_t *thr = __kmp_threads[gtid];
7282   KMP_DEBUG_ASSERT(num_teams >= 0);
7283   KMP_DEBUG_ASSERT(num_threads >= 0);
7284 
7285   if (num_teams == 0)
7286     num_teams = 1; // default number of teams is 1.
  if (num_teams > __kmp_teams_max_nth) { // too many teams requested?
7288     if (!__kmp_reserve_warn) {
7289       __kmp_reserve_warn = 1;
7290       __kmp_msg(kmp_ms_warning,
7291                 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7292                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7293     }
7294     num_teams = __kmp_teams_max_nth;
7295   }
7296   // Set number of teams (number of threads in the outer "parallel" of the
7297   // teams)
7298   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7299 
7300   // Remember the number of threads for inner parallel regions
7301   if (!TCR_4(__kmp_init_middle))
7302     __kmp_middle_initialize(); // get internal globals calculated
7303   KMP_DEBUG_ASSERT(__kmp_avail_proc);
7304   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7305   if (num_threads == 0) {
7306     num_threads = __kmp_avail_proc / num_teams;
    // adjust num_threads without a warning since it is not a user setting
    // num_threads = min(num_threads, nthreads-var, thread-limit-var)
    // no thread_limit clause specified - do not change thread-limit-var ICV
7310     if (num_threads > __kmp_dflt_team_nth) {
7311       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7312     }
7313     if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7314       num_threads = thr->th.th_current_task->td_icvs.thread_limit;
    } // prevent the team size from exceeding thread-limit-var
7316     if (num_teams * num_threads > __kmp_teams_max_nth) {
7317       num_threads = __kmp_teams_max_nth / num_teams;
7318     }
7319   } else {
7320     // This thread will be the master of the league masters
7321     // Store new thread limit; old limit is saved in th_cg_roots list
7322     thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7323     // num_threads = min(num_threads, nthreads-var)
7324     if (num_threads > __kmp_dflt_team_nth) {
7325       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7326     }
    if (num_teams * num_threads > __kmp_teams_max_nth) {
      // The request conflicts with KMP_TEAMS_THREAD_LIMIT: too many threads
      int new_threads = __kmp_teams_max_nth / num_teams;
      if (!__kmp_reserve_warn) {
        __kmp_reserve_warn = 1;
7331         __kmp_msg(kmp_ms_warning,
7332                   KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7333                   KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7334       }
7335       num_threads = new_threads;
7336     }
7337   }
7338   thr->th.th_teams_size.nth = num_threads;
7339 }
7340 
7341 // Set the proc_bind var to use in the following parallel region.
7342 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7343   kmp_info_t *thr = __kmp_threads[gtid];
7344   thr->th.th_set_proc_bind = proc_bind;
7345 }
7346 
7347 /* Launch the worker threads into the microtask. */
7348 
7349 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7350   kmp_info_t *this_thr = __kmp_threads[gtid];
7351 
7352 #ifdef KMP_DEBUG
7353   int f;
7354 #endif /* KMP_DEBUG */
7355 
7356   KMP_DEBUG_ASSERT(team);
7357   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7358   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7359   KMP_MB(); /* Flush all pending memory write invalidates.  */
7360 
7361   team->t.t_construct = 0; /* no single directives seen yet */
7362   team->t.t_ordered.dt.t_value =
7363       0; /* thread 0 enters the ordered section first */
7364 
7365   /* Reset the identifiers on the dispatch buffer */
7366   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7367   if (team->t.t_max_nproc > 1) {
7368     int i;
7369     for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7370       team->t.t_disp_buffer[i].buffer_index = i;
7371       team->t.t_disp_buffer[i].doacross_buf_idx = i;
7372     }
7373   } else {
7374     team->t.t_disp_buffer[0].buffer_index = 0;
7375     team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7376   }
7377 
7378   KMP_MB(); /* Flush all pending memory write invalidates.  */
7379   KMP_ASSERT(this_thr->th.th_team == team);
7380 
7381 #ifdef KMP_DEBUG
7382   for (f = 0; f < team->t.t_nproc; f++) {
7383     KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7384                      team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7385   }
7386 #endif /* KMP_DEBUG */
7387 
7388   /* release the worker threads so they may begin working */
7389   __kmp_fork_barrier(gtid, 0);
7390 }
7391 
7392 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7393   kmp_info_t *this_thr = __kmp_threads[gtid];
7394 
7395   KMP_DEBUG_ASSERT(team);
7396   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7397   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7398   KMP_MB(); /* Flush all pending memory write invalidates.  */
7399 
7400 /* Join barrier after fork */
7401 
7402 #ifdef KMP_DEBUG
7403   if (__kmp_threads[gtid] &&
7404       __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7405     __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7406                  __kmp_threads[gtid]);
7407     __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7408                  "team->t.t_nproc=%d\n",
7409                  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7410                  team->t.t_nproc);
7411     __kmp_print_structure();
7412   }
7413   KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7414                    __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7415 #endif /* KMP_DEBUG */
7416 
7417   __kmp_join_barrier(gtid); /* wait for everyone */
7418 #if OMPT_SUPPORT
7419   if (ompt_enabled.enabled &&
7420       this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
7421     int ds_tid = this_thr->th.th_info.ds.ds_tid;
7422     ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7423     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
7424 #if OMPT_OPTIONAL
7425     void *codeptr = NULL;
7426     if (KMP_MASTER_TID(ds_tid) &&
7427         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7428          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7429       codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7430 
7431     if (ompt_enabled.ompt_callback_sync_region_wait) {
7432       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7433           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7434           codeptr);
7435     }
7436     if (ompt_enabled.ompt_callback_sync_region) {
7437       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7438           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7439           codeptr);
7440     }
7441 #endif
7442     if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7443       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7444           ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7445     }
7446   }
7447 #endif
7448 
7449   KMP_MB(); /* Flush all pending memory write invalidates.  */
7450   KMP_ASSERT(this_thr->th.th_team == team);
7451 }
7452 
7453 /* ------------------------------------------------------------------------ */
7454 
7455 #ifdef USE_LOAD_BALANCE
7456 
// Return the number of worker threads actively spinning in the hot team, if we
// are at the outermost level of parallelism.  Otherwise, return 0.
7459 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7460   int i;
7461   int retval;
7462   kmp_team_t *hot_team;
7463 
7464   if (root->r.r_active) {
7465     return 0;
7466   }
7467   hot_team = root->r.r_hot_team;
7468   if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7469     return hot_team->t.t_nproc - 1; // Don't count master thread
7470   }
7471 
7472   // Skip the master thread - it is accounted for elsewhere.
7473   retval = 0;
7474   for (i = 1; i < hot_team->t.t_nproc; i++) {
7475     if (hot_team->t.t_threads[i]->th.th_active) {
7476       retval++;
7477     }
7478   }
7479   return retval;
7480 }
7481 
7482 // Perform an automatic adjustment to the number of
7483 // threads used by the next parallel region.
7484 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7485   int retval;
7486   int pool_active;
7487   int hot_team_active;
7488   int team_curr_active;
7489   int system_active;
7490 
7491   KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7492                 set_nproc));
7493   KMP_DEBUG_ASSERT(root);
7494   KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7495                        ->th.th_current_task->td_icvs.dynamic == TRUE);
7496   KMP_DEBUG_ASSERT(set_nproc > 1);
7497 
7498   if (set_nproc == 1) {
7499     KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7500     return 1;
7501   }
7502 
7503   // Threads that are active in the thread pool, active in the hot team for this
7504   // particular root (if we are at the outer par level), and the currently
7505   // executing thread (to become the master) are available to add to the new
7506   // team, but are currently contributing to the system load, and must be
7507   // accounted for.
7508   pool_active = __kmp_thread_pool_active_nth;
7509   hot_team_active = __kmp_active_hot_team_nproc(root);
7510   team_curr_active = pool_active + hot_team_active + 1;
7511 
7512   // Check the system load.
7513   system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7514   KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7515                 "hot team active = %d\n",
7516                 system_active, pool_active, hot_team_active));
7517 
7518   if (system_active < 0) {
7519     // There was an error reading the necessary info from /proc, so use the
7520     // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7521     // = dynamic_thread_limit, we shouldn't wind up getting back here.
7522     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7523     KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7524 
7525     // Make this call behave like the thread limit algorithm.
7526     retval = __kmp_avail_proc - __kmp_nth +
7527              (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7528     if (retval > set_nproc) {
7529       retval = set_nproc;
7530     }
7531     if (retval < KMP_MIN_NTH) {
7532       retval = KMP_MIN_NTH;
7533     }
7534 
7535     KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7536                   retval));
7537     return retval;
7538   }
7539 
  // There is a slight delay in the load balance algorithm in detecting new
  // running procs. The real system load at this instant should be at least as
  // large as the number of active OMP threads that are available to add to the
  // team.
7543   if (system_active < team_curr_active) {
7544     system_active = team_curr_active;
7545   }
7546   retval = __kmp_avail_proc - system_active + team_curr_active;
7547   if (retval > set_nproc) {
7548     retval = set_nproc;
7549   }
7550   if (retval < KMP_MIN_NTH) {
7551     retval = KMP_MIN_NTH;
7552   }
7553 
7554   KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7555   return retval;
7556 } // __kmp_load_balance_nproc()
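
// Worked example (illustrative numbers): with __kmp_avail_proc == 16,
// pool_active == 3 and hot_team_active == 2 (so team_curr_active == 6), and a
// measured system_active of 18, the code above yields
//   retval = 16 - 18 + 6 = 4,
// which is then clipped to [KMP_MIN_NTH, set_nproc]; a request of
// set_nproc == 8 therefore gets 4 threads for the next parallel region.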
7557 
7558 #endif /* USE_LOAD_BALANCE */
7559 
7560 /* ------------------------------------------------------------------------ */
7561 
7562 /* NOTE: this is called with the __kmp_init_lock held */
7563 void __kmp_cleanup(void) {
7564   int f;
7565 
7566   KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7567 
7568   if (TCR_4(__kmp_init_parallel)) {
7569 #if KMP_HANDLE_SIGNALS
7570     __kmp_remove_signals();
7571 #endif
7572     TCW_4(__kmp_init_parallel, FALSE);
7573   }
7574 
7575   if (TCR_4(__kmp_init_middle)) {
7576 #if KMP_AFFINITY_SUPPORTED
7577     __kmp_affinity_uninitialize();
7578 #endif /* KMP_AFFINITY_SUPPORTED */
7579     __kmp_cleanup_hierarchy();
7580     TCW_4(__kmp_init_middle, FALSE);
7581   }
7582 
7583   KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7584 
7585   if (__kmp_init_serial) {
7586     __kmp_runtime_destroy();
7587     __kmp_init_serial = FALSE;
7588   }
7589 
7590   __kmp_cleanup_threadprivate_caches();
7591 
7592   for (f = 0; f < __kmp_threads_capacity; f++) {
7593     if (__kmp_root[f] != NULL) {
7594       __kmp_free(__kmp_root[f]);
7595       __kmp_root[f] = NULL;
7596     }
7597   }
7598   __kmp_free(__kmp_threads);
  // __kmp_threads and __kmp_root were allocated at once, as a single block, so
  // there is no need to free __kmp_root separately.
7601   __kmp_threads = NULL;
7602   __kmp_root = NULL;
7603   __kmp_threads_capacity = 0;
7604 
7605 #if KMP_USE_DYNAMIC_LOCK
7606   __kmp_cleanup_indirect_user_locks();
7607 #else
7608   __kmp_cleanup_user_locks();
7609 #endif
7610 
7611 #if KMP_AFFINITY_SUPPORTED
7612   KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7613   __kmp_cpuinfo_file = NULL;
7614 #endif /* KMP_AFFINITY_SUPPORTED */
7615 
7616 #if KMP_USE_ADAPTIVE_LOCKS
7617 #if KMP_DEBUG_ADAPTIVE_LOCKS
7618   __kmp_print_speculative_stats();
7619 #endif
7620 #endif
7621   KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7622   __kmp_nested_nth.nth = NULL;
7623   __kmp_nested_nth.size = 0;
7624   __kmp_nested_nth.used = 0;
7625   KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7626   __kmp_nested_proc_bind.bind_types = NULL;
7627   __kmp_nested_proc_bind.size = 0;
7628   __kmp_nested_proc_bind.used = 0;
7629   if (__kmp_affinity_format) {
7630     KMP_INTERNAL_FREE(__kmp_affinity_format);
7631     __kmp_affinity_format = NULL;
7632   }
7633 
7634   __kmp_i18n_catclose();
7635 
7636 #if KMP_USE_HIER_SCHED
7637   __kmp_hier_scheds.deallocate();
7638 #endif
7639 
7640 #if KMP_STATS_ENABLED
7641   __kmp_stats_fini();
7642 #endif
7643 
7644   KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7645 }
7646 
7647 /* ------------------------------------------------------------------------ */
7648 
7649 int __kmp_ignore_mppbeg(void) {
7650   char *env;
7651 
7652   if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7653     if (__kmp_str_match_false(env))
7654       return FALSE;
7655   }
  // By default __kmpc_begin() is a no-op.
7657   return TRUE;
7658 }
7659 
7660 int __kmp_ignore_mppend(void) {
7661   char *env;
7662 
7663   if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7664     if (__kmp_str_match_false(env))
7665       return FALSE;
7666   }
  // By default __kmpc_end() is a no-op.
7668   return TRUE;
7669 }
7670 
7671 void __kmp_internal_begin(void) {
7672   int gtid;
7673   kmp_root_t *root;
7674 
  /* This is a very important step as it registers new sibling threads and
     assigns these new uber threads a new gtid */
7677   gtid = __kmp_entry_gtid();
7678   root = __kmp_threads[gtid]->th.th_root;
7679   KMP_ASSERT(KMP_UBER_GTID(gtid));
7680 
7681   if (root->r.r_begin)
7682     return;
7683   __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7684   if (root->r.r_begin) {
7685     __kmp_release_lock(&root->r.r_begin_lock, gtid);
7686     return;
7687   }
7688 
7689   root->r.r_begin = TRUE;
7690 
7691   __kmp_release_lock(&root->r.r_begin_lock, gtid);
7692 }
7693 
7694 /* ------------------------------------------------------------------------ */
7695 
7696 void __kmp_user_set_library(enum library_type arg) {
7697   int gtid;
7698   kmp_root_t *root;
7699   kmp_info_t *thread;
7700 
7701   /* first, make sure we are initialized so we can get our gtid */
7702 
7703   gtid = __kmp_entry_gtid();
7704   thread = __kmp_threads[gtid];
7705 
7706   root = thread->th.th_root;
7707 
7708   KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7709                 library_serial));
7710   if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7711                                   thread */
7712     KMP_WARNING(SetLibraryIncorrectCall);
7713     return;
7714   }
7715 
7716   switch (arg) {
7717   case library_serial:
7718     thread->th.th_set_nproc = 0;
7719     set__nproc(thread, 1);
7720     break;
7721   case library_turnaround:
7722     thread->th.th_set_nproc = 0;
7723     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7724                                            : __kmp_dflt_team_nth_ub);
7725     break;
7726   case library_throughput:
7727     thread->th.th_set_nproc = 0;
7728     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7729                                            : __kmp_dflt_team_nth_ub);
7730     break;
7731   default:
7732     KMP_FATAL(UnknownLibraryType, arg);
7733   }
7734 
7735   __kmp_aux_set_library(arg);
7736 }
7737 
7738 void __kmp_aux_set_stacksize(size_t arg) {
7739   if (!__kmp_init_serial)
7740     __kmp_serial_initialize();
7741 
7742 #if KMP_OS_DARWIN
7743   if (arg & (0x1000 - 1)) {
7744     arg &= ~(0x1000 - 1);
7745     if (arg + 0x1000) /* check for overflow if we round up */
7746       arg += 0x1000;
7747   }
7748 #endif
7749   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7750 
7751   /* only change the default stacksize before the first parallel region */
7752   if (!TCR_4(__kmp_init_parallel)) {
7753     size_t value = arg; /* argument is in bytes */
7754 
7755     if (value < __kmp_sys_min_stksize)
7756       value = __kmp_sys_min_stksize;
7757     else if (value > KMP_MAX_STKSIZE)
7758       value = KMP_MAX_STKSIZE;
7759 
7760     __kmp_stksize = value;
7761 
    __kmp_env_stksize = TRUE; /* behave as if KMP_STACKSIZE was specified */
7763   }
7764 
7765   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7766 }
7767 
7768 /* set the behaviour of the runtime library */
7769 /* TODO this can cause some odd behaviour with sibling parallelism... */
7770 void __kmp_aux_set_library(enum library_type arg) {
7771   __kmp_library = arg;
7772 
7773   switch (__kmp_library) {
7774   case library_serial: {
7775     KMP_INFORM(LibraryIsSerial);
7776   } break;
7777   case library_turnaround:
7778     if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
7779       __kmp_use_yield = 2; // only yield when oversubscribed
7780     break;
7781   case library_throughput:
7782     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
7783       __kmp_dflt_blocktime = 200;
7784     break;
7785   default:
7786     KMP_FATAL(UnknownLibraryType, arg);
7787   }
7788 }
7789 
7790 /* Getting team information common for all team API */
7791 // Returns NULL if not in teams construct
7792 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
7793   kmp_info_t *thr = __kmp_entry_thread();
7794   teams_serialized = 0;
7795   if (thr->th.th_teams_microtask) {
7796     kmp_team_t *team = thr->th.th_team;
7797     int tlevel = thr->th.th_teams_level; // the level of the teams construct
7798     int ii = team->t.t_level;
7799     teams_serialized = team->t.t_serialized;
7800     int level = tlevel + 1;
7801     KMP_DEBUG_ASSERT(ii >= tlevel);
7802     while (ii > level) {
7803       for (teams_serialized = team->t.t_serialized;
7804            (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
7805       }
7806       if (team->t.t_serialized && (!teams_serialized)) {
7807         team = team->t.t_parent;
7808         continue;
7809       }
7810       if (ii > level) {
7811         team = team->t.t_parent;
7812         ii--;
7813       }
7814     }
7815     return team;
7816   }
7817   return NULL;
7818 }
7819 
7820 int __kmp_aux_get_team_num() {
7821   int serialized;
7822   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7823   if (team) {
7824     if (serialized > 1) {
      return 0; // teams region is serialized (1 team of 1 thread).
7826     } else {
7827       return team->t.t_master_tid;
7828     }
7829   }
7830   return 0;
7831 }
7832 
7833 int __kmp_aux_get_num_teams() {
7834   int serialized;
7835   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7836   if (team) {
7837     if (serialized > 1) {
7838       return 1;
7839     } else {
7840       return team->t.t_parent->t.t_nproc;
7841     }
7842   }
7843   return 1;
7844 }
7845 
7846 /* ------------------------------------------------------------------------ */
7847 
7848 /*
7849  * Affinity Format Parser
7850  *
7851  * Field is in form of: %[[[0].]size]type
7852  * % and type are required (%% means print a literal '%')
7853  * type is either single char or long name surrounded by {},
7854  * e.g., N or {num_threads}
7855  * 0 => leading zeros
7856  * . => right justified when size is specified
7857  * by default output is left justified
7858  * size is the *minimum* field length
7859  * All other characters are printed as is
7860  *
7861  * Available field types:
 * t {team_num}          - omp_get_team_num()
 * T {num_teams}         - omp_get_num_teams()
 * L {nesting_level}     - omp_get_level()
 * n {thread_num}        - omp_get_thread_num()
 * N {num_threads}       - omp_get_num_threads()
 * a {ancestor_tnum}     - omp_get_ancestor_thread_num(omp_get_level()-1)
 * H {host}              - name of host machine
 * P {process_id}        - process id (integer)
 * i {native_thread_id}  - native thread identifier (integer)
 * A {thread_affinity}   - comma separated list of integers or integer ranges
 *                         (values of affinity mask)
7871  *
7872  * Implementation-specific field types can be added
7873  * If a type is unknown, print "undefined"
7874 */
7875 
7876 // Structure holding the short name, long name, and corresponding data type
7877 // for snprintf.  A table of these will represent the entire valid keyword
7878 // field types.
7879 typedef struct kmp_affinity_format_field_t {
7880   char short_name; // from spec e.g., L -> thread level
7881   const char *long_name; // from spec thread_level -> thread level
7882   char field_format; // data type for snprintf (typically 'd' or 's'
7883   // for integer or string)
7884 } kmp_affinity_format_field_t;
7885 
7886 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
7887 #if KMP_AFFINITY_SUPPORTED
7888     {'A', "thread_affinity", 's'},
7889 #endif
7890     {'t', "team_num", 'd'},
7891     {'T', "num_teams", 'd'},
7892     {'L', "nesting_level", 'd'},
7893     {'n', "thread_num", 'd'},
7894     {'N', "num_threads", 'd'},
7895     {'a', "ancestor_tnum", 'd'},
7896     {'H', "host", 's'},
7897     {'P', "process_id", 'd'},
7898     {'i', "native_thread_id", 'd'}};
7899 
// Return the number of characters it takes to hold the field
7901 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
7902                                             const char **ptr,
7903                                             kmp_str_buf_t *field_buffer) {
7904   int rc, format_index, field_value;
7905   const char *width_left, *width_right;
7906   bool pad_zeros, right_justify, parse_long_name, found_valid_name;
7907   static const int FORMAT_SIZE = 20;
7908   char format[FORMAT_SIZE] = {0};
7909   char absolute_short_name = 0;
7910 
7911   KMP_DEBUG_ASSERT(gtid >= 0);
7912   KMP_DEBUG_ASSERT(th);
7913   KMP_DEBUG_ASSERT(**ptr == '%');
7914   KMP_DEBUG_ASSERT(field_buffer);
7915 
7916   __kmp_str_buf_clear(field_buffer);
7917 
7918   // Skip the initial %
7919   (*ptr)++;
7920 
7921   // Check for %% first
7922   if (**ptr == '%') {
7923     __kmp_str_buf_cat(field_buffer, "%", 1);
7924     (*ptr)++; // skip over the second %
7925     return 1;
7926   }
7927 
7928   // Parse field modifiers if they are present
7929   pad_zeros = false;
7930   if (**ptr == '0') {
7931     pad_zeros = true;
7932     (*ptr)++; // skip over 0
7933   }
7934   right_justify = false;
7935   if (**ptr == '.') {
7936     right_justify = true;
7937     (*ptr)++; // skip over .
7938   }
7939   // Parse width of field: [width_left, width_right)
7940   width_left = width_right = NULL;
7941   if (**ptr >= '0' && **ptr <= '9') {
7942     width_left = *ptr;
7943     SKIP_DIGITS(*ptr);
7944     width_right = *ptr;
7945   }
7946 
7947   // Create the format for KMP_SNPRINTF based on flags parsed above
7948   format_index = 0;
7949   format[format_index++] = '%';
7950   if (!right_justify)
7951     format[format_index++] = '-';
7952   if (pad_zeros)
7953     format[format_index++] = '0';
7954   if (width_left && width_right) {
7955     int i = 0;
    // Only allow 8-digit number widths;
    // this also prevents overflowing the format variable
7958     while (i < 8 && width_left < width_right) {
7959       format[format_index++] = *width_left;
7960       width_left++;
7961       i++;
7962     }
7963   }
7964 
7965   // Parse a name (long or short)
7966   // Canonicalize the name into absolute_short_name
7967   found_valid_name = false;
7968   parse_long_name = (**ptr == '{');
7969   if (parse_long_name)
7970     (*ptr)++; // skip initial left brace
7971   for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
7972                              sizeof(__kmp_affinity_format_table[0]);
7973        ++i) {
7974     char short_name = __kmp_affinity_format_table[i].short_name;
7975     const char *long_name = __kmp_affinity_format_table[i].long_name;
7976     char field_format = __kmp_affinity_format_table[i].field_format;
7977     if (parse_long_name) {
7978       int length = KMP_STRLEN(long_name);
7979       if (strncmp(*ptr, long_name, length) == 0) {
7980         found_valid_name = true;
7981         (*ptr) += length; // skip the long name
7982       }
7983     } else if (**ptr == short_name) {
7984       found_valid_name = true;
7985       (*ptr)++; // skip the short name
7986     }
7987     if (found_valid_name) {
7988       format[format_index++] = field_format;
7989       format[format_index++] = '\0';
7990       absolute_short_name = short_name;
7991       break;
7992     }
7993   }
7994   if (parse_long_name) {
7995     if (**ptr != '}') {
7996       absolute_short_name = 0;
7997     } else {
7998       (*ptr)++; // skip over the right brace
7999     }
8000   }
8001 
8002   // Attempt to fill the buffer with the requested
8003   // value using snprintf within __kmp_str_buf_print()
8004   switch (absolute_short_name) {
8005   case 't':
8006     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8007     break;
8008   case 'T':
8009     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8010     break;
8011   case 'L':
8012     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8013     break;
8014   case 'n':
8015     rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8016     break;
8017   case 'H': {
8018     static const int BUFFER_SIZE = 256;
8019     char buf[BUFFER_SIZE];
8020     __kmp_expand_host_name(buf, BUFFER_SIZE);
8021     rc = __kmp_str_buf_print(field_buffer, format, buf);
8022   } break;
8023   case 'P':
8024     rc = __kmp_str_buf_print(field_buffer, format, getpid());
8025     break;
8026   case 'i':
8027     rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8028     break;
8029   case 'N':
8030     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8031     break;
8032   case 'a':
8033     field_value =
8034         __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8035     rc = __kmp_str_buf_print(field_buffer, format, field_value);
8036     break;
8037 #if KMP_AFFINITY_SUPPORTED
8038   case 'A': {
8039     kmp_str_buf_t buf;
8040     __kmp_str_buf_init(&buf);
8041     __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8042     rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8043     __kmp_str_buf_free(&buf);
8044   } break;
8045 #endif
8046   default:
    // According to the spec, if an implementation does not have info for the
    // field type, then "undefined" is printed
8049     rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8050     // Skip the field
8051     if (parse_long_name) {
8052       SKIP_TOKEN(*ptr);
8053       if (**ptr == '}')
8054         (*ptr)++;
8055     } else {
8056       (*ptr)++;
8057     }
8058   }
8059 
8060   KMP_ASSERT(format_index <= FORMAT_SIZE);
8061   return rc;
8062 }
8063 
8064 /*
8065  * Return number of characters needed to hold the affinity string
8066  * (not including null byte character)
8067  * The resultant string is printed to buffer, which the caller can then
8068  * handle afterwards
8069 */
8070 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8071                                   kmp_str_buf_t *buffer) {
8072   const char *parse_ptr;
8073   size_t retval;
8074   const kmp_info_t *th;
8075   kmp_str_buf_t field;
8076 
8077   KMP_DEBUG_ASSERT(buffer);
8078   KMP_DEBUG_ASSERT(gtid >= 0);
8079 
8080   __kmp_str_buf_init(&field);
8081   __kmp_str_buf_clear(buffer);
8082 
8083   th = __kmp_threads[gtid];
8084   retval = 0;
8085 
  // If format is NULL or a zero-length string, then we use the
  // affinity-format-var ICV
8088   parse_ptr = format;
8089   if (parse_ptr == NULL || *parse_ptr == '\0') {
8090     parse_ptr = __kmp_affinity_format;
8091   }
8092   KMP_DEBUG_ASSERT(parse_ptr);
8093 
8094   while (*parse_ptr != '\0') {
8095     // Parse a field
8096     if (*parse_ptr == '%') {
8097       // Put field in the buffer
8098       int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8099       __kmp_str_buf_catbuf(buffer, &field);
8100       retval += rc;
8101     } else {
8102       // Put literal character in buffer
8103       __kmp_str_buf_cat(buffer, parse_ptr, 1);
8104       retval++;
8105       parse_ptr++;
8106     }
8107   }
8108   __kmp_str_buf_free(&field);
8109   return retval;
8110 }
8111 
8112 // Displays the affinity string to stdout
8113 void __kmp_aux_display_affinity(int gtid, const char *format) {
8114   kmp_str_buf_t buf;
8115   __kmp_str_buf_init(&buf);
8116   __kmp_aux_capture_affinity(gtid, format, &buf);
8117   __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8118   __kmp_str_buf_free(&buf);
8119 }
8120 
8121 /* ------------------------------------------------------------------------ */
8122 
8123 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8124   int blocktime = arg; /* argument is in milliseconds */
8125 #if KMP_USE_MONITOR
8126   int bt_intervals;
8127 #endif
8128   int bt_set;
8129 
8130   __kmp_save_internal_controls(thread);
8131 
8132   /* Normalize and set blocktime for the teams */
8133   if (blocktime < KMP_MIN_BLOCKTIME)
8134     blocktime = KMP_MIN_BLOCKTIME;
8135   else if (blocktime > KMP_MAX_BLOCKTIME)
8136     blocktime = KMP_MAX_BLOCKTIME;
8137 
8138   set__blocktime_team(thread->th.th_team, tid, blocktime);
8139   set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8140 
8141 #if KMP_USE_MONITOR
8142   /* Calculate and set blocktime intervals for the teams */
8143   bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8144 
8145   set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8146   set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8147 #endif
8148 
  /* Record that the blocktime was explicitly set */
8150   bt_set = TRUE;
8151 
8152   set__bt_set_team(thread->th.th_team, tid, bt_set);
8153   set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8154 #if KMP_USE_MONITOR
8155   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8156                 "bt_intervals=%d, monitor_updates=%d\n",
8157                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8158                 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8159                 __kmp_monitor_wakeups));
8160 #else
8161   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8162                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8163                 thread->th.th_team->t.t_id, tid, blocktime));
8164 #endif
8165 }
8166 
8167 void __kmp_aux_set_defaults(char const *str, int len) {
8168   if (!__kmp_init_serial) {
8169     __kmp_serial_initialize();
8170   }
8171   __kmp_env_initialize(str);
8172 
8173   if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8174     __kmp_env_print();
8175   }
8176 } // __kmp_aux_set_defaults
8177 
8178 /* ------------------------------------------------------------------------ */
8179 /* internal fast reduction routines */
8180 
8181 PACKED_REDUCTION_METHOD_T
8182 __kmp_determine_reduction_method(
8183     ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8184     void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8185     kmp_critical_name *lck) {
8186 
  // Default reduction method: critical construct (lck != NULL, like in the
  // current PAROPT).
  // If (reduce_data != NULL && reduce_func != NULL): the tree-reduction method
  // can be selected by the RTL.
  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
  // can be selected by the RTL.
  // Finally, it's up to the OpenMP RTL to decide which of the methods
  // generated by PAROPT to select.
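  //
  // Illustrative outcome (a sketch, not a guarantee): on an x86_64 Linux
  // (non-MIC) build, teamsize_cutoff is 4 below, so a team of 16 threads for
  // which the compiler supplied both reduce_data and reduce_func is expected
  // to get TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER, while a team of 2 threads
  // with an atomic variant available would get atomic_reduce_block (unless
  // KMP_FORCE_REDUCTION overrides the choice further below).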
8195 
8196   PACKED_REDUCTION_METHOD_T retval;
8197 
8198   int team_size;
8199 
8200   KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
8201   KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8202 
8203 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
8204   ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
8205 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8206 
8207   retval = critical_reduce_block;
8208 
  // another way of getting the team size (one dynamic dereference) is slower
8210   team_size = __kmp_get_team_num_threads(global_tid);
8211   if (team_size == 1) {
8212 
8213     retval = empty_reduce_block;
8214 
8215   } else {
8216 
8217     int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8218 
8219 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 ||                   \
8220     KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64
8221 
8222 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
8223     KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8224 
8225     int teamsize_cutoff = 4;
8226 
8227 #if KMP_MIC_SUPPORTED
8228     if (__kmp_mic_type != non_mic) {
8229       teamsize_cutoff = 8;
8230     }
8231 #endif
8232     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8233     if (tree_available) {
8234       if (team_size <= teamsize_cutoff) {
8235         if (atomic_available) {
8236           retval = atomic_reduce_block;
8237         }
8238       } else {
8239         retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8240       }
8241     } else if (atomic_available) {
8242       retval = atomic_reduce_block;
8243     }
8244 #else
8245 #error "Unknown or unsupported OS"
8246 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8247        // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8248 
8249 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8250 
8251 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD
8252 
8253     // basic tuning
8254 
8255     if (atomic_available) {
8256       if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8257         retval = atomic_reduce_block;
8258       }
8259     } // otherwise: use critical section
8260 
8261 #elif KMP_OS_DARWIN
8262 
8263     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8264     if (atomic_available && (num_vars <= 3)) {
8265       retval = atomic_reduce_block;
8266     } else if (tree_available) {
8267       if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8268           (reduce_size < (2000 * sizeof(kmp_real64)))) {
8269         retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8270       }
8271     } // otherwise: use critical section
8272 
8273 #else
8274 #error "Unknown or unsupported OS"
8275 #endif
8276 
8277 #else
8278 #error "Unknown or unsupported architecture"
8279 #endif
8280   }
8281 
8282   // KMP_FORCE_REDUCTION
8283 
8284   // If the team is serialized (team_size == 1), ignore the forced reduction
8285   // method and stay with the unsynchronized method (empty_reduce_block)
8286   if (__kmp_force_reduction_method != reduction_method_not_defined &&
8287       team_size != 1) {
8288 
8289     PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8290 
8291     int atomic_available, tree_available;
8292 
8293     switch ((forced_retval = __kmp_force_reduction_method)) {
8294     case critical_reduce_block:
8295       KMP_ASSERT(lck); // lck should be != 0
8296       break;
8297 
8298     case atomic_reduce_block:
8299       atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8300       if (!atomic_available) {
8301         KMP_WARNING(RedMethodNotSupported, "atomic");
8302         forced_retval = critical_reduce_block;
8303       }
8304       break;
8305 
8306     case tree_reduce_block:
8307       tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8308       if (!tree_available) {
8309         KMP_WARNING(RedMethodNotSupported, "tree");
8310         forced_retval = critical_reduce_block;
8311       } else {
8312 #if KMP_FAST_REDUCTION_BARRIER
8313         forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8314 #endif
8315       }
8316       break;
8317 
8318     default:
8319       KMP_ASSERT(0); // "unsupported method specified"
8320     }
8321 
8322     retval = forced_retval;
8323   }
8324 
8325   KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8326 
8327 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
8328 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8329 
8330   return (retval);
8331 }
// This function is for testing the set/get/determine reduce method path
8333 kmp_int32 __kmp_get_reduce_method(void) {
8334   return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8335 }
8336 
8337 // Soft pause sets up threads to ignore blocktime and just go to sleep.
8338 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
8339 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8340 
8341 // Hard pause shuts down the runtime completely.  Resume happens naturally when
8342 // OpenMP is used subsequently.
8343 void __kmp_hard_pause() {
8344   __kmp_pause_status = kmp_hard_paused;
8345   __kmp_internal_end_thread(-1);
8346 }
8347 
// Soft resume clears __kmp_pause_status (back to kmp_not_paused) and wakes up
// all threads.
8349 void __kmp_resume_if_soft_paused() {
8350   if (__kmp_pause_status == kmp_soft_paused) {
8351     __kmp_pause_status = kmp_not_paused;
8352 
8353     for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8354       kmp_info_t *thread = __kmp_threads[gtid];
8355       if (thread) { // Wake it if sleeping
8356         kmp_flag_64 fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
8357         if (fl.is_sleeping())
8358           fl.resume(gtid);
8359         else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8360           __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8361         } else { // thread holds the lock and may sleep soon
8362           do { // until either the thread sleeps, or we can get the lock
8363             if (fl.is_sleeping()) {
8364               fl.resume(gtid);
8365               break;
8366             } else if (__kmp_try_suspend_mx(thread)) {
8367               __kmp_unlock_suspend_mx(thread);
8368               break;
8369             }
8370           } while (1);
8371         }
8372       }
8373     }
8374   }
8375 }
8376 
8377 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
8378 // TODO: add warning messages
8379 int __kmp_pause_resource(kmp_pause_status_t level) {
8380   if (level == kmp_not_paused) { // requesting resume
8381     if (__kmp_pause_status == kmp_not_paused) {
8382       // error message about runtime not being paused, so can't resume
8383       return 1;
8384     } else {
8385       KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
8386                        __kmp_pause_status == kmp_hard_paused);
8387       __kmp_pause_status = kmp_not_paused;
8388       return 0;
8389     }
8390   } else if (level == kmp_soft_paused) { // requesting soft pause
8391     if (__kmp_pause_status != kmp_not_paused) {
8392       // error message about already being paused
8393       return 1;
8394     } else {
8395       __kmp_soft_pause();
8396       return 0;
8397     }
8398   } else if (level == kmp_hard_paused) { // requesting hard pause
8399     if (__kmp_pause_status != kmp_not_paused) {
8400       // error message about already being paused
8401       return 1;
8402     } else {
8403       __kmp_hard_pause();
8404       return 0;
8405     }
8406   } else {
8407     // error message about invalid level
8408     return 1;
8409   }
8410 }
8411 
8412 
8413 void __kmp_omp_display_env(int verbose) {
8414   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8415   if (__kmp_init_serial == 0)
8416     __kmp_do_serial_initialize();
8417   __kmp_display_env_impl(!verbose, verbose);
8418   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8419 }
8420