1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #if KMP_USE_HIER_SCHED
28 #include "kmp_dispatch_hier.h"
29 #endif
30 
31 #if OMPT_SUPPORT
32 #include "ompt-specific.h"
33 #endif
34 
35 /* these are temporary issues to be dealt with */
36 #define KMP_USE_PRCTL 0
37 
38 #if KMP_OS_WINDOWS
39 #include <process.h>
40 #endif
41 
42 #include "tsan_annotations.h"
43 
44 #if defined(KMP_GOMP_COMPAT)
45 char const __kmp_version_alt_comp[] =
46     KMP_VERSION_PREFIX "alternative compiler support: yes";
47 #endif /* defined(KMP_GOMP_COMPAT) */
48 
49 char const __kmp_version_omp_api[] = KMP_VERSION_PREFIX "API version: "
50 #if OMP_50_ENABLED
51                                                         "5.0 (201611)";
52 #elif OMP_45_ENABLED
53                                                         "4.5 (201511)";
54 #elif OMP_40_ENABLED
55                                                         "4.0 (201307)";
56 #else
57                                                         "3.1 (201107)";
58 #endif
59 
60 #ifdef KMP_DEBUG
61 char const __kmp_version_lock[] =
62     KMP_VERSION_PREFIX "lock type: run time selectable";
63 #endif /* KMP_DEBUG */
64 
65 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
66 
67 /* ------------------------------------------------------------------------ */
68 
69 #if KMP_USE_MONITOR
70 kmp_info_t __kmp_monitor;
71 #endif
72 
73 /* Forward declarations */
74 
75 void __kmp_cleanup(void);
76 
77 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
78                                   int gtid);
79 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
80                                   kmp_internal_control_t *new_icvs,
81                                   ident_t *loc);
82 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
83 static void __kmp_partition_places(kmp_team_t *team,
84                                    int update_master_only = 0);
85 #endif
86 static void __kmp_do_serial_initialize(void);
87 void __kmp_fork_barrier(int gtid, int tid);
88 void __kmp_join_barrier(int gtid);
89 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
90                           kmp_internal_control_t *new_icvs, ident_t *loc);
91 
92 #ifdef USE_LOAD_BALANCE
93 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
94 #endif
95 
96 static int __kmp_expand_threads(int nNeed);
97 #if KMP_OS_WINDOWS
98 static int __kmp_unregister_root_other_thread(int gtid);
99 #endif
100 static void __kmp_unregister_library(void); // called by __kmp_internal_end()
101 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
102 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
103 
/* Calculate the identifier of the current thread */
/* Fast (and somewhat portable) way to get a unique identifier for the
   executing thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
107 int __kmp_get_global_thread_id() {
108   int i;
109   kmp_info_t **other_threads;
110   size_t stack_data;
111   char *stack_addr;
112   size_t stack_size;
113   char *stack_base;
114 
115   KA_TRACE(
116       1000,
117       ("*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
118        __kmp_nth, __kmp_all_nth));
119 
  /* JPH - To handle the case where __kmpc_end(0) is called immediately prior
     to a parallel region, this returns KMP_GTID_DNE to force serial_initialize
     by the caller. Every call site must handle KMP_GTID_DNE, or else
     __kmp_init_gtid must be guaranteed before this is called. */
124 
125   if (!TCR_4(__kmp_init_gtid))
126     return KMP_GTID_DNE;
127 
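  /* __kmp_gtid_mode selects how the gtid is looked up: >= 3 reads the native
     TLS variable (TDATA), >= 2 uses OS keyed TLS, and lower modes fall back to
     the stack-address search below. */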
128 #ifdef KMP_TDATA_GTID
129   if (TCR_4(__kmp_gtid_mode) >= 3) {
130     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
131     return __kmp_gtid;
132   }
133 #endif
134   if (TCR_4(__kmp_gtid_mode) >= 2) {
135     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
136     return __kmp_gtid_get_specific();
137   }
138   KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
139 
140   stack_addr = (char *)&stack_data;
141   other_threads = __kmp_threads;
142 
143   /* ATT: The code below is a source of potential bugs due to unsynchronized
144      access to __kmp_threads array. For example:
145      1. Current thread loads other_threads[i] to thr and checks it, it is
146         non-NULL.
147      2. Current thread is suspended by OS.
148      3. Another thread unregisters and finishes (debug versions of free()
149         may fill memory with something like 0xEF).
150      4. Current thread is resumed.
151      5. Current thread reads junk from *thr.
152      TODO: Fix it.  --ln  */
153 
154   for (i = 0; i < __kmp_threads_capacity; i++) {
155 
156     kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
157     if (!thr)
158       continue;
159 
160     stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
161     stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
162 
163     /* stack grows down -- search through all of the active threads */
164 
165     if (stack_addr <= stack_base) {
166       size_t stack_diff = stack_base - stack_addr;
167 
168       if (stack_diff <= stack_size) {
169         /* The only way we can be closer than the allocated */
170         /* stack size is if we are running on this thread. */
171         KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
172         return i;
173       }
174     }
175   }
176 
  /* fall back to keyed TLS (get_specific) to try to determine our gtid */
178   KA_TRACE(1000,
179            ("*** __kmp_get_global_thread_id: internal alg. failed to find "
180             "thread, using TLS\n"));
181   i = __kmp_gtid_get_specific();
182 
183   /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */
184 
  /* if we haven't been assigned a gtid, return the (negative) code */
186   if (i < 0)
187     return i;
188 
189   /* dynamically updated stack window for uber threads to avoid get_specific
190      call */
191   if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
192     KMP_FATAL(StackOverflow, i);
193   }
194 
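  /* Grow the recorded stack window for this uber thread so that future
     stack-address searches can find it: raise the recorded base if the current
     address is above it, otherwise extend the recorded size down to reach the
     current address. */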
195   stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
196   if (stack_addr > stack_base) {
197     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
198     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
199             other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
200                 stack_base);
201   } else {
202     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
203             stack_base - stack_addr);
204   }
205 
206   /* Reprint stack bounds for ubermaster since they have been refined */
207   if (__kmp_storage_map) {
208     char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
209     char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
210     __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
211                                  other_threads[i]->th.th_info.ds.ds_stacksize,
212                                  "th_%d stack (refinement)", i);
213   }
214   return i;
215 }
216 
217 int __kmp_get_global_thread_id_reg() {
218   int gtid;
219 
220   if (!__kmp_init_serial) {
221     gtid = KMP_GTID_DNE;
222   } else
223 #ifdef KMP_TDATA_GTID
224       if (TCR_4(__kmp_gtid_mode) >= 3) {
225     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
226     gtid = __kmp_gtid;
227   } else
228 #endif
229       if (TCR_4(__kmp_gtid_mode) >= 2) {
230     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
231     gtid = __kmp_gtid_get_specific();
232   } else {
233     KA_TRACE(1000,
234              ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
235     gtid = __kmp_get_global_thread_id();
236   }
237 
238   /* we must be a new uber master sibling thread */
239   if (gtid == KMP_GTID_DNE) {
240     KA_TRACE(10,
241              ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
242               "Registering a new gtid.\n"));
243     __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
244     if (!__kmp_init_serial) {
245       __kmp_do_serial_initialize();
246       gtid = __kmp_gtid_get_specific();
247     } else {
248       gtid = __kmp_register_root(FALSE);
249     }
250     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
251     /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
252   }
253 
254   KMP_DEBUG_ASSERT(gtid >= 0);
255 
256   return gtid;
257 }
258 
259 /* caller must hold forkjoin_lock */
260 void __kmp_check_stack_overlap(kmp_info_t *th) {
261   int f;
262   char *stack_beg = NULL;
263   char *stack_end = NULL;
264   int gtid;
265 
266   KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
267   if (__kmp_storage_map) {
268     stack_end = (char *)th->th.th_info.ds.ds_stackbase;
269     stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
270 
271     gtid = __kmp_gtid_from_thread(th);
272 
273     if (gtid == KMP_GTID_MONITOR) {
274       __kmp_print_storage_map_gtid(
275           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
276           "th_%s stack (%s)", "mon",
277           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
278     } else {
279       __kmp_print_storage_map_gtid(
280           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
281           "th_%d stack (%s)", gtid,
282           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
283     }
284   }
285 
286   /* No point in checking ubermaster threads since they use refinement and
287    * cannot overlap */
288   gtid = __kmp_gtid_from_thread(th);
289   if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
290     KA_TRACE(10,
291              ("__kmp_check_stack_overlap: performing extensive checking\n"));
292     if (stack_beg == NULL) {
293       stack_end = (char *)th->th.th_info.ds.ds_stackbase;
294       stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
295     }
296 
297     for (f = 0; f < __kmp_threads_capacity; f++) {
298       kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
299 
300       if (f_th && f_th != th) {
301         char *other_stack_end =
302             (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
303         char *other_stack_beg =
304             other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
305         if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
306             (stack_end > other_stack_beg && stack_end < other_stack_end)) {
307 
308           /* Print the other stack values before the abort */
309           if (__kmp_storage_map)
310             __kmp_print_storage_map_gtid(
311                 -1, other_stack_beg, other_stack_end,
312                 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
313                 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
314 
315           __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
316                       __kmp_msg_null);
317         }
318       }
319     }
320   }
321   KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
322 }
323 
324 /* ------------------------------------------------------------------------ */
325 
326 void __kmp_infinite_loop(void) {
327   static int done = FALSE;
328 
329   while (!done) {
330     KMP_YIELD(TRUE);
331   }
332 }
333 
334 #define MAX_MESSAGE 512
335 
336 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
337                                   char const *format, ...) {
338   char buffer[MAX_MESSAGE];
339   va_list ap;
340 
341   va_start(ap, format);
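  /* The caller's format string is embedded into the wrapper format as text;
     __kmp_vprintf below then expands the caller's varargs against it. */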
342   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
343                p2, (unsigned long)size, format);
344   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
345   __kmp_vprintf(kmp_err, buffer, ap);
346 #if KMP_PRINT_DATA_PLACEMENT
347   int node;
348   if (gtid >= 0) {
349     if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
350       if (__kmp_storage_map_verbose) {
351         node = __kmp_get_host_node(p1);
352         if (node < 0) /* doesn't work, so don't try this next time */
353           __kmp_storage_map_verbose = FALSE;
354         else {
355           char *last;
356           int lastNode;
357           int localProc = __kmp_get_cpu_from_gtid(gtid);
358 
359           const int page_size = KMP_GET_PAGE_SIZE();
360 
361           p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
362           p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
363           if (localProc >= 0)
364             __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
365                                  localProc >> 1);
366           else
367             __kmp_printf_no_lock("  GTID %d\n", gtid);
368 #if KMP_USE_PRCTL
369           /* The more elaborate format is disabled for now because of the prctl
370            * hanging bug. */
371           do {
372             last = p1;
373             lastNode = node;
374             /* This loop collates adjacent pages with the same host node. */
375             do {
              p1 = (char *)p1 + page_size;
377             } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
378             __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
379                                  lastNode);
380           } while (p1 <= p2);
381 #else
382           __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
383                                (char *)p1 + (page_size - 1),
384                                __kmp_get_host_node(p1));
385           if (p1 < p2) {
386             __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
387                                  (char *)p2 + (page_size - 1),
388                                  __kmp_get_host_node(p2));
389           }
390 #endif
391         }
392       }
393     } else
394       __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
395   }
396 #endif /* KMP_PRINT_DATA_PLACEMENT */
397   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
398 }
399 
400 void __kmp_warn(char const *format, ...) {
401   char buffer[MAX_MESSAGE];
402   va_list ap;
403 
404   if (__kmp_generate_warnings == kmp_warnings_off) {
405     return;
406   }
407 
408   va_start(ap, format);
409 
410   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
411   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
412   __kmp_vprintf(kmp_err, buffer, ap);
413   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
414 
415   va_end(ap);
416 }
417 
418 void __kmp_abort_process() {
419   // Later threads may stall here, but that's ok because abort() will kill them.
420   __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
421 
422   if (__kmp_debug_buf) {
423     __kmp_dump_debug_buffer();
424   }
425 
426   if (KMP_OS_WINDOWS) {
427     // Let other threads know of abnormal termination and prevent deadlock
428     // if abort happened during library initialization or shutdown
429     __kmp_global.g.g_abort = SIGABRT;
430 
    /* On Windows* OS, abort() by default raises a pop-up error box, which
       stalls nightly testing. Unfortunately, we cannot reliably suppress the
       pop-up error boxes. _set_abort_behavior() works well, but this function
       is not available in VS7 (this is not a problem for the DLL, but it is a
       problem for the static OpenMP RTL). SetErrorMode (and so the timelimit
       utility) does not help, at least in some versions of the MS C RTL.

       It seems the following sequence is the only way to simulate abort() and
       avoid the pop-up error box. */
440     raise(SIGABRT);
441     _exit(3); // Just in case, if signal ignored, exit anyway.
442   } else {
443     abort();
444   }
445 
446   __kmp_infinite_loop();
447   __kmp_release_bootstrap_lock(&__kmp_exit_lock);
448 
449 } // __kmp_abort_process
450 
451 void __kmp_abort_thread(void) {
452   // TODO: Eliminate g_abort global variable and this function.
453   // In case of abort just call abort(), it will kill all the threads.
454   __kmp_infinite_loop();
455 } // __kmp_abort_thread
456 
457 /* Print out the storage map for the major kmp_info_t thread data structures
458    that are allocated together. */
459 
460 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
461   __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
462                                gtid);
463 
464   __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
465                                sizeof(kmp_desc_t), "th_%d.th_info", gtid);
466 
467   __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
468                                sizeof(kmp_local_t), "th_%d.th_local", gtid);
469 
470   __kmp_print_storage_map_gtid(
471       gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
472       sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
473 
474   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
475                                &thr->th.th_bar[bs_plain_barrier + 1],
476                                sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
477                                gtid);
478 
479   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
480                                &thr->th.th_bar[bs_forkjoin_barrier + 1],
481                                sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
482                                gtid);
483 
484 #if KMP_FAST_REDUCTION_BARRIER
485   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
486                                &thr->th.th_bar[bs_reduction_barrier + 1],
487                                sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
488                                gtid);
489 #endif // KMP_FAST_REDUCTION_BARRIER
490 }
491 
492 /* Print out the storage map for the major kmp_team_t team data structures
493    that are allocated together. */
494 
495 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
496                                          int team_id, int num_thr) {
497   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
498   __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
499                                header, team_id);
500 
501   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
502                                &team->t.t_bar[bs_last_barrier],
503                                sizeof(kmp_balign_team_t) * bs_last_barrier,
504                                "%s_%d.t_bar", header, team_id);
505 
506   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
507                                &team->t.t_bar[bs_plain_barrier + 1],
508                                sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
509                                header, team_id);
510 
511   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
512                                &team->t.t_bar[bs_forkjoin_barrier + 1],
513                                sizeof(kmp_balign_team_t),
514                                "%s_%d.t_bar[forkjoin]", header, team_id);
515 
516 #if KMP_FAST_REDUCTION_BARRIER
517   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
518                                &team->t.t_bar[bs_reduction_barrier + 1],
519                                sizeof(kmp_balign_team_t),
520                                "%s_%d.t_bar[reduction]", header, team_id);
521 #endif // KMP_FAST_REDUCTION_BARRIER
522 
523   __kmp_print_storage_map_gtid(
524       -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
525       sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
526 
527   __kmp_print_storage_map_gtid(
528       -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
529       sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
530 
531   __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
532                                &team->t.t_disp_buffer[num_disp_buff],
533                                sizeof(dispatch_shared_info_t) * num_disp_buff,
534                                "%s_%d.t_disp_buffer", header, team_id);
535 
536   __kmp_print_storage_map_gtid(-1, &team->t.t_taskq, &team->t.t_copypriv_data,
537                                sizeof(kmp_taskq_t), "%s_%d.t_taskq", header,
538                                team_id);
539 }
540 
541 static void __kmp_init_allocator() {
542 #if OMP_50_ENABLED
543   __kmp_init_memkind();
544 #endif
545 }
546 static void __kmp_fini_allocator() {
547 #if OMP_50_ENABLED
548   __kmp_fini_memkind();
549 #endif
550 }
551 
552 /* ------------------------------------------------------------------------ */
553 
554 #if KMP_DYNAMIC_LIB
555 #if KMP_OS_WINDOWS
556 
557 static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
558   // TODO: Change to __kmp_break_bootstrap_lock().
  __kmp_init_bootstrap_lock(lck); // re-initialize so the lock ends up released
560 }
561 
562 static void __kmp_reset_locks_on_process_detach(int gtid_req) {
563   int i;
564   int thread_count;
565 
  // PROCESS_DETACH is expected to be called by the thread that executes
  // ProcessExit() or FreeLibrary(). The OS terminates the other threads
  // (except the one calling ProcessExit or FreeLibrary), so it might seem safe
  // to access __kmp_threads[] without taking the forkjoin_lock. In fact,
  // however, some threads may still be alive here, although they are about to
  // be terminated. Threads in the array with ds_thread==0 are the most
  // suspicious, so accessing __kmp_threads[] may actually be unsafe.
573 
574   // TODO: does it make sense to check __kmp_roots[] ?
575 
  // Verify that no other live threads are still registered with the OpenMP
  // library.
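  // Busy-wait until every other registered thread has actually exited; only
  // then is it reasonably safe to reset the locks below.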
578   while (1) {
579     thread_count = 0;
580     for (i = 0; i < __kmp_threads_capacity; ++i) {
581       if (!__kmp_threads)
582         continue;
583       kmp_info_t *th = __kmp_threads[i];
584       if (th == NULL)
585         continue;
586       int gtid = th->th.th_info.ds.ds_gtid;
587       if (gtid == gtid_req)
588         continue;
589       if (gtid < 0)
590         continue;
591       DWORD exit_val;
592       int alive = __kmp_is_thread_alive(th, &exit_val);
593       if (alive) {
594         ++thread_count;
595       }
596     }
597     if (thread_count == 0)
598       break; // success
599   }
600 
601   // Assume that I'm alone. Now it might be safe to check and reset locks.
602   // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
603   __kmp_reset_lock(&__kmp_forkjoin_lock);
604 #ifdef KMP_DEBUG
605   __kmp_reset_lock(&__kmp_stdio_lock);
606 #endif // KMP_DEBUG
607 }
608 
609 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
610   //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
611 
612   switch (fdwReason) {
613 
614   case DLL_PROCESS_ATTACH:
615     KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
616 
617     return TRUE;
618 
619   case DLL_PROCESS_DETACH:
620     KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
621 
622     if (lpReserved != NULL) {
      // lpReserved distinguishes the two cases:
      //   lpReserved == NULL when FreeLibrary() was called,
      //   lpReserved != NULL when the process terminates.
      // When FreeLibrary() is called, worker threads remain alive, so they
      // will release the forkjoin lock by themselves. When the process
      // terminates, worker threads disappear, triggering the problem of an
      // unreleased forkjoin lock as described below.
630 
      // A worker thread can take the forkjoin lock. The problem comes up if
      // that worker thread dies before it releases the forkjoin lock. The
      // forkjoin lock remains taken, while the thread executing
      // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try
      // to take the forkjoin lock and will always fail, so the application
      // will never finish [normally]. This scenario is possible if
      // __kmpc_end() has not been executed. It is not just a corner case;
      // it happens in common situations:
      // - the main function was compiled by an alternative compiler;
      // - the main function was compiled by icl but without /Qopenmp
      //   (application with plugins);
      // - the application terminates by calling C exit(), Fortran CALL EXIT(),
      //   or Fortran STOP;
      // - a live foreign thread prevented __kmpc_end() from doing cleanup.
645       //
646       // This is a hack to work around the problem.
647       // TODO: !!! figure out something better.
648       __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
649     }
650 
651     __kmp_internal_end_library(__kmp_gtid_get_specific());
652 
653     return TRUE;
654 
655   case DLL_THREAD_ATTACH:
656     KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
657 
658     /* if we want to register new siblings all the time here call
659      * __kmp_get_gtid(); */
660     return TRUE;
661 
662   case DLL_THREAD_DETACH:
663     KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
664 
665     __kmp_internal_end_thread(__kmp_gtid_get_specific());
666     return TRUE;
667   }
668 
669   return TRUE;
670 }
671 
672 #endif /* KMP_OS_WINDOWS */
673 #endif /* KMP_DYNAMIC_LIB */
674 
675 /* __kmp_parallel_deo -- Wait until it's our turn. */
676 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
677   int gtid = *gtid_ref;
678 #ifdef BUILD_PARALLEL_ORDERED
679   kmp_team_t *team = __kmp_team_from_gtid(gtid);
680 #endif /* BUILD_PARALLEL_ORDERED */
681 
682   if (__kmp_env_consistency_check) {
683     if (__kmp_threads[gtid]->th.th_root->r.r_active)
684 #if KMP_USE_DYNAMIC_LOCK
685       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
686 #else
687       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
688 #endif
689   }
690 #ifdef BUILD_PARALLEL_ORDERED
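  /* t_ordered.dt.t_value holds the tid whose turn it is; spin until it equals
     this thread's tid. __kmp_parallel_dxo advances it to the next tid. */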
691   if (!team->t.t_serialized) {
692     KMP_MB();
693     KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
694              NULL);
695     KMP_MB();
696   }
697 #endif /* BUILD_PARALLEL_ORDERED */
698 }
699 
700 /* __kmp_parallel_dxo -- Signal the next task. */
701 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
702   int gtid = *gtid_ref;
703 #ifdef BUILD_PARALLEL_ORDERED
704   int tid = __kmp_tid_from_gtid(gtid);
705   kmp_team_t *team = __kmp_team_from_gtid(gtid);
706 #endif /* BUILD_PARALLEL_ORDERED */
707 
708   if (__kmp_env_consistency_check) {
709     if (__kmp_threads[gtid]->th.th_root->r.r_active)
710       __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
711   }
712 #ifdef BUILD_PARALLEL_ORDERED
713   if (!team->t.t_serialized) {
714     KMP_MB(); /* Flush all pending memory write invalidates.  */
715 
716     /* use the tid of the next thread in this team */
717     /* TODO replace with general release procedure */
718     team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
719 
720     KMP_MB(); /* Flush all pending memory write invalidates.  */
721   }
722 #endif /* BUILD_PARALLEL_ORDERED */
723 }
724 
725 /* ------------------------------------------------------------------------ */
/* The BARRIER for a SINGLE construct is always explicit */
727 
728 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
729   int status;
730   kmp_info_t *th;
731   kmp_team_t *team;
732 
733   if (!TCR_4(__kmp_init_parallel))
734     __kmp_parallel_initialize();
735 
736 #if OMP_50_ENABLED
737   __kmp_resume_if_soft_paused();
738 #endif
739 
740   th = __kmp_threads[gtid];
741   team = th->th.th_team;
742   status = 0;
743 
744   th->th.th_ident = id_ref;
745 
746   if (team->t.t_serialized) {
747     status = 1;
748   } else {
749     kmp_int32 old_this = th->th.th_local.this_construct;
750 
751     ++th->th.th_local.this_construct;
752     /* try to set team count to thread count--success means thread got the
753        single block */
754     /* TODO: Should this be acquire or release? */
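    /* Only the first thread to advance t_construct from old_this wins the
       single; the others see the compare-and-store fail and skip the block. */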
755     if (team->t.t_construct == old_this) {
756       status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
757                                               th->th.th_local.this_construct);
758     }
759 #if USE_ITT_BUILD
760     if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
761         KMP_MASTER_GTID(gtid) &&
762 #if OMP_40_ENABLED
763         th->th.th_teams_microtask == NULL &&
764 #endif
765         team->t.t_active_level ==
766             1) { // Only report metadata by master of active team at level 1
767       __kmp_itt_metadata_single(id_ref);
768     }
769 #endif /* USE_ITT_BUILD */
770   }
771 
772   if (__kmp_env_consistency_check) {
773     if (status && push_ws) {
774       __kmp_push_workshare(gtid, ct_psingle, id_ref);
775     } else {
776       __kmp_check_workshare(gtid, ct_psingle, id_ref);
777     }
778   }
779 #if USE_ITT_BUILD
780   if (status) {
781     __kmp_itt_single_start(gtid);
782   }
783 #endif /* USE_ITT_BUILD */
784   return status;
785 }
786 
787 void __kmp_exit_single(int gtid) {
788 #if USE_ITT_BUILD
789   __kmp_itt_single_end(gtid);
790 #endif /* USE_ITT_BUILD */
791   if (__kmp_env_consistency_check)
792     __kmp_pop_workshare(gtid, ct_psingle, NULL);
793 }
794 
/* Determine whether we can go parallel or must use a serialized parallel
 * region, and how many threads we can use.
 * set_nthreads is the number of threads requested for the team.
 * Returns 1 if we should serialize or use only one thread,
 * otherwise the number of threads to use.
 * The forkjoin lock is held by the caller. */
801 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
802                                  int master_tid, int set_nthreads
803 #if OMP_40_ENABLED
804                                  ,
805                                  int enter_teams
806 #endif /* OMP_40_ENABLED */
807                                  ) {
808   int capacity;
809   int new_nthreads;
810   KMP_DEBUG_ASSERT(__kmp_init_serial);
811   KMP_DEBUG_ASSERT(root && parent_team);
812   kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
813 
814   // If dyn-var is set, dynamically adjust the number of desired threads,
815   // according to the method specified by dynamic_mode.
816   new_nthreads = set_nthreads;
817   if (!get__dynamic_2(parent_team, master_tid)) {
818     ;
819   }
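  /* The adjustment methods below: load balance uses the current system load,
     thread limit caps the request at the processors not already in use, and
     random picks a value in [1, set_nthreads]. */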
820 #ifdef USE_LOAD_BALANCE
821   else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
822     new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
823     if (new_nthreads == 1) {
824       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
825                     "reservation to 1 thread\n",
826                     master_tid));
827       return 1;
828     }
829     if (new_nthreads < set_nthreads) {
830       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
831                     "reservation to %d threads\n",
832                     master_tid, new_nthreads));
833     }
834   }
835 #endif /* USE_LOAD_BALANCE */
836   else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
837     new_nthreads = __kmp_avail_proc - __kmp_nth +
838                    (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
839     if (new_nthreads <= 1) {
840       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
841                     "reservation to 1 thread\n",
842                     master_tid));
843       return 1;
844     }
845     if (new_nthreads < set_nthreads) {
846       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
847                     "reservation to %d threads\n",
848                     master_tid, new_nthreads));
849     } else {
850       new_nthreads = set_nthreads;
851     }
852   } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
853     if (set_nthreads > 2) {
854       new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
855       new_nthreads = (new_nthreads % set_nthreads) + 1;
856       if (new_nthreads == 1) {
857         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
858                       "reservation to 1 thread\n",
859                       master_tid));
860         return 1;
861       }
862       if (new_nthreads < set_nthreads) {
863         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
864                       "reservation to %d threads\n",
865                       master_tid, new_nthreads));
866       }
867     }
868   } else {
869     KMP_ASSERT(0);
870   }
871 
872   // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
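  // Threads that __kmp_nth already counts and that this team will reuse (the
  // master, or the existing hot team) must not be double-counted against the
  // limit, hence the subtraction below.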
873   if (__kmp_nth + new_nthreads -
874           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
875       __kmp_max_nth) {
876     int tl_nthreads = __kmp_max_nth - __kmp_nth +
877                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
878     if (tl_nthreads <= 0) {
879       tl_nthreads = 1;
880     }
881 
882     // If dyn-var is false, emit a 1-time warning.
883     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
884       __kmp_reserve_warn = 1;
885       __kmp_msg(kmp_ms_warning,
886                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
887                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
888     }
889     if (tl_nthreads == 1) {
890       KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
891                     "reduced reservation to 1 thread\n",
892                     master_tid));
893       return 1;
894     }
895     KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
896                   "reservation to %d threads\n",
897                   master_tid, tl_nthreads));
898     new_nthreads = tl_nthreads;
899   }
900 
901   // Respect OMP_THREAD_LIMIT
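  // th_cg_roots describes this thread's contention group: cg_nthreads is the
  // number of threads currently in the group, cg_thread_limit its
  // thread-limit-var.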
902   int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
903   int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
904   if (cg_nthreads + new_nthreads -
905           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
906       max_cg_threads) {
907     int tl_nthreads = max_cg_threads - cg_nthreads +
908                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
909     if (tl_nthreads <= 0) {
910       tl_nthreads = 1;
911     }
912 
913     // If dyn-var is false, emit a 1-time warning.
914     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
915       __kmp_reserve_warn = 1;
916       __kmp_msg(kmp_ms_warning,
917                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
918                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
919     }
920     if (tl_nthreads == 1) {
921       KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
922                     "reduced reservation to 1 thread\n",
923                     master_tid));
924       return 1;
925     }
926     KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
927                   "reservation to %d threads\n",
928                   master_tid, tl_nthreads));
929     new_nthreads = tl_nthreads;
930   }
931 
932   // Check if the threads array is large enough, or needs expanding.
933   // See comment in __kmp_register_root() about the adjustment if
934   // __kmp_threads[0] == NULL.
935   capacity = __kmp_threads_capacity;
936   if (TCR_PTR(__kmp_threads[0]) == NULL) {
937     --capacity;
938   }
939   if (__kmp_nth + new_nthreads -
940           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
941       capacity) {
942     // Expand the threads array.
943     int slotsRequired = __kmp_nth + new_nthreads -
944                         (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
945                         capacity;
946     int slotsAdded = __kmp_expand_threads(slotsRequired);
947     if (slotsAdded < slotsRequired) {
948       // The threads array was not expanded enough.
949       new_nthreads -= (slotsRequired - slotsAdded);
950       KMP_ASSERT(new_nthreads >= 1);
951 
952       // If dyn-var is false, emit a 1-time warning.
953       if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
954         __kmp_reserve_warn = 1;
955         if (__kmp_tp_cached) {
956           __kmp_msg(kmp_ms_warning,
957                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
958                     KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
959                     KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
960         } else {
961           __kmp_msg(kmp_ms_warning,
962                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
963                     KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
964         }
965       }
966     }
967   }
968 
969 #ifdef KMP_DEBUG
970   if (new_nthreads == 1) {
971     KC_TRACE(10,
972              ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
973               "dead roots and rechecking; requested %d threads\n",
974               __kmp_get_gtid(), set_nthreads));
975   } else {
976     KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
977                   " %d threads\n",
978                   __kmp_get_gtid(), new_nthreads, set_nthreads));
979   }
980 #endif // KMP_DEBUG
981   return new_nthreads;
982 }
983 
/* Allocate threads from the thread pool and assign them to the new team. We
   are assured that there are enough threads available because we checked
   earlier while holding the forkjoin lock. */
987 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
988                                     kmp_info_t *master_th, int master_gtid) {
989   int i;
990   int use_hot_team;
991 
992   KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
993   KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
994   KMP_MB();
995 
996   /* first, let's setup the master thread */
997   master_th->th.th_info.ds.ds_tid = 0;
998   master_th->th.th_team = team;
999   master_th->th.th_team_nproc = team->t.t_nproc;
1000   master_th->th.th_team_master = master_th;
1001   master_th->th.th_team_serialized = FALSE;
1002   master_th->th.th_dispatch = &team->t.t_dispatch[0];
1003 
1004 /* make sure we are not the optimized hot team */
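/* A hot team keeps its worker threads between parallel regions; when this team
   is the hot team for its level, the workers below do not need to be
   re-allocated. */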
1005 #if KMP_NESTED_HOT_TEAMS
1006   use_hot_team = 0;
1007   kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
1008   if (hot_teams) { // hot teams array is not allocated if
1009     // KMP_HOT_TEAMS_MAX_LEVEL=0
1010     int level = team->t.t_active_level - 1; // index in array of hot teams
1011     if (master_th->th.th_teams_microtask) { // are we inside the teams?
1012       if (master_th->th.th_teams_size.nteams > 1) {
1013         ++level; // level was not increased in teams construct for
1014         // team_of_masters
1015       }
1016       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1017           master_th->th.th_teams_level == team->t.t_level) {
1018         ++level; // level was not increased in teams construct for
1019         // team_of_workers before the parallel
1020       } // team->t.t_level will be increased inside parallel
1021     }
1022     if (level < __kmp_hot_teams_max_level) {
1023       if (hot_teams[level].hot_team) {
1024         // hot team has already been allocated for given level
1025         KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
1026         use_hot_team = 1; // the team is ready to use
1027       } else {
1028         use_hot_team = 0; // AC: threads are not allocated yet
1029         hot_teams[level].hot_team = team; // remember new hot team
1030         hot_teams[level].hot_team_nth = team->t.t_nproc;
1031       }
1032     } else {
1033       use_hot_team = 0;
1034     }
1035   }
1036 #else
1037   use_hot_team = team == root->r.r_hot_team;
1038 #endif
1039   if (!use_hot_team) {
1040 
1041     /* install the master thread */
1042     team->t.t_threads[0] = master_th;
1043     __kmp_initialize_info(master_th, team, 0, master_gtid);
1044 
1045     /* now, install the worker threads */
1046     for (i = 1; i < team->t.t_nproc; i++) {
1047 
1048       /* fork or reallocate a new thread and install it in team */
1049       kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1050       team->t.t_threads[i] = thr;
1051       KMP_DEBUG_ASSERT(thr);
1052       KMP_DEBUG_ASSERT(thr->th.th_team == team);
1053       /* align team and thread arrived states */
1054       KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1055                     "T#%d(%d:%d) join =%llu, plain=%llu\n",
1056                     __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1057                     __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1058                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1059                     team->t.t_bar[bs_plain_barrier].b_arrived));
1060 #if OMP_40_ENABLED
1061       thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1062       thr->th.th_teams_level = master_th->th.th_teams_level;
1063       thr->th.th_teams_size = master_th->th.th_teams_size;
1064 #endif
1065       { // Initialize threads' barrier data.
1066         int b;
1067         kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1068         for (b = 0; b < bs_last_barrier; ++b) {
1069           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1070           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1071 #if USE_DEBUGGER
1072           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1073 #endif
1074         }
1075       }
1076     }
1077 
1078 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1079     __kmp_partition_places(team);
1080 #endif
1081   }
1082 
1083 #if OMP_50_ENABLED
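  // Mark the team as needing a display-affinity message if any thread's
  // previous team size or nesting level differs from the current one.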
1084   if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1085     for (i = 0; i < team->t.t_nproc; i++) {
1086       kmp_info_t *thr = team->t.t_threads[i];
1087       if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1088           thr->th.th_prev_level != team->t.t_level) {
1089         team->t.t_display_affinity = 1;
1090         break;
1091       }
1092     }
1093   }
1094 #endif
1095 
1096   KMP_MB();
1097 }
1098 
1099 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1100 // Propagate any changes to the floating point control registers out to the team
1101 // We try to avoid unnecessary writes to the relevant cache line in the team
1102 // structure, so we don't make changes unless they are needed.
1103 inline static void propagateFPControl(kmp_team_t *team) {
1104   if (__kmp_inherit_fp_control) {
1105     kmp_int16 x87_fpu_control_word;
1106     kmp_uint32 mxcsr;
1107 
1108     // Get master values of FPU control flags (both X87 and vector)
1109     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1110     __kmp_store_mxcsr(&mxcsr);
1111     mxcsr &= KMP_X86_MXCSR_MASK;
1112 
1113     // There is no point looking at t_fp_control_saved here.
1114     // If it is TRUE, we still have to update the values if they are different
1115     // from those we now have. If it is FALSE we didn't save anything yet, but
1116     // our objective is the same. We have to ensure that the values in the team
1117     // are the same as those we have.
1118     // So, this code achieves what we need whether or not t_fp_control_saved is
1119     // true. By checking whether the value needs updating we avoid unnecessary
1120     // writes that would put the cache-line into a written state, causing all
1121     // threads in the team to have to read it again.
1122     KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1123     KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1124     // Although we don't use this value, other code in the runtime wants to know
1125     // whether it should restore them. So we must ensure it is correct.
1126     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1127   } else {
1128     // Similarly here. Don't write to this cache-line in the team structure
1129     // unless we have to.
1130     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1131   }
1132 }
1133 
1134 // Do the opposite, setting the hardware registers to the updated values from
1135 // the team.
1136 inline static void updateHWFPControl(kmp_team_t *team) {
1137   if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
    // Only reset the fp control regs if they have been changed in the team
    // during the parallel region that we are exiting.
1140     kmp_int16 x87_fpu_control_word;
1141     kmp_uint32 mxcsr;
1142     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1143     __kmp_store_mxcsr(&mxcsr);
1144     mxcsr &= KMP_X86_MXCSR_MASK;
1145 
1146     if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1147       __kmp_clear_x87_fpu_status_word();
1148       __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1149     }
1150 
1151     if (team->t.t_mxcsr != mxcsr) {
1152       __kmp_load_mxcsr(&team->t.t_mxcsr);
1153     }
1154   }
1155 }
1156 #else
1157 #define propagateFPControl(x) ((void)0)
1158 #define updateHWFPControl(x) ((void)0)
1159 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1160 
1161 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1162                                      int realloc); // forward declaration
1163 
/* Run a parallel region that has been serialized, so it runs in a team
   consisting only of the single master thread. */
1166 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1167   kmp_info_t *this_thr;
1168   kmp_team_t *serial_team;
1169 
1170   KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1171 
1172   /* Skip all this code for autopar serialized loops since it results in
1173      unacceptable overhead */
1174   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1175     return;
1176 
1177   if (!TCR_4(__kmp_init_parallel))
1178     __kmp_parallel_initialize();
1179 
1180 #if OMP_50_ENABLED
1181   __kmp_resume_if_soft_paused();
1182 #endif
1183 
1184   this_thr = __kmp_threads[global_tid];
1185   serial_team = this_thr->th.th_serial_team;
1186 
1187   /* utilize the serialized team held by this thread */
1188   KMP_DEBUG_ASSERT(serial_team);
1189   KMP_MB();
1190 
1191   if (__kmp_tasking_mode != tskm_immediate_exec) {
1192     KMP_DEBUG_ASSERT(
1193         this_thr->th.th_task_team ==
1194         this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1195     KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1196                      NULL);
1197     KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1198                   "team %p, new task_team = NULL\n",
1199                   global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1200     this_thr->th.th_task_team = NULL;
1201   }
1202 
1203 #if OMP_40_ENABLED
1204   kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1205   if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1206     proc_bind = proc_bind_false;
1207   } else if (proc_bind == proc_bind_default) {
1208     // No proc_bind clause was specified, so use the current value
1209     // of proc-bind-var for this parallel region.
1210     proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1211   }
1212   // Reset for next parallel region
1213   this_thr->th.th_set_proc_bind = proc_bind_default;
1214 #endif /* OMP_40_ENABLED */
1215 
1216 #if OMPT_SUPPORT
1217   ompt_data_t ompt_parallel_data = ompt_data_none;
1218   ompt_data_t *implicit_task_data;
1219   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1220   if (ompt_enabled.enabled &&
1221       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1222 
1223     ompt_task_info_t *parent_task_info;
1224     parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1225 
1226     parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1227     if (ompt_enabled.ompt_callback_parallel_begin) {
1228       int team_size = 1;
1229 
1230       ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1231           &(parent_task_info->task_data), &(parent_task_info->frame),
1232           &ompt_parallel_data, team_size, ompt_parallel_invoker_program,
1233           codeptr);
1234     }
1235   }
1236 #endif // OMPT_SUPPORT
1237 
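  /* If this thread is not already running inside its serial team, set the
     serial team up (allocating a fresh one if the cached serial team is itself
     still serialized); otherwise just push another nesting level onto the
     existing serial team. */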
1238   if (this_thr->th.th_team != serial_team) {
1239     // Nested level will be an index in the nested nthreads array
1240     int level = this_thr->th.th_team->t.t_level;
1241 
1242     if (serial_team->t.t_serialized) {
      /* this serial team was already used
         TODO: increase performance by making these locks more specific */
1245       kmp_team_t *new_team;
1246 
1247       __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1248 
1249       new_team = __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1250 #if OMPT_SUPPORT
1251                                      ompt_parallel_data,
1252 #endif
1253 #if OMP_40_ENABLED
1254                                      proc_bind,
1255 #endif
1256                                      &this_thr->th.th_current_task->td_icvs,
1257                                      0 USE_NESTED_HOT_ARG(NULL));
1258       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1259       KMP_ASSERT(new_team);
1260 
1261       /* setup new serialized team and install it */
1262       new_team->t.t_threads[0] = this_thr;
1263       new_team->t.t_parent = this_thr->th.th_team;
1264       serial_team = new_team;
1265       this_thr->th.th_serial_team = serial_team;
1266 
1267       KF_TRACE(
1268           10,
1269           ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1270            global_tid, serial_team));
1271 
1272       /* TODO the above breaks the requirement that if we run out of resources,
1273          then we can still guarantee that serialized teams are ok, since we may
1274          need to allocate a new one */
1275     } else {
1276       KF_TRACE(
1277           10,
1278           ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1279            global_tid, serial_team));
1280     }
1281 
1282     /* we have to initialize this serial team */
1283     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1284     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1285     KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1286     serial_team->t.t_ident = loc;
1287     serial_team->t.t_serialized = 1;
1288     serial_team->t.t_nproc = 1;
1289     serial_team->t.t_parent = this_thr->th.th_team;
1290     serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1291     this_thr->th.th_team = serial_team;
1292     serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1293 
    KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1295                   this_thr->th.th_current_task));
1296     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1297     this_thr->th.th_current_task->td_flags.executing = 0;
1298 
1299     __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1300 
1301     /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1302        implicit task for each serialized task represented by
1303        team->t.t_serialized? */
1304     copy_icvs(&this_thr->th.th_current_task->td_icvs,
1305               &this_thr->th.th_current_task->td_parent->td_icvs);
1306 
1307     // Thread value exists in the nested nthreads array for the next nested
1308     // level
1309     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1310       this_thr->th.th_current_task->td_icvs.nproc =
1311           __kmp_nested_nth.nth[level + 1];
1312     }
1313 
1314 #if OMP_40_ENABLED
1315     if (__kmp_nested_proc_bind.used &&
1316         (level + 1 < __kmp_nested_proc_bind.used)) {
1317       this_thr->th.th_current_task->td_icvs.proc_bind =
1318           __kmp_nested_proc_bind.bind_types[level + 1];
1319     }
1320 #endif /* OMP_40_ENABLED */
1321 
1322 #if USE_DEBUGGER
1323     serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1324 #endif
1325     this_thr->th.th_info.ds.ds_tid = 0;
1326 
1327     /* set thread cache values */
1328     this_thr->th.th_team_nproc = 1;
1329     this_thr->th.th_team_master = this_thr;
1330     this_thr->th.th_team_serialized = 1;
1331 
1332     serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1333     serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1334 #if OMP_50_ENABLED
1335     serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1336 #endif
1337 
1338     propagateFPControl(serial_team);
1339 
1340     /* check if we need to allocate dispatch buffers stack */
1341     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1342     if (!serial_team->t.t_dispatch->th_disp_buffer) {
1343       serial_team->t.t_dispatch->th_disp_buffer =
1344           (dispatch_private_info_t *)__kmp_allocate(
1345               sizeof(dispatch_private_info_t));
1346     }
1347     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1348 
1349     KMP_MB();
1350 
1351   } else {
1352     /* this serialized team is already being used,
1353      * that's fine, just add another nested level */
1354     KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1355     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1356     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1357     ++serial_team->t.t_serialized;
1358     this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1359 
1360     // Nested level will be an index in the nested nthreads array
1361     int level = this_thr->th.th_team->t.t_level;
1362     // Thread value exists in the nested nthreads array for the next nested
1363     // level
1364     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1365       this_thr->th.th_current_task->td_icvs.nproc =
1366           __kmp_nested_nth.nth[level + 1];
1367     }
1368     serial_team->t.t_level++;
1369     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1370                   "of serial team %p to %d\n",
1371                   global_tid, serial_team, serial_team->t.t_level));
1372 
1373     /* allocate/push dispatch buffers stack */
1374     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1375     {
1376       dispatch_private_info_t *disp_buffer =
1377           (dispatch_private_info_t *)__kmp_allocate(
1378               sizeof(dispatch_private_info_t));
1379       disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1380       serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1381     }
1382     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1383 
1384     KMP_MB();
1385   }
1386 #if OMP_40_ENABLED
1387   KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1388 #endif
1389 
1390 #if OMP_50_ENABLED
1391   // Perform the display affinity functionality for
1392   // serialized parallel regions
1393   if (__kmp_display_affinity) {
1394     if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1395         this_thr->th.th_prev_num_threads != 1) {
1396       // NULL means use the affinity-format-var ICV
1397       __kmp_aux_display_affinity(global_tid, NULL);
1398       this_thr->th.th_prev_level = serial_team->t.t_level;
1399       this_thr->th.th_prev_num_threads = 1;
1400     }
1401   }
1402 #endif
1403 
1404   if (__kmp_env_consistency_check)
1405     __kmp_push_parallel(global_tid, NULL);
1406 #if OMPT_SUPPORT
1407   serial_team->t.ompt_team_info.master_return_address = codeptr;
1408   if (ompt_enabled.enabled &&
1409       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1410     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1411 
1412     ompt_lw_taskteam_t lw_taskteam;
1413     __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1414                             &ompt_parallel_data, codeptr);
1415 
1416     __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
    // don't use lw_taskteam after linking. content was swapped
1418 
1419     /* OMPT implicit task begin */
1420     implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
1421     if (ompt_enabled.ompt_callback_implicit_task) {
1422       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1423           ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
          OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
          ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1425       OMPT_CUR_TASK_INFO(this_thr)
1426           ->thread_num = __kmp_tid_from_gtid(global_tid);
1427     }
1428 
1429     /* OMPT state */
1430     this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
        OMPT_GET_FRAME_ADDRESS(0);
1432   }
1433 #endif
1434 }
1435 
1436 /* most of the work for a fork */
1437 /* return true if we really went parallel, false if serialized */
1438 int __kmp_fork_call(ident_t *loc, int gtid,
1439                     enum fork_context_e call_context, // Intel, GNU, ...
1440                     kmp_int32 argc, microtask_t microtask, launch_t invoker,
1441 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1442 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1443                     va_list *ap
1444 #else
1445                     va_list ap
1446 #endif
1447                     ) {
1448   void **argv;
1449   int i;
1450   int master_tid;
1451   int master_this_cons;
1452   kmp_team_t *team;
1453   kmp_team_t *parent_team;
1454   kmp_info_t *master_th;
1455   kmp_root_t *root;
1456   int nthreads;
1457   int master_active;
1458   int master_set_numthreads;
1459   int level;
1460 #if OMP_40_ENABLED
1461   int active_level;
1462   int teams_level;
1463 #endif
1464 #if KMP_NESTED_HOT_TEAMS
1465   kmp_hot_team_ptr_t **p_hot_teams;
1466 #endif
1467   { // KMP_TIME_BLOCK
1468     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1469     KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1470 
1471     KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1472     if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1473       /* Some systems prefer the stack for the root thread(s) to start with */
1474       /* some gap from the parent stack to prevent false sharing. */
1475       void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1476       /* These 2 lines below are so this does not get optimized out */
1477       if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1478         __kmp_stkpadding += (short)((kmp_int64)dummy);
1479     }
1480 
1481     /* initialize if needed */
1482     KMP_DEBUG_ASSERT(
1483         __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1484     if (!TCR_4(__kmp_init_parallel))
1485       __kmp_parallel_initialize();
1486 
1487 #if OMP_50_ENABLED
1488     __kmp_resume_if_soft_paused();
1489 #endif
1490 
1491     /* setup current data */
1492     master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1493     // shutdown
1494     parent_team = master_th->th.th_team;
1495     master_tid = master_th->th.th_info.ds.ds_tid;
1496     master_this_cons = master_th->th.th_local.this_construct;
1497     root = master_th->th.th_root;
1498     master_active = root->r.r_active;
1499     master_set_numthreads = master_th->th.th_set_nproc;
1500 
1501 #if OMPT_SUPPORT
1502     ompt_data_t ompt_parallel_data = ompt_data_none;
1503     ompt_data_t *parent_task_data;
1504     ompt_frame_t *ompt_frame;
1505     ompt_data_t *implicit_task_data;
1506     void *return_address = NULL;
1507 
1508     if (ompt_enabled.enabled) {
1509       __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1510                                     NULL, NULL);
1511       return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1512     }
1513 #endif
1514 
1515     // Nested level will be an index in the nested nthreads array
1516     level = parent_team->t.t_level;
1517     // used to launch non-serial teams even if nested is not allowed
1518     active_level = parent_team->t.t_active_level;
1519 #if OMP_40_ENABLED
1520     // needed to check nesting inside the teams
1521     teams_level = master_th->th.th_teams_level;
1522 #endif
1523 #if KMP_NESTED_HOT_TEAMS
1524     p_hot_teams = &master_th->th.th_hot_teams;
1525     if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1526       *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1527           sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1528       (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
      // this is either the actual hot team or unused (when active_level > 0)
1530       (*p_hot_teams)[0].hot_team_nth = 1;
1531     }
1532 #endif
1533 
1534 #if OMPT_SUPPORT
1535     if (ompt_enabled.enabled) {
1536       if (ompt_enabled.ompt_callback_parallel_begin) {
1537         int team_size = master_set_numthreads
1538                             ? master_set_numthreads
1539                             : get__nproc_2(parent_team, master_tid);
1540         ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1541             parent_task_data, ompt_frame, &ompt_parallel_data, team_size,
1542             OMPT_INVOKER(call_context), return_address);
1543       }
1544       master_th->th.ompt_thread_info.state = ompt_state_overhead;
1545     }
1546 #endif
1547 
1548     master_th->th.th_ident = loc;
1549 
1550 #if OMP_40_ENABLED
1551     if (master_th->th.th_teams_microtask && ap &&
1552         microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
      // AC: This is the start of a parallel region nested inside a teams
      // construct. The team is actual (hot), all workers are ready at the
      // fork barrier. No lock is needed for the minimal team setup done here
      // before releasing the workers.
1556       parent_team->t.t_ident = loc;
1557       __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1558       parent_team->t.t_argc = argc;
1559       argv = (void **)parent_team->t.t_argv;
1560       for (i = argc - 1; i >= 0; --i)
1561 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1562 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1563         *argv++ = va_arg(*ap, void *);
1564 #else
1565         *argv++ = va_arg(ap, void *);
1566 #endif
      // Increment our nested depth level, but do not increase the
      // serialization count
1568       if (parent_team == master_th->th.th_serial_team) {
1569         // AC: we are in serialized parallel
1570         __kmpc_serialized_parallel(loc, gtid);
1571         KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
        // AC: need this so that enquiry functions work correctly;
        // will restore at join time
1574         parent_team->t.t_serialized--;
1575 #if OMPT_SUPPORT
1576         void *dummy;
1577         void **exit_runtime_p;
1578 
1579         ompt_lw_taskteam_t lw_taskteam;
1580 
1581         if (ompt_enabled.enabled) {
1582           __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1583                                   &ompt_parallel_data, return_address);
1584           exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1585 
1586           __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
          // don't use lw_taskteam after linking. content was swapped
1588 
1589           /* OMPT implicit task begin */
1590           implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1591           if (ompt_enabled.ompt_callback_implicit_task) {
1592             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1593                 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
                implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
                ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1595             OMPT_CUR_TASK_INFO(master_th)
1596                 ->thread_num = __kmp_tid_from_gtid(gtid);
1597           }
1598 
1599           /* OMPT state */
1600           master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1601         } else {
1602           exit_runtime_p = &dummy;
1603         }
1604 #endif
1605 
1606         {
1607           KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1608           KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1609           __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1610 #if OMPT_SUPPORT
1611                                  ,
1612                                  exit_runtime_p
1613 #endif
1614                                  );
1615         }
1616 
1617 #if OMPT_SUPPORT
1618         *exit_runtime_p = NULL;
1619         if (ompt_enabled.enabled) {
1620           OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1621           if (ompt_enabled.ompt_callback_implicit_task) {
1622             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1623                 ompt_scope_end, NULL, implicit_task_data, 1,
                OMPT_CUR_TASK_INFO(master_th)->thread_num,
                ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1625           }
1626           __ompt_lw_taskteam_unlink(master_th);
1627 
1628           if (ompt_enabled.ompt_callback_parallel_end) {
1629             ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1630                 OMPT_CUR_TEAM_DATA(master_th), OMPT_CUR_TASK_DATA(master_th),
1631                 OMPT_INVOKER(call_context), return_address);
1632           }
1633           master_th->th.ompt_thread_info.state = ompt_state_overhead;
1634         }
1635 #endif
1636         return TRUE;
1637       }
1638 
1639       parent_team->t.t_pkfn = microtask;
1640       parent_team->t.t_invoke = invoker;
1641       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1642       parent_team->t.t_active_level++;
1643       parent_team->t.t_level++;
1644 #if OMP_50_ENABLED
1645       parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1646 #endif
1647 
1648       /* Change number of threads in the team if requested */
1649       if (master_set_numthreads) { // The parallel has num_threads clause
1650         if (master_set_numthreads < master_th->th.th_teams_size.nth) {
          // AC: can only reduce the number of threads dynamically;
          // cannot increase it
1652           kmp_info_t **other_threads = parent_team->t.t_threads;
1653           parent_team->t.t_nproc = master_set_numthreads;
1654           for (i = 0; i < master_set_numthreads; ++i) {
1655             other_threads[i]->th.th_team_nproc = master_set_numthreads;
1656           }
1657           // Keep extra threads hot in the team for possible next parallels
1658         }
1659         master_th->th.th_set_nproc = 0;
1660       }
1661 
1662 #if USE_DEBUGGER
1663       if (__kmp_debugging) { // Let debugger override number of threads.
1664         int nth = __kmp_omp_num_threads(loc);
1665         if (nth > 0) { // 0 means debugger doesn't want to change num threads
1666           master_set_numthreads = nth;
1667         }
1668       }
1669 #endif
1670 
1671       KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1672                     "master_th=%p, gtid=%d\n",
1673                     root, parent_team, master_th, gtid));
1674       __kmp_internal_fork(loc, gtid, parent_team);
1675       KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1676                     "master_th=%p, gtid=%d\n",
1677                     root, parent_team, master_th, gtid));
1678 
1679       /* Invoke microtask for MASTER thread */
1680       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1681                     parent_team->t.t_id, parent_team->t.t_pkfn));
1682 
1683       if (!parent_team->t.t_invoke(gtid)) {
1684         KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
1685       }
1686       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1687                     parent_team->t.t_id, parent_team->t.t_pkfn));
1688       KMP_MB(); /* Flush all pending memory write invalidates.  */
1689 
1690       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1691 
1692       return TRUE;
1693     } // Parallel closely nested in teams construct
1694 #endif /* OMP_40_ENABLED */
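    /* Illustrative sketch (not part of the runtime): the branch above handles
       a parallel region closely nested in a teams construct, e.g. (teams
       outside of target as allowed on the host by OpenMP 5.0):

         #pragma omp teams num_teams(2)
         #pragma omp parallel num_threads(4)
         { }

       Each team master reuses its already-forked (hot) team instead of going
       through the general allocation path below. */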
1695 
1696 #if KMP_DEBUG
1697     if (__kmp_tasking_mode != tskm_immediate_exec) {
1698       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1699                        parent_team->t.t_task_team[master_th->th.th_task_state]);
1700     }
1701 #endif
1702 
1703     if (parent_team->t.t_active_level >=
1704         master_th->th.th_current_task->td_icvs.max_active_levels) {
1705       nthreads = 1;
1706     } else {
1707 #if OMP_40_ENABLED
1708       int enter_teams = ((ap == NULL && active_level == 0) ||
1709                          (ap && teams_level > 0 && teams_level == level));
1710 #endif
1711       nthreads =
1712           master_set_numthreads
1713               ? master_set_numthreads
1714               : get__nproc_2(
1715                     parent_team,
1716                     master_tid); // TODO: get nproc directly from current task
1717 
      // Check if we need to take the forkjoin lock (no need for a serialized
      // parallel outside of a teams construct). This code was moved here from
      // __kmp_reserve_threads() to speed up nested serialized parallels.
1721       if (nthreads > 1) {
1722         if ((get__max_active_levels(master_th) == 1 && (root->r.r_in_parallel
1723 #if OMP_40_ENABLED
1724                                                         && !enter_teams
1725 #endif /* OMP_40_ENABLED */
1726                                                         )) ||
1727             (__kmp_library == library_serial)) {
1728           KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1729                         " threads\n",
1730                         gtid, nthreads));
1731           nthreads = 1;
1732         }
1733       }
1734       if (nthreads > 1) {
1735         /* determine how many new threads we can use */
1736         __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1737         nthreads = __kmp_reserve_threads(
1738             root, parent_team, master_tid, nthreads
1739 #if OMP_40_ENABLED
1740             /* AC: If we execute teams from parallel region (on host), then
1741                teams should be created but each can only have 1 thread if
1742                nesting is disabled. If teams called from serial region, then
1743                teams and their threads should be created regardless of the
1744                nesting setting. */
1745             ,
1746             enter_teams
1747 #endif /* OMP_40_ENABLED */
1748             );
1749         if (nthreads == 1) {
1750           // Free lock for single thread execution here; for multi-thread
1751           // execution it will be freed later after team of threads created
1752           // and initialized
1753           __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1754         }
1755       }
1756     }
1757     KMP_DEBUG_ASSERT(nthreads > 0);
1758 
1759     // If we temporarily changed the set number of threads then restore it now
1760     master_th->th.th_set_nproc = 0;
1761 
1762     /* create a serialized parallel region? */
1763     if (nthreads == 1) {
1764 /* josh todo: hypothetical question: what do we do for OS X*? */
1765 #if KMP_OS_LINUX &&                                                            \
1766     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1767       void *args[argc];
1768 #else
1769       void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1770 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1771           KMP_ARCH_AARCH64) */
1772 
1773       KA_TRACE(20,
1774                ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1775 
1776       __kmpc_serialized_parallel(loc, gtid);
1777 
1778       if (call_context == fork_context_intel) {
1779         /* TODO this sucks, use the compiler itself to pass args! :) */
1780         master_th->th.th_serial_team->t.t_ident = loc;
1781 #if OMP_40_ENABLED
1782         if (!ap) {
1783           // revert change made in __kmpc_serialized_parallel()
1784           master_th->th.th_serial_team->t.t_level--;
1785 // Get args from parent team for teams construct
1786 
1787 #if OMPT_SUPPORT
1788           void *dummy;
1789           void **exit_runtime_p;
1790           ompt_task_info_t *task_info;
1791 
1792           ompt_lw_taskteam_t lw_taskteam;
1793 
1794           if (ompt_enabled.enabled) {
1795             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1796                                     &ompt_parallel_data, return_address);
1797 
1798             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
            // don't use lw_taskteam after linking. content was swapped
1800 
1801             task_info = OMPT_CUR_TASK_INFO(master_th);
1802             exit_runtime_p = &(task_info->frame.exit_frame.ptr);
1803             if (ompt_enabled.ompt_callback_implicit_task) {
1804               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1805                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
                  &(task_info->task_data), 1, __kmp_tid_from_gtid(gtid),
                  ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1807               OMPT_CUR_TASK_INFO(master_th)
1808                   ->thread_num = __kmp_tid_from_gtid(gtid);
1809             }
1810 
1811             /* OMPT state */
1812             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1813           } else {
1814             exit_runtime_p = &dummy;
1815           }
1816 #endif
1817 
1818           {
1819             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1820             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1821             __kmp_invoke_microtask(microtask, gtid, 0, argc,
1822                                    parent_team->t.t_argv
1823 #if OMPT_SUPPORT
1824                                    ,
1825                                    exit_runtime_p
1826 #endif
1827                                    );
1828           }
1829 
1830 #if OMPT_SUPPORT
1831           if (ompt_enabled.enabled) {
            *exit_runtime_p = NULL;
1833             if (ompt_enabled.ompt_callback_implicit_task) {
1834               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1835                   ompt_scope_end, NULL, &(task_info->task_data), 1,
                  OMPT_CUR_TASK_INFO(master_th)->thread_num,
                  ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1837             }
1838 
1839             __ompt_lw_taskteam_unlink(master_th);
1840             if (ompt_enabled.ompt_callback_parallel_end) {
1841               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1842                   OMPT_CUR_TEAM_DATA(master_th), parent_task_data,
1843                   OMPT_INVOKER(call_context), return_address);
1844             }
1845             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1846           }
1847 #endif
1848         } else if (microtask == (microtask_t)__kmp_teams_master) {
1849           KMP_DEBUG_ASSERT(master_th->th.th_team ==
1850                            master_th->th.th_serial_team);
1851           team = master_th->th.th_team;
1852           // team->t.t_pkfn = microtask;
1853           team->t.t_invoke = invoker;
1854           __kmp_alloc_argv_entries(argc, team, TRUE);
1855           team->t.t_argc = argc;
1856           argv = (void **)team->t.t_argv;
1857           if (ap) {
1858             for (i = argc - 1; i >= 0; --i)
1859 // TODO: revert workaround for Intel(R) 64 tracker #96
1860 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1861               *argv++ = va_arg(*ap, void *);
1862 #else
1863               *argv++ = va_arg(ap, void *);
1864 #endif
1865           } else {
1866             for (i = 0; i < argc; ++i)
1867               // Get args from parent team for teams construct
1868               argv[i] = parent_team->t.t_argv[i];
1869           }
1870           // AC: revert change made in __kmpc_serialized_parallel()
1871           //     because initial code in teams should have level=0
1872           team->t.t_level--;
1873           // AC: call special invoker for outer "parallel" of teams construct
1874           invoker(gtid);
1875         } else {
1876 #endif /* OMP_40_ENABLED */
1877           argv = args;
1878           for (i = argc - 1; i >= 0; --i)
1879 // TODO: revert workaround for Intel(R) 64 tracker #96
1880 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1881             *argv++ = va_arg(*ap, void *);
1882 #else
1883           *argv++ = va_arg(ap, void *);
1884 #endif
1885           KMP_MB();
1886 
1887 #if OMPT_SUPPORT
1888           void *dummy;
1889           void **exit_runtime_p;
1890           ompt_task_info_t *task_info;
1891 
1892           ompt_lw_taskteam_t lw_taskteam;
1893 
1894           if (ompt_enabled.enabled) {
1895             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1896                                     &ompt_parallel_data, return_address);
1897             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
            // don't use lw_taskteam after linking. content was swapped
1899             task_info = OMPT_CUR_TASK_INFO(master_th);
1900             exit_runtime_p = &(task_info->frame.exit_frame.ptr);
1901 
1902             /* OMPT implicit task begin */
1903             implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1904             if (ompt_enabled.ompt_callback_implicit_task) {
1905               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1906                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
                  implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
                  ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1908               OMPT_CUR_TASK_INFO(master_th)
1909                   ->thread_num = __kmp_tid_from_gtid(gtid);
1910             }
1911 
1912             /* OMPT state */
1913             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1914           } else {
1915             exit_runtime_p = &dummy;
1916           }
1917 #endif
1918 
1919           {
1920             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1921             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1922             __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1923 #if OMPT_SUPPORT
1924                                    ,
1925                                    exit_runtime_p
1926 #endif
1927                                    );
1928           }
1929 
1930 #if OMPT_SUPPORT
1931           if (ompt_enabled.enabled) {
1932             *exit_runtime_p = NULL;
1933             if (ompt_enabled.ompt_callback_implicit_task) {
1934               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1935                   ompt_scope_end, NULL, &(task_info->task_data), 1,
                  OMPT_CUR_TASK_INFO(master_th)->thread_num,
                  ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1937             }
1938 
1939             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1940             __ompt_lw_taskteam_unlink(master_th);
1941             if (ompt_enabled.ompt_callback_parallel_end) {
1942               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1943                   &ompt_parallel_data, parent_task_data,
1944                   OMPT_INVOKER(call_context), return_address);
1945             }
1946             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1947           }
1948 #endif
1949 #if OMP_40_ENABLED
1950         }
1951 #endif /* OMP_40_ENABLED */
1952       } else if (call_context == fork_context_gnu) {
1953 #if OMPT_SUPPORT
1954         ompt_lw_taskteam_t lwt;
1955         __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1956                                 return_address);
1957 
1958         lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1959         __ompt_lw_taskteam_link(&lwt, master_th, 1);
// don't use lw_taskteam after linking. content was swapped
1961 #endif
1962 
1963         // we were called from GNU native code
1964         KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1965         return FALSE;
1966       } else {
1967         KMP_ASSERT2(call_context < fork_context_last,
1968                     "__kmp_fork_call: unknown fork_context parameter");
1969       }
1970 
1971       KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1972       KMP_MB();
1973       return FALSE;
1974     } // if (nthreads == 1)
1975 
    // GEH: only modify the executing flag in the case when not serialized;
    //      the serialized case is handled in __kmpc_serialized_parallel
1978     KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1979                   "curtask=%p, curtask_max_aclevel=%d\n",
1980                   parent_team->t.t_active_level, master_th,
1981                   master_th->th.th_current_task,
1982                   master_th->th.th_current_task->td_icvs.max_active_levels));
1983     // TODO: GEH - cannot do this assertion because root thread not set up as
1984     // executing
1985     // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1986     master_th->th.th_current_task->td_flags.executing = 0;
1987 
1988 #if OMP_40_ENABLED
1989     if (!master_th->th.th_teams_microtask || level > teams_level)
1990 #endif /* OMP_40_ENABLED */
1991     {
1992       /* Increment our nested depth level */
1993       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1994     }
1995 
1996     // See if we need to make a copy of the ICVs.
1997     int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1998     if ((level + 1 < __kmp_nested_nth.used) &&
1999         (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
2000       nthreads_icv = __kmp_nested_nth.nth[level + 1];
2001     } else {
2002       nthreads_icv = 0; // don't update
2003     }
2004 
2005 #if OMP_40_ENABLED
2006     // Figure out the proc_bind_policy for the new team.
2007     kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
2008     kmp_proc_bind_t proc_bind_icv =
2009         proc_bind_default; // proc_bind_default means don't update
2010     if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
2011       proc_bind = proc_bind_false;
2012     } else {
2013       if (proc_bind == proc_bind_default) {
2014         // No proc_bind clause specified; use current proc-bind-var for this
2015         // parallel region
2016         proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
2017       }
2018       /* else: The proc_bind policy was specified explicitly on parallel clause.
2019          This overrides proc-bind-var for this parallel region, but does not
2020          change proc-bind-var. */
2021       // Figure the value of proc-bind-var for the child threads.
2022       if ((level + 1 < __kmp_nested_proc_bind.used) &&
2023           (__kmp_nested_proc_bind.bind_types[level + 1] !=
2024            master_th->th.th_current_task->td_icvs.proc_bind)) {
2025         proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2026       }
2027     }
2028 
2029     // Reset for next parallel region
2030     master_th->th.th_set_proc_bind = proc_bind_default;
2031 #endif /* OMP_40_ENABLED */
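    /* Illustrative sketch: assuming OMP_PROC_BIND was given as a list, e.g.
       OMP_PROC_BIND="spread,close" with no proc_bind clause on the region,
       proc_bind stays "spread" (the current proc-bind-var) while
       proc_bind_icv becomes "close", so the children created below will see
       proc-bind-var == close for the next nesting level. */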
2032 
2033     if ((nthreads_icv > 0)
2034 #if OMP_40_ENABLED
2035         || (proc_bind_icv != proc_bind_default)
2036 #endif /* OMP_40_ENABLED */
2037             ) {
2038       kmp_internal_control_t new_icvs;
2039       copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2040       new_icvs.next = NULL;
2041       if (nthreads_icv > 0) {
2042         new_icvs.nproc = nthreads_icv;
2043       }
2044 
2045 #if OMP_40_ENABLED
2046       if (proc_bind_icv != proc_bind_default) {
2047         new_icvs.proc_bind = proc_bind_icv;
2048       }
2049 #endif /* OMP_40_ENABLED */
2050 
2051       /* allocate a new parallel team */
2052       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2053       team = __kmp_allocate_team(root, nthreads, nthreads,
2054 #if OMPT_SUPPORT
2055                                  ompt_parallel_data,
2056 #endif
2057 #if OMP_40_ENABLED
2058                                  proc_bind,
2059 #endif
2060                                  &new_icvs, argc USE_NESTED_HOT_ARG(master_th));
2061     } else {
2062       /* allocate a new parallel team */
2063       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2064       team = __kmp_allocate_team(root, nthreads, nthreads,
2065 #if OMPT_SUPPORT
2066                                  ompt_parallel_data,
2067 #endif
2068 #if OMP_40_ENABLED
2069                                  proc_bind,
2070 #endif
2071                                  &master_th->th.th_current_task->td_icvs,
2072                                  argc USE_NESTED_HOT_ARG(master_th));
2073     }
2074     KF_TRACE(
2075         10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2076 
2077     /* setup the new team */
2078     KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2079     KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2080     KMP_CHECK_UPDATE(team->t.t_ident, loc);
2081     KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2082     KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2083 #if OMPT_SUPPORT
2084     KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2085                           return_address);
2086 #endif
2087     KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2088 // TODO: parent_team->t.t_level == INT_MAX ???
2089 #if OMP_40_ENABLED
2090     if (!master_th->th.th_teams_microtask || level > teams_level) {
2091 #endif /* OMP_40_ENABLED */
2092       int new_level = parent_team->t.t_level + 1;
2093       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2094       new_level = parent_team->t.t_active_level + 1;
2095       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2096 #if OMP_40_ENABLED
2097     } else {
2098       // AC: Do not increase parallel level at start of the teams construct
2099       int new_level = parent_team->t.t_level;
2100       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2101       new_level = parent_team->t.t_active_level;
2102       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2103     }
2104 #endif /* OMP_40_ENABLED */
2105     kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2106     // set master's schedule as new run-time schedule
2107     KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2108 
2109 #if OMP_40_ENABLED
2110     KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2111 #endif
2112 #if OMP_50_ENABLED
2113     KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2114 #endif
2115 
2116     // Update the floating point rounding in the team if required.
2117     propagateFPControl(team);
2118 
2119     if (__kmp_tasking_mode != tskm_immediate_exec) {
2120       // Set master's task team to team's task team. Unless this is hot team, it
2121       // should be NULL.
2122       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2123                        parent_team->t.t_task_team[master_th->th.th_task_state]);
2124       KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
2125                     "%p, new task_team %p / team %p\n",
2126                     __kmp_gtid_from_thread(master_th),
2127                     master_th->th.th_task_team, parent_team,
2128                     team->t.t_task_team[master_th->th.th_task_state], team));
2129 
2130       if (active_level || master_th->th.th_task_team) {
2131         // Take a memo of master's task_state
2132         KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2133         if (master_th->th.th_task_state_top >=
2134             master_th->th.th_task_state_stack_sz) { // increase size
2135           kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2136           kmp_uint8 *old_stack, *new_stack;
2137           kmp_uint32 i;
2138           new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2139           for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2140             new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2141           }
2142           for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2143                ++i) { // zero-init rest of stack
2144             new_stack[i] = 0;
2145           }
2146           old_stack = master_th->th.th_task_state_memo_stack;
2147           master_th->th.th_task_state_memo_stack = new_stack;
2148           master_th->th.th_task_state_stack_sz = new_size;
2149           __kmp_free(old_stack);
2150         }
2151         // Store master's task_state on stack
2152         master_th->th
2153             .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2154             master_th->th.th_task_state;
2155         master_th->th.th_task_state_top++;
2156 #if KMP_NESTED_HOT_TEAMS
2157         if (master_th->th.th_hot_teams &&
2158             active_level < __kmp_hot_teams_max_level &&
2159             team == master_th->th.th_hot_teams[active_level].hot_team) {
2160           // Restore master's nested state if nested hot team
2161           master_th->th.th_task_state =
2162               master_th->th
2163                   .th_task_state_memo_stack[master_th->th.th_task_state_top];
2164         } else {
2165 #endif
2166           master_th->th.th_task_state = 0;
2167 #if KMP_NESTED_HOT_TEAMS
2168         }
2169 #endif
2170       }
2171 #if !KMP_NESTED_HOT_TEAMS
2172       KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2173                        (team == root->r.r_hot_team));
2174 #endif
2175     }
2176 
2177     KA_TRACE(
2178         20,
2179         ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2180          gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2181          team->t.t_nproc));
2182     KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2183                      (team->t.t_master_tid == 0 &&
2184                       (team->t.t_parent == root->r.r_root_team ||
2185                        team->t.t_parent->t.t_serialized)));
2186     KMP_MB();
2187 
2188     /* now, setup the arguments */
2189     argv = (void **)team->t.t_argv;
2190 #if OMP_40_ENABLED
2191     if (ap) {
2192 #endif /* OMP_40_ENABLED */
2193       for (i = argc - 1; i >= 0; --i) {
2194 // TODO: revert workaround for Intel(R) 64 tracker #96
2195 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
2196         void *new_argv = va_arg(*ap, void *);
2197 #else
2198       void *new_argv = va_arg(ap, void *);
2199 #endif
2200         KMP_CHECK_UPDATE(*argv, new_argv);
2201         argv++;
2202       }
2203 #if OMP_40_ENABLED
2204     } else {
2205       for (i = 0; i < argc; ++i) {
2206         // Get args from parent team for teams construct
2207         KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2208       }
2209     }
2210 #endif /* OMP_40_ENABLED */
2211 
2212     /* now actually fork the threads */
2213     KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2214     if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2215       root->r.r_active = TRUE;
2216 
2217     __kmp_fork_team_threads(root, team, master_th, gtid);
2218     __kmp_setup_icv_copy(team, nthreads,
2219                          &master_th->th.th_current_task->td_icvs, loc);
2220 
2221 #if OMPT_SUPPORT
2222     master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2223 #endif
2224 
2225     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2226 
2227 #if USE_ITT_BUILD
2228     if (team->t.t_active_level == 1 // only report frames at level 1
2229 #if OMP_40_ENABLED
2230         && !master_th->th.th_teams_microtask // not in teams construct
2231 #endif /* OMP_40_ENABLED */
2232         ) {
2233 #if USE_ITT_NOTIFY
2234       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2235           (__kmp_forkjoin_frames_mode == 3 ||
2236            __kmp_forkjoin_frames_mode == 1)) {
2237         kmp_uint64 tmp_time = 0;
2238         if (__itt_get_timestamp_ptr)
2239           tmp_time = __itt_get_timestamp();
2240         // Internal fork - report frame begin
2241         master_th->th.th_frame_time = tmp_time;
2242         if (__kmp_forkjoin_frames_mode == 3)
2243           team->t.t_region_time = tmp_time;
2244       } else
2245 // only one notification scheme (either "submit" or "forking/joined", not both)
2246 #endif /* USE_ITT_NOTIFY */
2247           if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2248               __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2249         // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2250         __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2251       }
2252     }
2253 #endif /* USE_ITT_BUILD */
2254 
2255     /* now go on and do the work */
2256     KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2257     KMP_MB();
2258     KF_TRACE(10,
2259              ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2260               root, team, master_th, gtid));
2261 
2262 #if USE_ITT_BUILD
2263     if (__itt_stack_caller_create_ptr) {
2264       team->t.t_stack_id =
2265           __kmp_itt_stack_caller_create(); // create new stack stitching id
2266       // before entering fork barrier
2267     }
2268 #endif /* USE_ITT_BUILD */
2269 
2270 #if OMP_40_ENABLED
2271     // AC: skip __kmp_internal_fork at teams construct, let only master
2272     // threads execute
2273     if (ap)
2274 #endif /* OMP_40_ENABLED */
2275     {
2276       __kmp_internal_fork(loc, gtid, team);
2277       KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2278                     "master_th=%p, gtid=%d\n",
2279                     root, team, master_th, gtid));
2280     }
2281 
2282     if (call_context == fork_context_gnu) {
2283       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2284       return TRUE;
2285     }
2286 
2287     /* Invoke microtask for MASTER thread */
2288     KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2289                   team->t.t_id, team->t.t_pkfn));
2290   } // END of timer KMP_fork_call block
2291 
2292   if (!team->t.t_invoke(gtid)) {
2293     KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
2294   }
2295   KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2296                 team->t.t_id, team->t.t_pkfn));
2297   KMP_MB(); /* Flush all pending memory write invalidates.  */
2298 
2299   KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2300 
2301 #if OMPT_SUPPORT
2302   if (ompt_enabled.enabled) {
2303     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2304   }
2305 #endif
2306 
2307   return TRUE;
2308 }
2309 
2310 #if OMPT_SUPPORT
2311 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2312                                             kmp_team_t *team) {
2313   // restore state outside the region
2314   thread->th.ompt_thread_info.state =
2315       ((team->t.t_serialized) ? ompt_state_work_serial
2316                               : ompt_state_work_parallel);
2317 }
2318 
2319 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2320                                    kmp_team_t *team, ompt_data_t *parallel_data,
2321                                    fork_context_e fork_context, void *codeptr) {
2322   ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2323   if (ompt_enabled.ompt_callback_parallel_end) {
2324     ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2325         parallel_data, &(task_info->task_data), OMPT_INVOKER(fork_context),
2326         codeptr);
2327   }
2328 
2329   task_info->frame.enter_frame = ompt_data_none;
2330   __kmp_join_restore_state(thread, team);
2331 }
2332 #endif
2333 
2334 void __kmp_join_call(ident_t *loc, int gtid
2335 #if OMPT_SUPPORT
2336                      ,
2337                      enum fork_context_e fork_context
2338 #endif
2339 #if OMP_40_ENABLED
2340                      ,
2341                      int exit_teams
2342 #endif /* OMP_40_ENABLED */
2343                      ) {
2344   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2345   kmp_team_t *team;
2346   kmp_team_t *parent_team;
2347   kmp_info_t *master_th;
2348   kmp_root_t *root;
2349   int master_active;
2350 
2351   KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2352 
2353   /* setup current data */
2354   master_th = __kmp_threads[gtid];
2355   root = master_th->th.th_root;
2356   team = master_th->th.th_team;
2357   parent_team = team->t.t_parent;
2358 
2359   master_th->th.th_ident = loc;
2360 
2361 #if OMPT_SUPPORT
2362   if (ompt_enabled.enabled) {
2363     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2364   }
2365 #endif
2366 
2367 #if KMP_DEBUG
2368   if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2369     KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2370                   "th_task_team = %p\n",
2371                   __kmp_gtid_from_thread(master_th), team,
2372                   team->t.t_task_team[master_th->th.th_task_state],
2373                   master_th->th.th_task_team));
2374     KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2375                      team->t.t_task_team[master_th->th.th_task_state]);
2376   }
2377 #endif
2378 
2379   if (team->t.t_serialized) {
2380 #if OMP_40_ENABLED
2381     if (master_th->th.th_teams_microtask) {
2382       // We are in teams construct
2383       int level = team->t.t_level;
2384       int tlevel = master_th->th.th_teams_level;
2385       if (level == tlevel) {
2386         // AC: we haven't incremented it earlier at start of teams construct,
2387         //     so do it here - at the end of teams construct
2388         team->t.t_level++;
2389       } else if (level == tlevel + 1) {
2390         // AC: we are exiting parallel inside teams, need to increment
2391         // serialization in order to restore it in the next call to
2392         // __kmpc_end_serialized_parallel
2393         team->t.t_serialized++;
2394       }
2395     }
2396 #endif /* OMP_40_ENABLED */
2397     __kmpc_end_serialized_parallel(loc, gtid);
2398 
2399 #if OMPT_SUPPORT
2400     if (ompt_enabled.enabled) {
2401       __kmp_join_restore_state(master_th, parent_team);
2402     }
2403 #endif
2404 
2405     return;
2406   }
2407 
2408   master_active = team->t.t_master_active;
2409 
2410 #if OMP_40_ENABLED
2411   if (!exit_teams)
2412 #endif /* OMP_40_ENABLED */
2413   {
    // AC: No barrier for internal teams at exit from a teams construct,
    //     but there is a barrier for the external team (league).
2416     __kmp_internal_join(loc, gtid, team);
2417   }
2418 #if OMP_40_ENABLED
2419   else {
2420     master_th->th.th_task_state =
2421         0; // AC: no tasking in teams (out of any parallel)
2422   }
2423 #endif /* OMP_40_ENABLED */
2424 
2425   KMP_MB();
2426 
2427 #if OMPT_SUPPORT
2428   ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2429   void *codeptr = team->t.ompt_team_info.master_return_address;
2430 #endif
2431 
2432 #if USE_ITT_BUILD
2433   if (__itt_stack_caller_create_ptr) {
2434     __kmp_itt_stack_caller_destroy(
2435         (__itt_caller)team->t
2436             .t_stack_id); // destroy the stack stitching id after join barrier
2437   }
2438 
2439   // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2440   if (team->t.t_active_level == 1
2441 #if OMP_40_ENABLED
2442       && !master_th->th.th_teams_microtask /* not in teams construct */
2443 #endif /* OMP_40_ENABLED */
2444       ) {
2445     master_th->th.th_ident = loc;
2446     // only one notification scheme (either "submit" or "forking/joined", not
2447     // both)
2448     if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2449         __kmp_forkjoin_frames_mode == 3)
2450       __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2451                              master_th->th.th_frame_time, 0, loc,
2452                              master_th->th.th_team_nproc, 1);
2453     else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2454              !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2455       __kmp_itt_region_joined(gtid);
2456   } // active_level == 1
2457 #endif /* USE_ITT_BUILD */
2458 
2459 #if OMP_40_ENABLED
2460   if (master_th->th.th_teams_microtask && !exit_teams &&
2461       team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2462       team->t.t_level == master_th->th.th_teams_level + 1) {
2463     // AC: We need to leave the team structure intact at the end of parallel
2464     // inside the teams construct, so that at the next parallel same (hot) team
2465     // works, only adjust nesting levels
2466 
2467     /* Decrement our nested depth level */
2468     team->t.t_level--;
2469     team->t.t_active_level--;
2470     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2471 
2472     // Restore number of threads in the team if needed. This code relies on
2473     // the proper adjustment of th_teams_size.nth after the fork in
2474     // __kmp_teams_master on each teams master in the case that
2475     // __kmp_reserve_threads reduced it.
2476     if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2477       int old_num = master_th->th.th_team_nproc;
2478       int new_num = master_th->th.th_teams_size.nth;
2479       kmp_info_t **other_threads = team->t.t_threads;
2480       team->t.t_nproc = new_num;
2481       for (int i = 0; i < old_num; ++i) {
2482         other_threads[i]->th.th_team_nproc = new_num;
2483       }
2484       // Adjust states of non-used threads of the team
2485       for (int i = old_num; i < new_num; ++i) {
2486         // Re-initialize thread's barrier data.
2487         KMP_DEBUG_ASSERT(other_threads[i]);
2488         kmp_balign_t *balign = other_threads[i]->th.th_bar;
2489         for (int b = 0; b < bs_last_barrier; ++b) {
2490           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2491           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2492 #if USE_DEBUGGER
2493           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2494 #endif
2495         }
2496         if (__kmp_tasking_mode != tskm_immediate_exec) {
2497           // Synchronize thread's task state
2498           other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2499         }
2500       }
2501     }
2502 
2503 #if OMPT_SUPPORT
2504     if (ompt_enabled.enabled) {
2505       __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
2506                       codeptr);
2507     }
2508 #endif
2509 
2510     return;
2511   }
2512 #endif /* OMP_40_ENABLED */
2513 
2514   /* do cleanup and restore the parent team */
2515   master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2516   master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2517 
2518   master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2519 
2520   /* jc: The following lock has instructions with REL and ACQ semantics,
2521      separating the parallel user code called in this parallel region
2522      from the serial user code called after this function returns. */
2523   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2524 
2525 #if OMP_40_ENABLED
2526   if (!master_th->th.th_teams_microtask ||
2527       team->t.t_level > master_th->th.th_teams_level)
2528 #endif /* OMP_40_ENABLED */
2529   {
2530     /* Decrement our nested depth level */
2531     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2532   }
2533   KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2534 
2535 #if OMPT_SUPPORT
2536   if (ompt_enabled.enabled) {
2537     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2538     if (ompt_enabled.ompt_callback_implicit_task) {
2539       int ompt_team_size = team->t.t_nproc;
2540       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2541           ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
          OMPT_CUR_TASK_INFO(master_th)->thread_num,
          ompt_task_implicit); // TODO: Can this be ompt_task_initial?
2543     }
2544 
2545     task_info->frame.exit_frame = ompt_data_none;
2546     task_info->task_data = ompt_data_none;
2547   }
2548 #endif
2549 
2550   KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2551                 master_th, team));
2552   __kmp_pop_current_task_from_thread(master_th);
2553 
2554 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
2555   // Restore master thread's partition.
2556   master_th->th.th_first_place = team->t.t_first_place;
2557   master_th->th.th_last_place = team->t.t_last_place;
2558 #endif /* OMP_40_ENABLED */
2559 #if OMP_50_ENABLED
2560   master_th->th.th_def_allocator = team->t.t_def_allocator;
2561 #endif
2562 
2563   updateHWFPControl(team);
2564 
2565   if (root->r.r_active != master_active)
2566     root->r.r_active = master_active;
2567 
2568   __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2569                             master_th)); // this will free worker threads
2570 
2571   /* this race was fun to find. make sure the following is in the critical
2572      region otherwise assertions may fail occasionally since the old team may be
2573      reallocated and the hierarchy appears inconsistent. it is actually safe to
2574      run and won't cause any bugs, but will cause those assertion failures. it's
2575      only one deref&assign so might as well put this in the critical region */
2576   master_th->th.th_team = parent_team;
2577   master_th->th.th_team_nproc = parent_team->t.t_nproc;
2578   master_th->th.th_team_master = parent_team->t.t_threads[0];
2579   master_th->th.th_team_serialized = parent_team->t.t_serialized;
2580 
2581   /* restore serialized team, if need be */
2582   if (parent_team->t.t_serialized &&
2583       parent_team != master_th->th.th_serial_team &&
2584       parent_team != root->r.r_root_team) {
2585     __kmp_free_team(root,
2586                     master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2587     master_th->th.th_serial_team = parent_team;
2588   }
2589 
2590   if (__kmp_tasking_mode != tskm_immediate_exec) {
2591     if (master_th->th.th_task_state_top >
2592         0) { // Restore task state from memo stack
2593       KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2594       // Remember master's state if we re-use this nested hot team
2595       master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2596           master_th->th.th_task_state;
2597       --master_th->th.th_task_state_top; // pop
2598       // Now restore state at this level
2599       master_th->th.th_task_state =
2600           master_th->th
2601               .th_task_state_memo_stack[master_th->th.th_task_state_top];
2602     }
2603     // Copy the task team from the parent team to the master thread
2604     master_th->th.th_task_team =
2605         parent_team->t.t_task_team[master_th->th.th_task_state];
2606     KA_TRACE(20,
2607              ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2608               __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2609               parent_team));
2610   }
2611 
2612   // TODO: GEH - cannot do this assertion because root thread not set up as
2613   // executing
2614   // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2615   master_th->th.th_current_task->td_flags.executing = 1;
2616 
2617   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2618 
2619 #if OMPT_SUPPORT
2620   if (ompt_enabled.enabled) {
2621     __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
2622                     codeptr);
2623   }
2624 #endif
2625 
2626   KMP_MB();
2627   KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2628 }
2629 
2630 /* Check whether we should push an internal control record onto the
2631    serial team stack.  If so, do it.  */
2632 void __kmp_save_internal_controls(kmp_info_t *thread) {
2633 
2634   if (thread->th.th_team != thread->th.th_serial_team) {
2635     return;
2636   }
2637   if (thread->th.th_team->t.t_serialized > 1) {
2638     int push = 0;
2639 
2640     if (thread->th.th_team->t.t_control_stack_top == NULL) {
2641       push = 1;
2642     } else {
2643       if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2644           thread->th.th_team->t.t_serialized) {
2645         push = 1;
2646       }
2647     }
2648     if (push) { /* push a record on the serial team's stack */
2649       kmp_internal_control_t *control =
2650           (kmp_internal_control_t *)__kmp_allocate(
2651               sizeof(kmp_internal_control_t));
2652 
2653       copy_icvs(control, &thread->th.th_current_task->td_icvs);
2654 
2655       control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2656 
2657       control->next = thread->th.th_team->t.t_control_stack_top;
2658       thread->th.th_team->t.t_control_stack_top = control;
2659     }
2660   }
2661 }
2662 
2663 /* Changes set_nproc */
2664 void __kmp_set_num_threads(int new_nth, int gtid) {
2665   kmp_info_t *thread;
2666   kmp_root_t *root;
2667 
2668   KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2669   KMP_DEBUG_ASSERT(__kmp_init_serial);
2670 
2671   if (new_nth < 1)
2672     new_nth = 1;
2673   else if (new_nth > __kmp_max_nth)
2674     new_nth = __kmp_max_nth;
2675 
2676   KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2677   thread = __kmp_threads[gtid];
2678   if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2679     return; // nothing to do
2680 
2681   __kmp_save_internal_controls(thread);
2682 
2683   set__nproc(thread, new_nth);
2684 
2685   // If this omp_set_num_threads() call will cause the hot team size to be
2686   // reduced (in the absence of a num_threads clause), then reduce it now,
2687   // rather than waiting for the next parallel region.
2688   root = thread->th.th_root;
2689   if (__kmp_init_parallel && (!root->r.r_active) &&
2690       (root->r.r_hot_team->t.t_nproc > new_nth)
2691 #if KMP_NESTED_HOT_TEAMS
2692       && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2693 #endif
2694       ) {
2695     kmp_team_t *hot_team = root->r.r_hot_team;
2696     int f;
2697 
2698     __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2699 
2700     // Release the extra threads we don't need any more.
2701     for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2702       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2703       if (__kmp_tasking_mode != tskm_immediate_exec) {
        // When decreasing the team size, threads that are no longer in the
        // team should unreference the task team.
2706         hot_team->t.t_threads[f]->th.th_task_team = NULL;
2707       }
2708       __kmp_free_thread(hot_team->t.t_threads[f]);
2709       hot_team->t.t_threads[f] = NULL;
2710     }
2711     hot_team->t.t_nproc = new_nth;
2712 #if KMP_NESTED_HOT_TEAMS
2713     if (thread->th.th_hot_teams) {
2714       KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2715       thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2716     }
2717 #endif
2718 
2719     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2720 
2721     // Update the t_nproc field in the threads that are still active.
2722     for (f = 0; f < new_nth; f++) {
2723       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2724       hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2725     }
    // Special flag to mark a size change due to an omp_set_num_threads() call
2727     hot_team->t.t_size_changed = -1;
2728   }
2729 }
2730 
2731 /* Changes max_active_levels */
2732 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2733   kmp_info_t *thread;
2734 
2735   KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2736                 "%d = (%d)\n",
2737                 gtid, max_active_levels));
2738   KMP_DEBUG_ASSERT(__kmp_init_serial);
2739 
2740   // validate max_active_levels
2741   if (max_active_levels < 0) {
2742     KMP_WARNING(ActiveLevelsNegative, max_active_levels);
    // Ignore the call if the user has specified a negative value: the current
    // setting is left unchanged, so the last valid setting remains in effect.
    // A warning is issued (if warnings are allowed, as controlled by the
    // KMP_WARNINGS env var).
2747     KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2748                   "max_active_levels for thread %d = (%d)\n",
2749                   gtid, max_active_levels));
2750     return;
2751   }
2752   if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
    // OK: max_active_levels is within the valid range
    // [0; KMP_MAX_ACTIVE_LEVELS_LIMIT].
    // A zero value is allowed (implementation-defined behavior).
2756   } else {
2757     KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2758                 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2759     max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
    // The current upper limit is MAX_INT (implementation-defined behavior).
    // If the input exceeds the upper limit, we correct the input to be the
    // upper limit (implementation-defined behavior).
    // In practice, the flow should never get here as long as the limit is
    // MAX_INT.
2764   }
2765   KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2766                 "max_active_levels for thread %d = (%d)\n",
2767                 gtid, max_active_levels));
2768 
2769   thread = __kmp_threads[gtid];
2770 
2771   __kmp_save_internal_controls(thread);
2772 
2773   set__max_active_levels(thread, max_active_levels);
2774 }
2775 
2776 /* Gets max_active_levels */
2777 int __kmp_get_max_active_levels(int gtid) {
2778   kmp_info_t *thread;
2779 
2780   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2781   KMP_DEBUG_ASSERT(__kmp_init_serial);
2782 
2783   thread = __kmp_threads[gtid];
2784   KMP_DEBUG_ASSERT(thread->th.th_current_task);
2785   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2786                 "curtask_maxaclevel=%d\n",
2787                 gtid, thread->th.th_current_task,
2788                 thread->th.th_current_task->td_icvs.max_active_levels));
2789   return thread->th.th_current_task->td_icvs.max_active_levels;
2790 }
2791 
2792 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2793 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2794   kmp_info_t *thread;
2795   //    kmp_team_t *team;
2796 
2797   KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2798                 gtid, (int)kind, chunk));
2799   KMP_DEBUG_ASSERT(__kmp_init_serial);
2800 
2801   // Check if the kind parameter is valid, correct if needed.
2802   // Valid parameters should fit in one of two intervals - standard or extended:
2803   //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2804   // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
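  // Worked example (derived from the table above, assuming those enumerator
  // values): standard kinds 1..4 map through
  // __kmp_sch_map[kind - kmp_sched_lower - 1], i.e. slots 0..3, while extended
  // kinds 101..102 map through __kmp_sch_map[kind - kmp_sched_lower_ext +
  // kmp_sched_upper_std - kmp_sched_lower - 2], i.e. slots 4..5.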
2805   if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2806       (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2807     // TODO: Hint needs attention in case we change the default schedule.
2808     __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2809               KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2810               __kmp_msg_null);
2811     kind = kmp_sched_default;
2812     chunk = 0; // ignore chunk value in case of bad kind
2813   }
2814 
2815   thread = __kmp_threads[gtid];
2816 
2817   __kmp_save_internal_controls(thread);
2818 
2819   if (kind < kmp_sched_upper_std) {
2820     if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
      // distinguish static chunked vs. unchunked: an invalid chunk value
      // indicates the unchunked schedule (which is the default)
2823       thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2824     } else {
2825       thread->th.th_current_task->td_icvs.sched.r_sched_type =
2826           __kmp_sch_map[kind - kmp_sched_lower - 1];
2827     }
2828   } else {
2829     //    __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2830     //    kmp_sched_lower - 2 ];
2831     thread->th.th_current_task->td_icvs.sched.r_sched_type =
2832         __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2833                       kmp_sched_lower - 2];
2834   }
2835   if (kind == kmp_sched_auto || chunk < 1) {
2836     // ignore parameter chunk for schedule auto
2837     thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2838   } else {
2839     thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2840   }
2841 }
2842 
2843 /* Gets def_sched_var ICV values */
2844 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2845   kmp_info_t *thread;
2846   enum sched_type th_type;
2847 
2848   KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2849   KMP_DEBUG_ASSERT(__kmp_init_serial);
2850 
2851   thread = __kmp_threads[gtid];
2852 
2853   th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2854 
2855   switch (th_type) {
2856   case kmp_sch_static:
2857   case kmp_sch_static_greedy:
2858   case kmp_sch_static_balanced:
2859     *kind = kmp_sched_static;
    *chunk = 0; // chunk was not set; report this fact via a zero value
2861     return;
2862   case kmp_sch_static_chunked:
2863     *kind = kmp_sched_static;
2864     break;
2865   case kmp_sch_dynamic_chunked:
2866     *kind = kmp_sched_dynamic;
2867     break;
2868   case kmp_sch_guided_chunked:
2869   case kmp_sch_guided_iterative_chunked:
2870   case kmp_sch_guided_analytical_chunked:
2871     *kind = kmp_sched_guided;
2872     break;
2873   case kmp_sch_auto:
2874     *kind = kmp_sched_auto;
2875     break;
2876   case kmp_sch_trapezoidal:
2877     *kind = kmp_sched_trapezoidal;
2878     break;
2879 #if KMP_STATIC_STEAL_ENABLED
2880   case kmp_sch_static_steal:
2881     *kind = kmp_sched_static_steal;
2882     break;
2883 #endif
2884   default:
2885     KMP_FATAL(UnknownSchedulingType, th_type);
2886   }
2887 
2888   *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2889 }
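
// Note (added for clarity): the mapping above is many-to-one -- for example,
// kmp_sch_static, kmp_sch_static_greedy and kmp_sch_static_balanced are all
// reported as kmp_sched_static with a zero chunk, so a get/set round trip is
// not guaranteed to preserve the exact internal schedule variant.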
2890 
2891 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2892 
2893   int ii, dd;
2894   kmp_team_t *team;
2895   kmp_info_t *thr;
2896 
2897   KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2898   KMP_DEBUG_ASSERT(__kmp_init_serial);
2899 
2900   // validate level
2901   if (level == 0)
2902     return 0;
2903   if (level < 0)
2904     return -1;
2905   thr = __kmp_threads[gtid];
2906   team = thr->th.th_team;
2907   ii = team->t.t_level;
2908   if (level > ii)
2909     return -1;
2910 
2911 #if OMP_40_ENABLED
2912   if (thr->th.th_teams_microtask) {
    // AC: we are in a teams region where multiple nested teams have the same
    // level
    int tlevel = thr->th.th_teams_level; // the level of the teams construct
    // If level > tlevel, the usual algorithm works and the teams are not
    // touched.
    if (level <= tlevel) {
      KMP_DEBUG_ASSERT(ii >= tlevel);
      // AC: since we need to pass by the teams league, we artificially
      // increase ii
      if (ii == tlevel) {
        ii += 2; // three teams have the same level
      } else {
        ii++; // two teams have the same level
      }
2925     }
2926   }
2927 #endif
2928 
2929   if (ii == level)
2930     return __kmp_tid_from_gtid(gtid);
2931 
2932   dd = team->t.t_serialized;
2933   level++;
2934   while (ii > level) {
2935     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2936     }
2937     if ((team->t.t_serialized) && (!dd)) {
2938       team = team->t.t_parent;
2939       continue;
2940     }
2941     if (ii > level) {
2942       team = team->t.t_parent;
2943       dd = team->t.t_serialized;
2944       ii--;
2945     }
2946   }
2947 
2948   return (dd > 1) ? (0) : (team->t.t_master_tid);
2949 }
2950 
2951 int __kmp_get_team_size(int gtid, int level) {
2952 
2953   int ii, dd;
2954   kmp_team_t *team;
2955   kmp_info_t *thr;
2956 
2957   KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2958   KMP_DEBUG_ASSERT(__kmp_init_serial);
2959 
2960   // validate level
2961   if (level == 0)
2962     return 1;
2963   if (level < 0)
2964     return -1;
2965   thr = __kmp_threads[gtid];
2966   team = thr->th.th_team;
2967   ii = team->t.t_level;
2968   if (level > ii)
2969     return -1;
2970 
2971 #if OMP_40_ENABLED
2972   if (thr->th.th_teams_microtask) {
    // AC: we are in a teams region where multiple nested teams have the same
    // level
    int tlevel = thr->th.th_teams_level; // the level of the teams construct
    // If level > tlevel, the usual algorithm works and the teams are not
    // touched.
    if (level <= tlevel) {
      KMP_DEBUG_ASSERT(ii >= tlevel);
      // AC: since we need to pass by the teams league, we artificially
      // increase ii
      if (ii == tlevel) {
        ii += 2; // three teams have the same level
      } else {
        ii++; // two teams have the same level
      }
2985     }
2986   }
2987 #endif
2988 
2989   while (ii > level) {
2990     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2991     }
2992     if (team->t.t_serialized && (!dd)) {
2993       team = team->t.t_parent;
2994       continue;
2995     }
2996     if (ii > level) {
2997       team = team->t.t_parent;
2998       ii--;
2999     }
3000   }
3001 
3002   return team->t.t_nproc;
3003 }
3004 
3005 kmp_r_sched_t __kmp_get_schedule_global() {
  // This routine exists because the pairs (__kmp_sched, __kmp_chunk) and
  // (__kmp_static, __kmp_guided) may be changed independently by
  // kmp_set_defaults(), so the up-to-date schedule has to be assembled here.
3009 
3010   kmp_r_sched_t r_sched;
3011 
  // Create the schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
  // __kmp_guided. __kmp_sched should keep its original value, so that the user
  // can set KMP_SCHEDULE multiple times and thus have different run-time
  // schedules in different roots (even in OMP 2.5).
3016   if (__kmp_sched == kmp_sch_static) {
3017     // replace STATIC with more detailed schedule (balanced or greedy)
3018     r_sched.r_sched_type = __kmp_static;
3019   } else if (__kmp_sched == kmp_sch_guided_chunked) {
3020     // replace GUIDED with more detailed schedule (iterative or analytical)
3021     r_sched.r_sched_type = __kmp_guided;
3022   } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
3023     r_sched.r_sched_type = __kmp_sched;
3024   }
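  // For example (illustrative): if __kmp_sched is kmp_sch_static and
  // __kmp_static has been set to the greedy variant, the returned r_sched_type
  // is the greedy static schedule rather than plain kmp_sch_static.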
3025 
3026   if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
    // __kmp_chunk may be wrong here (if it was never set)
3028     r_sched.chunk = KMP_DEFAULT_CHUNK;
3029   } else {
3030     r_sched.chunk = __kmp_chunk;
3031   }
3032 
3033   return r_sched;
3034 }
3035 
/* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
   at least argc entries in *t_argv for the requested team. */
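/* Sizing summary (descriptive, derived from the code below): if argc fits in
   KMP_INLINE_ARGV_ENTRIES, the inline buffer embedded in the team structure is
   used; otherwise a heap buffer of max(KMP_MIN_MALLOC_ARGV_ENTRIES, 2 * argc)
   entries is page-allocated. */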
3038 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3039 
3040   KMP_DEBUG_ASSERT(team);
3041   if (!realloc || argc > team->t.t_max_argc) {
3042 
3043     KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3044                    "current entries=%d\n",
3045                    team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3046     /* if previously allocated heap space for args, free them */
3047     if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3048       __kmp_free((void *)team->t.t_argv);
3049 
3050     if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3051       /* use unused space in the cache line for arguments */
3052       team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3053       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3054                      "argv entries\n",
3055                      team->t.t_id, team->t.t_max_argc));
3056       team->t.t_argv = &team->t.t_inline_argv[0];
3057       if (__kmp_storage_map) {
3058         __kmp_print_storage_map_gtid(
3059             -1, &team->t.t_inline_argv[0],
3060             &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3061             (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3062             team->t.t_id);
3063       }
3064     } else {
3065       /* allocate space for arguments in the heap */
3066       team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3067                                ? KMP_MIN_MALLOC_ARGV_ENTRIES
3068                                : 2 * argc;
3069       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3070                      "argv entries\n",
3071                      team->t.t_id, team->t.t_max_argc));
3072       team->t.t_argv =
3073           (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3074       if (__kmp_storage_map) {
3075         __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3076                                      &team->t.t_argv[team->t.t_max_argc],
3077                                      sizeof(void *) * team->t.t_max_argc,
3078                                      "team_%d.t_argv", team->t.t_id);
3079       }
3080     }
3081   }
3082 }
3083 
3084 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3085   int i;
3086   int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3087   team->t.t_threads =
3088       (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3089   team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3090       sizeof(dispatch_shared_info_t) * num_disp_buff);
3091   team->t.t_dispatch =
3092       (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3093   team->t.t_implicit_task_taskdata =
3094       (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3095   team->t.t_max_nproc = max_nth;
3096 
3097   /* setup dispatch buffers */
3098   for (i = 0; i < num_disp_buff; ++i) {
3099     team->t.t_disp_buffer[i].buffer_index = i;
3100 #if OMP_45_ENABLED
3101     team->t.t_disp_buffer[i].doacross_buf_idx = i;
3102 #endif
3103   }
3104 }
3105 
3106 static void __kmp_free_team_arrays(kmp_team_t *team) {
3107   /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3108   int i;
3109   for (i = 0; i < team->t.t_max_nproc; ++i) {
3110     if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3111       __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3112       team->t.t_dispatch[i].th_disp_buffer = NULL;
3113     }
3114   }
3115 #if KMP_USE_HIER_SCHED
3116   __kmp_dispatch_free_hierarchies(team);
3117 #endif
3118   __kmp_free(team->t.t_threads);
3119   __kmp_free(team->t.t_disp_buffer);
3120   __kmp_free(team->t.t_dispatch);
3121   __kmp_free(team->t.t_implicit_task_taskdata);
3122   team->t.t_threads = NULL;
3123   team->t.t_disp_buffer = NULL;
3124   team->t.t_dispatch = NULL;
3125   team->t.t_implicit_task_taskdata = 0;
3126 }
3127 
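// Note (added for clarity): reallocation grows the per-team arrays to max_nth
// slots; the existing t_nproc thread pointers are preserved by copying them
// into the new t_threads array, while the dispatch buffers, dispatch
// structures and implicit task data are simply reallocated.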
3128 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3129   kmp_info_t **oldThreads = team->t.t_threads;
3130 
3131   __kmp_free(team->t.t_disp_buffer);
3132   __kmp_free(team->t.t_dispatch);
3133   __kmp_free(team->t.t_implicit_task_taskdata);
3134   __kmp_allocate_team_arrays(team, max_nth);
3135 
3136   KMP_MEMCPY(team->t.t_threads, oldThreads,
3137              team->t.t_nproc * sizeof(kmp_info_t *));
3138 
3139   __kmp_free(oldThreads);
3140 }
3141 
3142 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3143 
3144   kmp_r_sched_t r_sched =
3145       __kmp_get_schedule_global(); // get current state of scheduling globals
3146 
3147 #if OMP_40_ENABLED
3148   KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3149 #endif /* OMP_40_ENABLED */
3150 
3151   kmp_internal_control_t g_icvs = {
3152     0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3153     (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3154     // adjustment of threads (per thread)
3155     (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3156     // whether blocktime is explicitly set
3157     __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3158 #if KMP_USE_MONITOR
3159     __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3160 // intervals
3161 #endif
3162     __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3163     // next parallel region (per thread)
    // (use a maximum upper bound on the value if __kmp_parallel_initialize has
    // not been called yet)
3165     __kmp_cg_max_nth, // int thread_limit;
3166     __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3167     // for max_active_levels
3168     r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3169 // {sched,chunk} pair
3170 #if OMP_40_ENABLED
3171     __kmp_nested_proc_bind.bind_types[0],
3172     __kmp_default_device,
3173 #endif /* OMP_40_ENABLED */
3174     NULL // struct kmp_internal_control *next;
3175   };
3176 
3177   return g_icvs;
3178 }
3179 
3180 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3181 
3182   kmp_internal_control_t gx_icvs;
  // probably should be team->t.t_serialized, as in
  // __kmp_save_internal_controls
  gx_icvs.serial_nesting_level = 0;
3185   copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3186   gx_icvs.next = NULL;
3187 
3188   return gx_icvs;
3189 }
3190 
3191 static void __kmp_initialize_root(kmp_root_t *root) {
3192   int f;
3193   kmp_team_t *root_team;
3194   kmp_team_t *hot_team;
3195   int hot_team_max_nth;
3196   kmp_r_sched_t r_sched =
3197       __kmp_get_schedule_global(); // get current state of scheduling globals
3198   kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3199   KMP_DEBUG_ASSERT(root);
3200   KMP_ASSERT(!root->r.r_begin);
3201 
3202   /* setup the root state structure */
3203   __kmp_init_lock(&root->r.r_begin_lock);
3204   root->r.r_begin = FALSE;
3205   root->r.r_active = FALSE;
3206   root->r.r_in_parallel = 0;
3207   root->r.r_blocktime = __kmp_dflt_blocktime;
3208 
3209   /* setup the root team for this task */
3210   /* allocate the root team structure */
3211   KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3212 
3213   root_team =
3214       __kmp_allocate_team(root,
3215                           1, // new_nproc
3216                           1, // max_nproc
3217 #if OMPT_SUPPORT
3218                           ompt_data_none, // root parallel id
3219 #endif
3220 #if OMP_40_ENABLED
3221                           __kmp_nested_proc_bind.bind_types[0],
3222 #endif
3223                           &r_icvs,
3224                           0 // argc
3225                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3226                           );
3227 #if USE_DEBUGGER
3228   // Non-NULL value should be assigned to make the debugger display the root
3229   // team.
3230   TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3231 #endif
3232 
3233   KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3234 
3235   root->r.r_root_team = root_team;
3236   root_team->t.t_control_stack_top = NULL;
3237 
3238   /* initialize root team */
3239   root_team->t.t_threads[0] = NULL;
3240   root_team->t.t_nproc = 1;
3241   root_team->t.t_serialized = 1;
3242   // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3243   root_team->t.t_sched.sched = r_sched.sched;
3244   KA_TRACE(
3245       20,
3246       ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3247        root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3248 
  /* setup the hot team for this task */
3250   /* allocate the hot team structure */
3251   KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3252 
3253   hot_team =
3254       __kmp_allocate_team(root,
3255                           1, // new_nproc
3256                           __kmp_dflt_team_nth_ub * 2, // max_nproc
3257 #if OMPT_SUPPORT
3258                           ompt_data_none, // root parallel id
3259 #endif
3260 #if OMP_40_ENABLED
3261                           __kmp_nested_proc_bind.bind_types[0],
3262 #endif
3263                           &r_icvs,
3264                           0 // argc
3265                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3266                           );
3267   KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3268 
3269   root->r.r_hot_team = hot_team;
3270   root_team->t.t_control_stack_top = NULL;
3271 
3272   /* first-time initialization */
3273   hot_team->t.t_parent = root_team;
3274 
3275   /* initialize hot team */
3276   hot_team_max_nth = hot_team->t.t_max_nproc;
3277   for (f = 0; f < hot_team_max_nth; ++f) {
3278     hot_team->t.t_threads[f] = NULL;
3279   }
3280   hot_team->t.t_nproc = 1;
3281   // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3282   hot_team->t.t_sched.sched = r_sched.sched;
3283   hot_team->t.t_size_changed = 0;
3284 }
3285 
3286 #ifdef KMP_DEBUG
3287 
3288 typedef struct kmp_team_list_item {
3289   kmp_team_p const *entry;
3290   struct kmp_team_list_item *next;
3291 } kmp_team_list_item_t;
3292 typedef kmp_team_list_item_t *kmp_team_list_t;
3293 
3294 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3295     kmp_team_list_t list, // List of teams.
3296     kmp_team_p const *team // Team to add.
3297     ) {
3298 
3299   // List must terminate with item where both entry and next are NULL.
3300   // Team is added to the list only once.
3301   // List is sorted in ascending order by team id.
3302   // Team id is *not* a key.
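  // Insertion trick used below: rather than tracking a predecessor pointer,
  // the current node is copied into a freshly malloc'ed node and the current
  // node is then overwritten in place with the new entry, linking it in front
  // of the copy.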
3303 
3304   kmp_team_list_t l;
3305 
3306   KMP_DEBUG_ASSERT(list != NULL);
3307   if (team == NULL) {
3308     return;
3309   }
3310 
3311   __kmp_print_structure_team_accum(list, team->t.t_parent);
3312   __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3313 
3314   // Search list for the team.
3315   l = list;
3316   while (l->next != NULL && l->entry != team) {
3317     l = l->next;
3318   }
3319   if (l->next != NULL) {
3320     return; // Team has been added before, exit.
3321   }
3322 
3323   // Team is not found. Search list again for insertion point.
3324   l = list;
3325   while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3326     l = l->next;
3327   }
3328 
3329   // Insert team.
3330   {
3331     kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3332         sizeof(kmp_team_list_item_t));
3333     *item = *l;
3334     l->entry = team;
3335     l->next = item;
3336   }
3337 }
3338 
3339 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team
3340 
3341                                        ) {
3342   __kmp_printf("%s", title);
3343   if (team != NULL) {
3344     __kmp_printf("%2x %p\n", team->t.t_id, team);
3345   } else {
3346     __kmp_printf(" - (nil)\n");
3347   }
3348 }
3349 
3350 static void __kmp_print_structure_thread(char const *title,
3351                                          kmp_info_p const *thread) {
3352   __kmp_printf("%s", title);
3353   if (thread != NULL) {
3354     __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3355   } else {
3356     __kmp_printf(" - (nil)\n");
3357   }
3358 }
3359 
3360 void __kmp_print_structure(void) {
3361 
3362   kmp_team_list_t list;
3363 
3364   // Initialize list of teams.
3365   list =
3366       (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3367   list->entry = NULL;
3368   list->next = NULL;
3369 
3370   __kmp_printf("\n------------------------------\nGlobal Thread "
3371                "Table\n------------------------------\n");
3372   {
3373     int gtid;
3374     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3375       __kmp_printf("%2d", gtid);
3376       if (__kmp_threads != NULL) {
3377         __kmp_printf(" %p", __kmp_threads[gtid]);
3378       }
3379       if (__kmp_root != NULL) {
3380         __kmp_printf(" %p", __kmp_root[gtid]);
3381       }
3382       __kmp_printf("\n");
3383     }
3384   }
3385 
3386   // Print out __kmp_threads array.
3387   __kmp_printf("\n------------------------------\nThreads\n--------------------"
3388                "----------\n");
3389   if (__kmp_threads != NULL) {
3390     int gtid;
3391     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3392       kmp_info_t const *thread = __kmp_threads[gtid];
3393       if (thread != NULL) {
3394         __kmp_printf("GTID %2d %p:\n", gtid, thread);
3395         __kmp_printf("    Our Root:        %p\n", thread->th.th_root);
3396         __kmp_print_structure_team("    Our Team:     ", thread->th.th_team);
3397         __kmp_print_structure_team("    Serial Team:  ",
3398                                    thread->th.th_serial_team);
3399         __kmp_printf("    Threads:      %2d\n", thread->th.th_team_nproc);
3400         __kmp_print_structure_thread("    Master:       ",
3401                                      thread->th.th_team_master);
3402         __kmp_printf("    Serialized?:  %2d\n", thread->th.th_team_serialized);
3403         __kmp_printf("    Set NProc:    %2d\n", thread->th.th_set_nproc);
3404 #if OMP_40_ENABLED
3405         __kmp_printf("    Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3406 #endif
3407         __kmp_print_structure_thread("    Next in pool: ",
3408                                      thread->th.th_next_pool);
3409         __kmp_printf("\n");
3410         __kmp_print_structure_team_accum(list, thread->th.th_team);
3411         __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3412       }
3413     }
3414   } else {
3415     __kmp_printf("Threads array is not allocated.\n");
3416   }
3417 
3418   // Print out __kmp_root array.
3419   __kmp_printf("\n------------------------------\nUbers\n----------------------"
3420                "--------\n");
3421   if (__kmp_root != NULL) {
3422     int gtid;
3423     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3424       kmp_root_t const *root = __kmp_root[gtid];
3425       if (root != NULL) {
3426         __kmp_printf("GTID %2d %p:\n", gtid, root);
3427         __kmp_print_structure_team("    Root Team:    ", root->r.r_root_team);
3428         __kmp_print_structure_team("    Hot Team:     ", root->r.r_hot_team);
3429         __kmp_print_structure_thread("    Uber Thread:  ",
3430                                      root->r.r_uber_thread);
3431         __kmp_printf("    Active?:      %2d\n", root->r.r_active);
3432         __kmp_printf("    In Parallel:  %2d\n",
3433                      KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3434         __kmp_printf("\n");
3435         __kmp_print_structure_team_accum(list, root->r.r_root_team);
3436         __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3437       }
3438     }
3439   } else {
3440     __kmp_printf("Ubers array is not allocated.\n");
3441   }
3442 
3443   __kmp_printf("\n------------------------------\nTeams\n----------------------"
3444                "--------\n");
3445   while (list->next != NULL) {
3446     kmp_team_p const *team = list->entry;
3447     int i;
3448     __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3449     __kmp_print_structure_team("    Parent Team:      ", team->t.t_parent);
3450     __kmp_printf("    Master TID:       %2d\n", team->t.t_master_tid);
3451     __kmp_printf("    Max threads:      %2d\n", team->t.t_max_nproc);
3452     __kmp_printf("    Levels of serial: %2d\n", team->t.t_serialized);
3453     __kmp_printf("    Number threads:   %2d\n", team->t.t_nproc);
3454     for (i = 0; i < team->t.t_nproc; ++i) {
3455       __kmp_printf("    Thread %2d:      ", i);
3456       __kmp_print_structure_thread("", team->t.t_threads[i]);
3457     }
3458     __kmp_print_structure_team("    Next in pool:     ", team->t.t_next_pool);
3459     __kmp_printf("\n");
3460     list = list->next;
3461   }
3462 
3463   // Print out __kmp_thread_pool and __kmp_team_pool.
3464   __kmp_printf("\n------------------------------\nPools\n----------------------"
3465                "--------\n");
3466   __kmp_print_structure_thread("Thread pool:          ",
3467                                CCAST(kmp_info_t *, __kmp_thread_pool));
3468   __kmp_print_structure_team("Team pool:            ",
3469                              CCAST(kmp_team_t *, __kmp_team_pool));
3470   __kmp_printf("\n");
3471 
3472   // Free team list.
3473   while (list != NULL) {
3474     kmp_team_list_item_t *item = list;
3475     list = list->next;
3476     KMP_INTERNAL_FREE(item);
3477   }
3478 }
3479 
3480 #endif
3481 
3482 //---------------------------------------------------------------------------
3483 //  Stuff for per-thread fast random number generator
3484 //  Table of primes
3485 static const unsigned __kmp_primes[] = {
3486     0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3487     0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3488     0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3489     0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3490     0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3491     0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3492     0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3493     0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3494     0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3495     0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3496     0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3497 
3498 //---------------------------------------------------------------------------
3499 //  __kmp_get_random: Get a random number using a linear congruential method.
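//  The generator is a linear congruential recurrence on the unsigned state
//  th_x:  x_{n+1} = a * x_n + 1  (modulo 2^32 on the usual 32-bit unsigned),
//  with a per-thread multiplier a = th_a. Only the high 16 bits are returned,
//  since the low-order bits of a power-of-two-modulus LCG have short periods.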
3500 unsigned short __kmp_get_random(kmp_info_t *thread) {
3501   unsigned x = thread->th.th_x;
3502   unsigned short r = x >> 16;
3503 
3504   thread->th.th_x = x * thread->th.th_a + 1;
3505 
3506   KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3507                 thread->th.th_info.ds.ds_tid, r));
3508 
3509   return r;
3510 }
3511 //--------------------------------------------------------
3512 // __kmp_init_random: Initialize a random number generator
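// Each thread draws its multiplier from __kmp_primes (indexed by tid modulo
// the table size) and seeds its state from the tid, so distinct tids generally
// yield distinct pseudo-random streams.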
3513 void __kmp_init_random(kmp_info_t *thread) {
3514   unsigned seed = thread->th.th_info.ds.ds_tid;
3515 
3516   thread->th.th_a =
3517       __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3518   thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3519   KA_TRACE(30,
3520            ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3521 }
3522 
3523 #if KMP_OS_WINDOWS
3524 /* reclaim array entries for root threads that are already dead, returns number
3525  * reclaimed */
3526 static int __kmp_reclaim_dead_roots(void) {
3527   int i, r = 0;
3528 
3529   for (i = 0; i < __kmp_threads_capacity; ++i) {
3530     if (KMP_UBER_GTID(i) &&
3531         !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3532         !__kmp_root[i]
3533              ->r.r_active) { // AC: reclaim only roots died in non-active state
3534       r += __kmp_unregister_root_other_thread(i);
3535     }
3536   }
3537   return r;
3538 }
3539 #endif
3540 
3541 /* This function attempts to create free entries in __kmp_threads and
3542    __kmp_root, and returns the number of free entries generated.
3543 
3544    For Windows* OS static library, the first mechanism used is to reclaim array
3545    entries for root threads that are already dead.
3546 
   On all platforms, expansion is attempted on the arrays __kmp_threads and
3548    __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3549    capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3550    threadprivate cache array has been created. Synchronization with
3551    __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3552 
3553    After any dead root reclamation, if the clipping value allows array expansion
3554    to result in the generation of a total of nNeed free slots, the function does
3555    that expansion. If not, nothing is done beyond the possible initial root
3556    thread reclamation.
3557 
3558    If any argument is negative, the behavior is undefined. */
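/* Illustrative example (not from the source): with __kmp_threads_capacity ==
   64 and nNeed == 10, minimumRequiredCapacity is 74, so the capacity is
   doubled once to 128 (clipped to __kmp_sys_max_nth if that is smaller). */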
3559 static int __kmp_expand_threads(int nNeed) {
3560   int added = 0;
3561   int minimumRequiredCapacity;
3562   int newCapacity;
3563   kmp_info_t **newThreads;
3564   kmp_root_t **newRoot;
3565 
3566 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3567 // resizing __kmp_threads does not need additional protection if foreign
3568 // threads are present
3569 
3570 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3571   /* only for Windows static library */
3572   /* reclaim array entries for root threads that are already dead */
3573   added = __kmp_reclaim_dead_roots();
3574 
3575   if (nNeed) {
3576     nNeed -= added;
3577     if (nNeed < 0)
3578       nNeed = 0;
3579   }
3580 #endif
3581   if (nNeed <= 0)
3582     return added;
3583 
3584   // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3585   // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3586   // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3587   // > __kmp_max_nth in one of two ways:
3588   //
3589   // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
  //    may not be reused by another thread, so we may need to increase
3591   //    __kmp_threads_capacity to __kmp_max_nth + 1.
3592   //
3593   // 2) New foreign root(s) are encountered.  We always register new foreign
3594   //    roots. This may cause a smaller # of threads to be allocated at
3595   //    subsequent parallel regions, but the worker threads hang around (and
3596   //    eventually go to sleep) and need slots in the __kmp_threads[] array.
3597   //
3598   // Anyway, that is the reason for moving the check to see if
3599   // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3600   // instead of having it performed here. -BB
3601 
3602   KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3603 
3604   /* compute expansion headroom to check if we can expand */
3605   if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3606     /* possible expansion too small -- give up */
3607     return added;
3608   }
3609   minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3610 
3611   newCapacity = __kmp_threads_capacity;
3612   do {
3613     newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3614                                                           : __kmp_sys_max_nth;
3615   } while (newCapacity < minimumRequiredCapacity);
3616   newThreads = (kmp_info_t **)__kmp_allocate(
3617       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3618   newRoot =
3619       (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3620   KMP_MEMCPY(newThreads, __kmp_threads,
3621              __kmp_threads_capacity * sizeof(kmp_info_t *));
3622   KMP_MEMCPY(newRoot, __kmp_root,
3623              __kmp_threads_capacity * sizeof(kmp_root_t *));
3624 
3625   kmp_info_t **temp_threads = __kmp_threads;
3626   *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3627   *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3628   __kmp_free(temp_threads);
3629   added += newCapacity - __kmp_threads_capacity;
3630   *(volatile int *)&__kmp_threads_capacity = newCapacity;
3631 
3632   if (newCapacity > __kmp_tp_capacity) {
3633     __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3634     if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3635       __kmp_threadprivate_resize_cache(newCapacity);
3636     } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3637       *(volatile int *)&__kmp_tp_capacity = newCapacity;
3638     }
3639     __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3640   }
3641 
3642   return added;
3643 }
3644 
3645 /* Register the current thread as a root thread and obtain our gtid. We must
3646    have the __kmp_initz_lock held at this point. Argument TRUE only if are the
3647    thread that calls from __kmp_do_serial_initialize() */
3648 int __kmp_register_root(int initial_thread) {
3649   kmp_info_t *root_thread;
3650   kmp_root_t *root;
3651   int gtid;
3652   int capacity;
3653   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3654   KA_TRACE(20, ("__kmp_register_root: entered\n"));
3655   KMP_MB();
3656 
  /* 2007-03-02:
     If the initial thread has not invoked the OpenMP RTL yet, and this thread
     is not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity"
     condition does not work as expected -- it may return false (meaning there
     is at least one empty slot in the __kmp_threads array), but it is possible
     that the only free slot is #0, which is reserved for the initial thread
     and so cannot be used for this one. The following code works around this
     bug.

     However, the right solution seems to be not reserving slot #0 for the
     initial thread, because:
     (1) there is no magic in slot #0,
     (2) we cannot detect the initial thread reliably (the first thread that
        performs serial initialization may not be a real initial thread).
  */
3671   capacity = __kmp_threads_capacity;
3672   if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3673     --capacity;
3674   }
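  // Illustrative: with a capacity of 4 where only slot #0 is free, a
  // non-initial thread sees an effective capacity of 3 and, with
  // __kmp_all_nth == 3, takes the __kmp_expand_threads(1) path below.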
3675 
3676   /* see if there are too many threads */
3677   if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3678     if (__kmp_tp_cached) {
3679       __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3680                   KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3681                   KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3682     } else {
3683       __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3684                   __kmp_msg_null);
3685     }
3686   }
3687 
3688   /* find an available thread slot */
  /* Don't reassign the zero slot since we need that to only be used by the
     initial thread */
3691   for (gtid = (initial_thread ? 0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL;
3692        gtid++)
3693     ;
3694   KA_TRACE(1,
3695            ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3696   KMP_ASSERT(gtid < __kmp_threads_capacity);
3697 
3698   /* update global accounting */
3699   __kmp_all_nth++;
3700   TCW_4(__kmp_nth, __kmp_nth + 1);
3701 
3702   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3703   // numbers of procs, and method #2 (keyed API call) for higher numbers.
3704   if (__kmp_adjust_gtid_mode) {
3705     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3706       if (TCR_4(__kmp_gtid_mode) != 2) {
3707         TCW_4(__kmp_gtid_mode, 2);
3708       }
3709     } else {
3710       if (TCR_4(__kmp_gtid_mode) != 1) {
3711         TCW_4(__kmp_gtid_mode, 1);
3712       }
3713     }
3714   }
3715 
3716 #ifdef KMP_ADJUST_BLOCKTIME
3717   /* Adjust blocktime to zero if necessary            */
3718   /* Middle initialization might not have occurred yet */
3719   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3720     if (__kmp_nth > __kmp_avail_proc) {
3721       __kmp_zero_bt = TRUE;
3722     }
3723   }
3724 #endif /* KMP_ADJUST_BLOCKTIME */
3725 
3726   /* setup this new hierarchy */
3727   if (!(root = __kmp_root[gtid])) {
3728     root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3729     KMP_DEBUG_ASSERT(!root->r.r_root_team);
3730   }
3731 
3732 #if KMP_STATS_ENABLED
3733   // Initialize stats as soon as possible (right after gtid assignment).
3734   __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3735   __kmp_stats_thread_ptr->startLife();
3736   KMP_SET_THREAD_STATE(SERIAL_REGION);
3737   KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3738 #endif
3739   __kmp_initialize_root(root);
3740 
3741   /* setup new root thread structure */
3742   if (root->r.r_uber_thread) {
3743     root_thread = root->r.r_uber_thread;
3744   } else {
3745     root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3746     if (__kmp_storage_map) {
3747       __kmp_print_thread_storage_map(root_thread, gtid);
3748     }
3749     root_thread->th.th_info.ds.ds_gtid = gtid;
3750 #if OMPT_SUPPORT
3751     root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3752 #endif
3753     root_thread->th.th_root = root;
3754     if (__kmp_env_consistency_check) {
3755       root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3756     }
3757 #if USE_FAST_MEMORY
3758     __kmp_initialize_fast_memory(root_thread);
3759 #endif /* USE_FAST_MEMORY */
3760 
3761 #if KMP_USE_BGET
3762     KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3763     __kmp_initialize_bget(root_thread);
3764 #endif
3765     __kmp_init_random(root_thread); // Initialize random number generator
3766   }
3767 
3768   /* setup the serial team held in reserve by the root thread */
3769   if (!root_thread->th.th_serial_team) {
3770     kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3771     KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3772     root_thread->th.th_serial_team =
3773         __kmp_allocate_team(root, 1, 1,
3774 #if OMPT_SUPPORT
3775                             ompt_data_none, // root parallel id
3776 #endif
3777 #if OMP_40_ENABLED
3778                             proc_bind_default,
3779 #endif
3780                             &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3781   }
3782   KMP_ASSERT(root_thread->th.th_serial_team);
3783   KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3784                 root_thread->th.th_serial_team));
3785 
3786   /* drop root_thread into place */
3787   TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3788 
3789   root->r.r_root_team->t.t_threads[0] = root_thread;
3790   root->r.r_hot_team->t.t_threads[0] = root_thread;
3791   root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
  // AC: the team is created in reserve, not for execution (it is unused for
  // now).
3793   root_thread->th.th_serial_team->t.t_serialized = 0;
3794   root->r.r_uber_thread = root_thread;
3795 
3796   /* initialize the thread, get it ready to go */
3797   __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3798   TCW_4(__kmp_init_gtid, TRUE);
3799 
3800   /* prepare the master thread for get_gtid() */
3801   __kmp_gtid_set_specific(gtid);
3802 
3803 #if USE_ITT_BUILD
3804   __kmp_itt_thread_name(gtid);
3805 #endif /* USE_ITT_BUILD */
3806 
3807 #ifdef KMP_TDATA_GTID
3808   __kmp_gtid = gtid;
3809 #endif
3810   __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3811   KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3812 
3813   KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3814                 "plain=%u\n",
3815                 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3816                 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3817                 KMP_INIT_BARRIER_STATE));
3818   { // Initialize barrier data.
3819     int b;
3820     for (b = 0; b < bs_last_barrier; ++b) {
3821       root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3822 #if USE_DEBUGGER
3823       root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3824 #endif
3825     }
3826   }
3827   KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3828                    KMP_INIT_BARRIER_STATE);
3829 
3830 #if KMP_AFFINITY_SUPPORTED
3831 #if OMP_40_ENABLED
3832   root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3833   root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3834   root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3835   root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3836 #endif
3837   if (TCR_4(__kmp_init_middle)) {
3838     __kmp_affinity_set_init_mask(gtid, TRUE);
3839   }
3840 #endif /* KMP_AFFINITY_SUPPORTED */
3841 #if OMP_50_ENABLED
3842   root_thread->th.th_def_allocator = __kmp_def_allocator;
3843   root_thread->th.th_prev_level = 0;
3844   root_thread->th.th_prev_num_threads = 1;
3845 #endif
3846 
3847   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3848   tmp->cg_root = root_thread;
3849   tmp->cg_thread_limit = __kmp_cg_max_nth;
3850   tmp->cg_nthreads = 1;
3851   KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3852                  " cg_nthreads init to 1\n",
3853                  root_thread, tmp));
3854   tmp->up = NULL;
3855   root_thread->th.th_cg_roots = tmp;
3856 
3857   __kmp_root_counter++;
3858 
3859 #if OMPT_SUPPORT
3860   if (!initial_thread && ompt_enabled.enabled) {
3861 
3862     kmp_info_t *root_thread = ompt_get_thread();
3863 
3864     ompt_set_thread_state(root_thread, ompt_state_overhead);
3865 
3866     if (ompt_enabled.ompt_callback_thread_begin) {
3867       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3868           ompt_thread_initial, __ompt_get_thread_data_internal());
3869     }
3870     ompt_data_t *task_data;
3871     __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
3872     if (ompt_enabled.ompt_callback_task_create) {
3873       ompt_callbacks.ompt_callback(ompt_callback_task_create)(
3874           NULL, NULL, task_data, ompt_task_initial, 0, NULL);
3875       // initial task has nothing to return to
3876     }
3877 
3878     ompt_set_thread_state(root_thread, ompt_state_work_serial);
3879   }
3880 #endif
3881 
3882   KMP_MB();
3883   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3884 
3885   return gtid;
3886 }
3887 
3888 #if KMP_NESTED_HOT_TEAMS
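// Note (added for clarity): recursively frees thr's hot team at 'level' and
// any deeper nested hot teams, returning the number of __kmp_threads entries
// released; the master at each level is not counted, since it is not freed
// here.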
3889 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3890                                 const int max_level) {
3891   int i, n, nth;
3892   kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3893   if (!hot_teams || !hot_teams[level].hot_team) {
3894     return 0;
3895   }
3896   KMP_DEBUG_ASSERT(level < max_level);
3897   kmp_team_t *team = hot_teams[level].hot_team;
3898   nth = hot_teams[level].hot_team_nth;
3899   n = nth - 1; // master is not freed
3900   if (level < max_level - 1) {
3901     for (i = 0; i < nth; ++i) {
3902       kmp_info_t *th = team->t.t_threads[i];
3903       n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3904       if (i > 0 && th->th.th_hot_teams) {
3905         __kmp_free(th->th.th_hot_teams);
3906         th->th.th_hot_teams = NULL;
3907       }
3908     }
3909   }
3910   __kmp_free_team(root, team, NULL);
3911   return n;
3912 }
3913 #endif
3914 
// Resets a root thread and clears its root and hot teams.
3916 // Returns the number of __kmp_threads entries directly and indirectly freed.
3917 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3918   kmp_team_t *root_team = root->r.r_root_team;
3919   kmp_team_t *hot_team = root->r.r_hot_team;
3920   int n = hot_team->t.t_nproc;
3921   int i;
3922 
3923   KMP_DEBUG_ASSERT(!root->r.r_active);
3924 
3925   root->r.r_root_team = NULL;
3926   root->r.r_hot_team = NULL;
3927   // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3928   // before call to __kmp_free_team().
3929   __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3930 #if KMP_NESTED_HOT_TEAMS
3931   if (__kmp_hot_teams_max_level >
3932       0) { // need to free nested hot teams and their threads if any
3933     for (i = 0; i < hot_team->t.t_nproc; ++i) {
3934       kmp_info_t *th = hot_team->t.t_threads[i];
3935       if (__kmp_hot_teams_max_level > 1) {
3936         n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3937       }
3938       if (th->th.th_hot_teams) {
3939         __kmp_free(th->th.th_hot_teams);
3940         th->th.th_hot_teams = NULL;
3941       }
3942     }
3943   }
3944 #endif
3945   __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3946 
3947   // Before we can reap the thread, we need to make certain that all other
3948   // threads in the teams that had this root as ancestor have stopped trying to
3949   // steal tasks.
3950   if (__kmp_tasking_mode != tskm_immediate_exec) {
3951     __kmp_wait_to_unref_task_teams();
3952   }
3953 
3954 #if KMP_OS_WINDOWS
3955   /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3956   KA_TRACE(
3957       10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3958            "\n",
3959            (LPVOID) & (root->r.r_uber_thread->th),
3960            root->r.r_uber_thread->th.th_info.ds.ds_thread));
3961   __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3962 #endif /* KMP_OS_WINDOWS */
3963 
3964 #if OMPT_SUPPORT
3965   if (ompt_enabled.ompt_callback_thread_end) {
3966     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3967         &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3968   }
3969 #endif
3970 
3971   TCW_4(__kmp_nth,
3972         __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3973   root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
3974   KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
3975                  " to %d\n",
3976                  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
3977                  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
3978 
3979   __kmp_reap_thread(root->r.r_uber_thread, 1);
3980 
  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
  // it instead of freeing it.
3983   root->r.r_uber_thread = NULL;
3984   /* mark root as no longer in use */
3985   root->r.r_begin = FALSE;
3986 
3987   return n;
3988 }
3989 
3990 void __kmp_unregister_root_current_thread(int gtid) {
3991   KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
  /* This lock should be OK, since unregister_root_current_thread is never
     called during an abort, only during a normal close. Furthermore, if you
     hold the forkjoin lock, you should never try to get the initz lock. */
3995   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3996   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
3997     KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
3998                   "exiting T#%d\n",
3999                   gtid));
4000     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4001     return;
4002   }
4003   kmp_root_t *root = __kmp_root[gtid];
4004 
4005   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4006   KMP_ASSERT(KMP_UBER_GTID(gtid));
4007   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4008   KMP_ASSERT(root->r.r_active == FALSE);
4009 
4010   KMP_MB();
4011 
4012 #if OMP_45_ENABLED
4013   kmp_info_t *thread = __kmp_threads[gtid];
4014   kmp_team_t *team = thread->th.th_team;
4015   kmp_task_team_t *task_team = thread->th.th_task_team;
4016 
4017   // we need to wait for the proxy tasks before finishing the thread
4018   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
4019 #if OMPT_SUPPORT
4020     // the runtime is shutting down so we won't report any events
4021     thread->th.ompt_thread_info.state = ompt_state_undefined;
4022 #endif
4023     __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4024   }
4025 #endif
4026 
4027   __kmp_reset_root(gtid, root);
4028 
4029   /* free up this thread slot */
4030   __kmp_gtid_set_specific(KMP_GTID_DNE);
4031 #ifdef KMP_TDATA_GTID
4032   __kmp_gtid = KMP_GTID_DNE;
4033 #endif
4034 
4035   KMP_MB();
4036   KC_TRACE(10,
4037            ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4038 
4039   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4040 }
4041 
4042 #if KMP_OS_WINDOWS
4043 /* __kmp_forkjoin_lock must be already held
4044    Unregisters a root thread that is not the current thread.  Returns the number
4045    of __kmp_threads entries freed as a result. */
4046 static int __kmp_unregister_root_other_thread(int gtid) {
4047   kmp_root_t *root = __kmp_root[gtid];
4048   int r;
4049 
4050   KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4051   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4052   KMP_ASSERT(KMP_UBER_GTID(gtid));
4053   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4054   KMP_ASSERT(root->r.r_active == FALSE);
4055 
4056   r = __kmp_reset_root(gtid, root);
4057   KC_TRACE(10,
4058            ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4059   return r;
4060 }
4061 #endif
4062 
4063 #if KMP_DEBUG
4064 void __kmp_task_info() {
4065 
4066   kmp_int32 gtid = __kmp_entry_gtid();
4067   kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4068   kmp_info_t *this_thr = __kmp_threads[gtid];
4069   kmp_team_t *steam = this_thr->th.th_serial_team;
4070   kmp_team_t *team = this_thr->th.th_team;
4071 
4072   __kmp_printf(
4073       "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4074       "ptask=%p\n",
4075       gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4076       team->t.t_implicit_task_taskdata[tid].td_parent);
4077 }
4078 #endif // KMP_DEBUG
4079 
4080 /* TODO optimize with one big memclr, take out what isn't needed, split
4081    responsibility to workers as much as possible, and delay initialization of
4082    features as much as possible  */
4083 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4084                                   int tid, int gtid) {
4085   /* this_thr->th.th_info.ds.ds_gtid is setup in
4086      kmp_allocate_thread/create_worker.
4087      this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4088   kmp_info_t *master = team->t.t_threads[0];
4089   KMP_DEBUG_ASSERT(this_thr != NULL);
4090   KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4091   KMP_DEBUG_ASSERT(team);
4092   KMP_DEBUG_ASSERT(team->t.t_threads);
4093   KMP_DEBUG_ASSERT(team->t.t_dispatch);
4094   KMP_DEBUG_ASSERT(master);
4095   KMP_DEBUG_ASSERT(master->th.th_root);
4096 
4097   KMP_MB();
4098 
4099   TCW_SYNC_PTR(this_thr->th.th_team, team);
4100 
4101   this_thr->th.th_info.ds.ds_tid = tid;
4102   this_thr->th.th_set_nproc = 0;
4103   if (__kmp_tasking_mode != tskm_immediate_exec)
4104     // When tasking is possible, threads are not safe to reap until they are
4105     // done tasking; this will be set when tasking code is exited in wait
4106     this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4107   else // no tasking --> always safe to reap
4108     this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4109 #if OMP_40_ENABLED
4110   this_thr->th.th_set_proc_bind = proc_bind_default;
4111 #if KMP_AFFINITY_SUPPORTED
4112   this_thr->th.th_new_place = this_thr->th.th_current_place;
4113 #endif
4114 #endif
4115   this_thr->th.th_root = master->th.th_root;
4116 
4117   /* setup the thread's cache of the team structure */
4118   this_thr->th.th_team_nproc = team->t.t_nproc;
4119   this_thr->th.th_team_master = master;
4120   this_thr->th.th_team_serialized = team->t.t_serialized;
4121   TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4122 
4123   KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4124 
4125   KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4126                 tid, gtid, this_thr, this_thr->th.th_current_task));
4127 
4128   __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4129                            team, tid, TRUE);
4130 
4131   KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4132                 tid, gtid, this_thr, this_thr->th.th_current_task));
4133   // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4134   // __kmp_initialize_team()?
4135 
4136   /* TODO no worksharing in speculative threads */
4137   this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4138 
4139   this_thr->th.th_local.this_construct = 0;
4140 
4141   if (!this_thr->th.th_pri_common) {
4142     this_thr->th.th_pri_common =
4143         (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4144     if (__kmp_storage_map) {
4145       __kmp_print_storage_map_gtid(
4146           gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4147           sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4148     }
4149     this_thr->th.th_pri_head = NULL;
4150   }
4151 
4152   if (this_thr != master && // Master's CG root is initialized elsewhere
4153       this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4154     // Make new thread's CG root same as master's
4155     KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4156     this_thr->th.th_cg_roots = master->th.th_cg_roots;
4157     // Increment new thread's CG root's counter to add the new thread
4158     this_thr->th.th_cg_roots->cg_nthreads++;
4159     KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4160                    " node %p of thread %p to %d\n",
4161                    this_thr, this_thr->th.th_cg_roots,
4162                    this_thr->th.th_cg_roots->cg_root,
4163                    this_thr->th.th_cg_roots->cg_nthreads));
4164     this_thr->th.th_current_task->td_icvs.thread_limit =
4165         this_thr->th.th_cg_roots->cg_thread_limit;
4166   }
4167 
4168   /* Initialize dynamic dispatch */
4169   {
4170     volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4171     // Use team max_nproc since this will never change for the team.
4172     size_t disp_size =
4173         sizeof(dispatch_private_info_t) *
4174         (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4175     KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4176                   team->t.t_max_nproc));
4177     KMP_ASSERT(dispatch);
4178     KMP_DEBUG_ASSERT(team->t.t_dispatch);
4179     KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4180 
4181     dispatch->th_disp_index = 0;
4182 #if OMP_45_ENABLED
4183     dispatch->th_doacross_buf_idx = 0;
4184 #endif
4185     if (!dispatch->th_disp_buffer) {
4186       dispatch->th_disp_buffer =
4187           (dispatch_private_info_t *)__kmp_allocate(disp_size);
4188 
4189       if (__kmp_storage_map) {
4190         __kmp_print_storage_map_gtid(
4191             gtid, &dispatch->th_disp_buffer[0],
4192             &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4193                                           ? 1
4194                                           : __kmp_dispatch_num_buffers],
4195             disp_size, "th_%d.th_dispatch.th_disp_buffer "
4196                        "(team_%d.t_dispatch[%d].th_disp_buffer)",
4197             gtid, team->t.t_id, gtid);
4198       }
4199     } else {
4200       memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4201     }
4202 
4203     dispatch->th_dispatch_pr_current = 0;
4204     dispatch->th_dispatch_sh_current = 0;
4205 
4206     dispatch->th_deo_fcn = 0; /* ORDERED     */
4207     dispatch->th_dxo_fcn = 0; /* END ORDERED */
4208   }
4209 
4210   this_thr->th.th_next_pool = NULL;
4211 
4212   if (!this_thr->th.th_task_state_memo_stack) {
4213     size_t i;
4214     this_thr->th.th_task_state_memo_stack =
4215         (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4216     this_thr->th.th_task_state_top = 0;
4217     this_thr->th.th_task_state_stack_sz = 4;
4218     for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4219          ++i) // zero init the stack
4220       this_thr->th.th_task_state_memo_stack[i] = 0;
4221   }
4222 
4223   KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4224   KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4225 
4226   KMP_MB();
4227 }
4228 
/* Allocate a new thread for the requesting team. This is only called from
   within a fork/join critical section. We will first try to get an available
   thread from the thread pool. If none is available, we will fork a new one,
   assuming we are able to create one; this should be assured, as the caller
   should have checked for that first. */
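/* Illustrative usage sketch (not part of the runtime): inside the fork/join
   critical section the hot-team growth path later in this file fills team
   slots one by one, roughly:
     for (int f = team->t.t_nproc; f < new_nproc; f++)
       team->t.t_threads[f] = __kmp_allocate_thread(root, team, f);
   after first checking that __kmp_threads_capacity can accommodate the
   growth. */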
4234 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4235                                   int new_tid) {
4236   kmp_team_t *serial_team;
4237   kmp_info_t *new_thr;
4238   int new_gtid;
4239 
4240   KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4241   KMP_DEBUG_ASSERT(root && team);
4242 #if !KMP_NESTED_HOT_TEAMS
4243   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4244 #endif
4245   KMP_MB();
4246 
4247   /* first, try to get one from the thread pool */
4248   if (__kmp_thread_pool) {
4249     new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4250     __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4251     if (new_thr == __kmp_thread_pool_insert_pt) {
4252       __kmp_thread_pool_insert_pt = NULL;
4253     }
4254     TCW_4(new_thr->th.th_in_pool, FALSE);
4255     // Don't touch th_active_in_pool or th_active.
4256     // The worker thread adjusts those flags as it sleeps/awakens.
4257     __kmp_thread_pool_nth--;
4258 
4259     KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4260                   __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4261     KMP_ASSERT(!new_thr->th.th_team);
4262     KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4263     KMP_DEBUG_ASSERT(__kmp_thread_pool_nth >= 0);
4264 
4265     /* setup the thread structure */
4266     __kmp_initialize_info(new_thr, team, new_tid,
4267                           new_thr->th.th_info.ds.ds_gtid);
4268     KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4269 
4270     TCW_4(__kmp_nth, __kmp_nth + 1);
4271 
4272     new_thr->th.th_task_state = 0;
4273     new_thr->th.th_task_state_top = 0;
4274     new_thr->th.th_task_state_stack_sz = 4;
4275 
4276 #ifdef KMP_ADJUST_BLOCKTIME
4277     /* Adjust blocktime back to zero if necessary */
4278     /* Middle initialization might not have occurred yet */
4279     if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4280       if (__kmp_nth > __kmp_avail_proc) {
4281         __kmp_zero_bt = TRUE;
4282       }
4283     }
4284 #endif /* KMP_ADJUST_BLOCKTIME */
4285 
4286 #if KMP_DEBUG
4287     // If thread entered pool via __kmp_free_thread, wait_flag should !=
4288     // KMP_BARRIER_PARENT_FLAG.
4289     int b;
4290     kmp_balign_t *balign = new_thr->th.th_bar;
4291     for (b = 0; b < bs_last_barrier; ++b)
4292       KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4293 #endif
4294 
4295     KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4296                   __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4297 
4298     KMP_MB();
4299     return new_thr;
4300   }
4301 
  /* no, we'll fork a new one */
4303   KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4304   KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4305 
4306 #if KMP_USE_MONITOR
4307   // If this is the first worker thread the RTL is creating, then also
4308   // launch the monitor thread.  We try to do this as early as possible.
4309   if (!TCR_4(__kmp_init_monitor)) {
4310     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4311     if (!TCR_4(__kmp_init_monitor)) {
4312       KF_TRACE(10, ("before __kmp_create_monitor\n"));
4313       TCW_4(__kmp_init_monitor, 1);
4314       __kmp_create_monitor(&__kmp_monitor);
4315       KF_TRACE(10, ("after __kmp_create_monitor\n"));
4316 #if KMP_OS_WINDOWS
      // AC: wait until the monitor has started. This is a fix for CQ232808.
      // The reason is that if the library is loaded/unloaded in a loop with
      // small (parallel) work in between, then there is a high probability
      // that the monitor thread starts only after the library shutdown has
      // begun. At shutdown it is too late to cope with the problem, because
      // when the master is in DllMain (process detach) the monitor has no
      // chance to start (it is blocked), and the master has no means to
      // inform the monitor that the library has gone, because all the memory
      // which the monitor can access is going to be released/reset.
4326       while (TCR_4(__kmp_init_monitor) < 2) {
4327         KMP_YIELD(TRUE);
4328       }
4329       KF_TRACE(10, ("after monitor thread has started\n"));
4330 #endif
4331     }
4332     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4333   }
4334 #endif
4335 
4336   KMP_MB();
4337   for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) {
4338     KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4339   }
4340 
4341   /* allocate space for it. */
4342   new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4343 
4344   TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4345 
4346   if (__kmp_storage_map) {
4347     __kmp_print_thread_storage_map(new_thr, new_gtid);
4348   }
4349 
4350   // add the reserve serialized team, initialized from the team's master thread
4351   {
4352     kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4353     KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4354     new_thr->th.th_serial_team = serial_team =
4355         (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4356 #if OMPT_SUPPORT
4357                                           ompt_data_none, // root parallel id
4358 #endif
4359 #if OMP_40_ENABLED
4360                                           proc_bind_default,
4361 #endif
4362                                           &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
4363   }
4364   KMP_ASSERT(serial_team);
  serial_team->t.t_serialized = 0; // AC: the team is created in reserve, not
  // for execution (it is unused for now).
4367   serial_team->t.t_threads[0] = new_thr;
4368   KF_TRACE(10,
4369            ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4370             new_thr));
4371 
4372   /* setup the thread structures */
4373   __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4374 
4375 #if USE_FAST_MEMORY
4376   __kmp_initialize_fast_memory(new_thr);
4377 #endif /* USE_FAST_MEMORY */
4378 
4379 #if KMP_USE_BGET
4380   KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4381   __kmp_initialize_bget(new_thr);
4382 #endif
4383 
4384   __kmp_init_random(new_thr); // Initialize random number generator
4385 
4386   /* Initialize these only once when thread is grabbed for a team allocation */
4387   KA_TRACE(20,
4388            ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4389             __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4390 
4391   int b;
4392   kmp_balign_t *balign = new_thr->th.th_bar;
4393   for (b = 0; b < bs_last_barrier; ++b) {
4394     balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4395     balign[b].bb.team = NULL;
4396     balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4397     balign[b].bb.use_oncore_barrier = 0;
4398   }
4399 
4400   new_thr->th.th_spin_here = FALSE;
4401   new_thr->th.th_next_waiting = 0;
4402 #if KMP_OS_UNIX
4403   new_thr->th.th_blocking = false;
4404 #endif
4405 
4406 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4407   new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4408   new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4409   new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4410   new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4411 #endif
4412 #if OMP_50_ENABLED
4413   new_thr->th.th_def_allocator = __kmp_def_allocator;
4414   new_thr->th.th_prev_level = 0;
4415   new_thr->th.th_prev_num_threads = 1;
4416 #endif
4417 
4418   TCW_4(new_thr->th.th_in_pool, FALSE);
4419   new_thr->th.th_active_in_pool = FALSE;
4420   TCW_4(new_thr->th.th_active, TRUE);
4421 
4422   /* adjust the global counters */
4423   __kmp_all_nth++;
4424   __kmp_nth++;
4425 
4426   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4427   // numbers of procs, and method #2 (keyed API call) for higher numbers.
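  // Note: this is re-checked each time a brand new worker is forked, so the
  // mode can move in either direction as __kmp_all_nth crosses
  // __kmp_tls_gtid_min.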
4428   if (__kmp_adjust_gtid_mode) {
4429     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4430       if (TCR_4(__kmp_gtid_mode) != 2) {
4431         TCW_4(__kmp_gtid_mode, 2);
4432       }
4433     } else {
4434       if (TCR_4(__kmp_gtid_mode) != 1) {
4435         TCW_4(__kmp_gtid_mode, 1);
4436       }
4437     }
4438   }
4439 
4440 #ifdef KMP_ADJUST_BLOCKTIME
4441   /* Adjust blocktime back to zero if necessary       */
4442   /* Middle initialization might not have occurred yet */
4443   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4444     if (__kmp_nth > __kmp_avail_proc) {
4445       __kmp_zero_bt = TRUE;
4446     }
4447   }
4448 #endif /* KMP_ADJUST_BLOCKTIME */
4449 
4450   /* actually fork it and create the new worker thread */
4451   KF_TRACE(
4452       10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4453   __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4454   KF_TRACE(10,
4455            ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4456 
4457   KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4458                 new_gtid));
4459   KMP_MB();
4460   return new_thr;
4461 }
4462 
/* Reinitialize team for reuse.
   The hot team code calls this routine at every fork barrier, so EPCC barrier
   tests are extremely sensitive to changes in it, especially writes to the
   team struct, which cause a cache invalidation in all threads.
   IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4468 static void __kmp_reinitialize_team(kmp_team_t *team,
4469                                     kmp_internal_control_t *new_icvs,
4470                                     ident_t *loc) {
4471   KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4472                 team->t.t_threads[0], team));
4473   KMP_DEBUG_ASSERT(team && new_icvs);
4474   KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4475   KMP_CHECK_UPDATE(team->t.t_ident, loc);
4476 
4477   KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4478   // Copy ICVs to the master thread's implicit taskdata
4479   __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4480   copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4481 
4482   KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4483                 team->t.t_threads[0], team));
4484 }
4485 
4486 /* Initialize the team data structure.
4487    This assumes the t_threads and t_max_nproc are already set.
4488    Also, we don't touch the arguments */
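/* Note: this resets per-team bookkeeping (thread count, serialization flag,
   construct and ordered counters, etc.) and then delegates ICV copying and
   the master's implicit-task setup to __kmp_reinitialize_team(). */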
4489 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4490                                   kmp_internal_control_t *new_icvs,
4491                                   ident_t *loc) {
4492   KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4493 
4494   /* verify */
4495   KMP_DEBUG_ASSERT(team);
4496   KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4497   KMP_DEBUG_ASSERT(team->t.t_threads);
4498   KMP_MB();
4499 
4500   team->t.t_master_tid = 0; /* not needed */
4501   /* team->t.t_master_bar;        not needed */
4502   team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4503   team->t.t_nproc = new_nproc;
4504 
4505   /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
4506   team->t.t_next_pool = NULL;
4507   /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4508    * up hot team */
4509 
4510   TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4511   team->t.t_invoke = NULL; /* not needed */
4512 
4513   // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
4514   team->t.t_sched.sched = new_icvs->sched.sched;
4515 
4516 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4517   team->t.t_fp_control_saved = FALSE; /* not needed */
4518   team->t.t_x87_fpu_control_word = 0; /* not needed */
4519   team->t.t_mxcsr = 0; /* not needed */
4520 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4521 
4522   team->t.t_construct = 0;
4523 
4524   team->t.t_ordered.dt.t_value = 0;
4525   team->t.t_master_active = FALSE;
4526 
4527   memset(&team->t.t_taskq, '\0', sizeof(kmp_taskq_t));
4528 
4529 #ifdef KMP_DEBUG
4530   team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4531 #endif
4532 #if KMP_OS_WINDOWS
4533   team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4534 #endif
4535 
4536   team->t.t_control_stack_top = NULL;
4537 
4538   __kmp_reinitialize_team(team, new_icvs, loc);
4539 
4540   KMP_MB();
4541   KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4542 }
4543 
4544 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
/* Sets the full affinity mask for the calling thread and returns the old mask
   via old_mask (if non-NULL); makes no changes to affinity data structures. */
4546 static void
4547 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4548   if (KMP_AFFINITY_CAPABLE()) {
4549     int status;
4550     if (old_mask != NULL) {
4551       status = __kmp_get_system_affinity(old_mask, TRUE);
4552       int error = errno;
4553       if (status != 0) {
4554         __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4555                     __kmp_msg_null);
4556       }
4557     }
4558     __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4559   }
4560 }
4561 #endif
4562 
4563 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4564 
// __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
// It calculates the worker and master threads' partitions based upon the
// parent thread's partition, and binds each worker to a place in its
// partition. The master thread's partition should already include its
// current binding.
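// Illustrative example (assumed values): with the master's partition [2,5]
// (4 places), masters_place == 2 and a team of 4 threads, proc_bind_close
// leaves the master on place 2 and assigns the workers to places 3, 4 and 5,
// each keeping the full [2,5] partition; proc_bind_spread (when the partition
// does not span all __kmp_affinity_num_masks places) instead narrows each
// thread's partition to a single place: [2,2], [3,3], [4,4] and [5,5].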
4569 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
  // Copy the master thread's place partition to the team struct
4571   kmp_info_t *master_th = team->t.t_threads[0];
4572   KMP_DEBUG_ASSERT(master_th != NULL);
4573   kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4574   int first_place = master_th->th.th_first_place;
4575   int last_place = master_th->th.th_last_place;
4576   int masters_place = master_th->th.th_current_place;
4577   team->t.t_first_place = first_place;
4578   team->t.t_last_place = last_place;
4579 
4580   KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4581                 "bound to place %d partition = [%d,%d]\n",
4582                 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4583                 team->t.t_id, masters_place, first_place, last_place));
4584 
4585   switch (proc_bind) {
4586 
4587   case proc_bind_default:
    // Serial teams might have the proc_bind policy set to proc_bind_default.
    // It doesn't matter, as we don't rebind the master thread for any
    // proc_bind policy.
4590     KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4591     break;
4592 
4593   case proc_bind_master: {
4594     int f;
4595     int n_th = team->t.t_nproc;
4596     for (f = 1; f < n_th; f++) {
4597       kmp_info_t *th = team->t.t_threads[f];
4598       KMP_DEBUG_ASSERT(th != NULL);
4599       th->th.th_first_place = first_place;
4600       th->th.th_last_place = last_place;
4601       th->th.th_new_place = masters_place;
4602 #if OMP_50_ENABLED
4603       if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4604           team->t.t_display_affinity != 1) {
4605         team->t.t_display_affinity = 1;
4606       }
4607 #endif
4608 
4609       KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
4610                      "partition = [%d,%d]\n",
4611                      __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4612                      f, masters_place, first_place, last_place));
4613     }
4614   } break;
4615 
4616   case proc_bind_close: {
4617     int f;
4618     int n_th = team->t.t_nproc;
4619     int n_places;
4620     if (first_place <= last_place) {
4621       n_places = last_place - first_place + 1;
4622     } else {
4623       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4624     }
4625     if (n_th <= n_places) {
4626       int place = masters_place;
4627       for (f = 1; f < n_th; f++) {
4628         kmp_info_t *th = team->t.t_threads[f];
4629         KMP_DEBUG_ASSERT(th != NULL);
4630 
4631         if (place == last_place) {
4632           place = first_place;
4633         } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4634           place = 0;
4635         } else {
4636           place++;
4637         }
4638         th->th.th_first_place = first_place;
4639         th->th.th_last_place = last_place;
4640         th->th.th_new_place = place;
4641 #if OMP_50_ENABLED
4642         if (__kmp_display_affinity && place != th->th.th_current_place &&
4643             team->t.t_display_affinity != 1) {
4644           team->t.t_display_affinity = 1;
4645         }
4646 #endif
4647 
4648         KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4649                        "partition = [%d,%d]\n",
4650                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4651                        team->t.t_id, f, place, first_place, last_place));
4652       }
4653     } else {
4654       int S, rem, gap, s_count;
4655       S = n_th / n_places;
4656       s_count = 0;
4657       rem = n_th - (S * n_places);
4658       gap = rem > 0 ? n_places / rem : n_places;
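      // Worked example: n_th = 10 threads over n_places = 4 places gives
      // S = 2, rem = 2, gap = 2, so two of the places receive S + 1 = 3
      // threads and the other two receive S = 2 threads (3 + 2 + 3 + 2 = 10).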
4659       int place = masters_place;
4660       int gap_ct = gap;
4661       for (f = 0; f < n_th; f++) {
4662         kmp_info_t *th = team->t.t_threads[f];
4663         KMP_DEBUG_ASSERT(th != NULL);
4664 
4665         th->th.th_first_place = first_place;
4666         th->th.th_last_place = last_place;
4667         th->th.th_new_place = place;
4668 #if OMP_50_ENABLED
4669         if (__kmp_display_affinity && place != th->th.th_current_place &&
4670             team->t.t_display_affinity != 1) {
4671           team->t.t_display_affinity = 1;
4672         }
4673 #endif
4674         s_count++;
4675 
4676         if ((s_count == S) && rem && (gap_ct == gap)) {
4677           // do nothing, add an extra thread to place on next iteration
4678         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4679           // we added an extra thread to this place; move to next place
4680           if (place == last_place) {
4681             place = first_place;
4682           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4683             place = 0;
4684           } else {
4685             place++;
4686           }
4687           s_count = 0;
4688           gap_ct = 1;
4689           rem--;
4690         } else if (s_count == S) { // place full; don't add extra
4691           if (place == last_place) {
4692             place = first_place;
4693           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4694             place = 0;
4695           } else {
4696             place++;
4697           }
4698           gap_ct++;
4699           s_count = 0;
4700         }
4701 
4702         KA_TRACE(100,
4703                  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4704                   "partition = [%d,%d]\n",
4705                   __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4706                   th->th.th_new_place, first_place, last_place));
4707       }
4708       KMP_DEBUG_ASSERT(place == masters_place);
4709     }
4710   } break;
4711 
4712   case proc_bind_spread: {
4713     int f;
4714     int n_th = team->t.t_nproc;
4715     int n_places;
4716     int thidx;
4717     if (first_place <= last_place) {
4718       n_places = last_place - first_place + 1;
4719     } else {
4720       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4721     }
4722     if (n_th <= n_places) {
4723       int place = -1;
4724 
4725       if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4726         int S = n_places / n_th;
4727         int s_count, rem, gap, gap_ct;
4728 
4729         place = masters_place;
4730         rem = n_places - n_th * S;
4731         gap = rem ? n_th / rem : 1;
4732         gap_ct = gap;
4733         thidx = n_th;
4734         if (update_master_only == 1)
4735           thidx = 1;
4736         for (f = 0; f < thidx; f++) {
4737           kmp_info_t *th = team->t.t_threads[f];
4738           KMP_DEBUG_ASSERT(th != NULL);
4739 
4740           th->th.th_first_place = place;
4741           th->th.th_new_place = place;
4742 #if OMP_50_ENABLED
4743           if (__kmp_display_affinity && place != th->th.th_current_place &&
4744               team->t.t_display_affinity != 1) {
4745             team->t.t_display_affinity = 1;
4746           }
4747 #endif
4748           s_count = 1;
4749           while (s_count < S) {
4750             if (place == last_place) {
4751               place = first_place;
4752             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4753               place = 0;
4754             } else {
4755               place++;
4756             }
4757             s_count++;
4758           }
4759           if (rem && (gap_ct == gap)) {
4760             if (place == last_place) {
4761               place = first_place;
4762             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4763               place = 0;
4764             } else {
4765               place++;
4766             }
4767             rem--;
4768             gap_ct = 0;
4769           }
4770           th->th.th_last_place = place;
4771           gap_ct++;
4772 
4773           if (place == last_place) {
4774             place = first_place;
4775           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4776             place = 0;
4777           } else {
4778             place++;
4779           }
4780 
4781           KA_TRACE(100,
4782                    ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4783                     "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4784                     __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4785                     f, th->th.th_new_place, th->th.th_first_place,
4786                     th->th.th_last_place, __kmp_affinity_num_masks));
4787         }
4788       } else {
        /* Given a uniform space of available computation places, we can create
           T partitions of round(P/T) size and put each thread into the first
           place of its partition. */
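        /* Worked example (illustrative): n_places = 8, n_th = 3 and
           masters_place = 0 give spacing = (8 + 1) / 3 = 3.0, so the three
           threads receive the partitions [0,2], [3,5] and [6,7] and are
           bound to places 0, 3 and 6 respectively. */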
4792         double current = static_cast<double>(masters_place);
4793         double spacing =
4794             (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
4795         int first, last;
4796         kmp_info_t *th;
4797 
4798         thidx = n_th + 1;
4799         if (update_master_only == 1)
4800           thidx = 1;
4801         for (f = 0; f < thidx; f++) {
4802           first = static_cast<int>(current);
4803           last = static_cast<int>(current + spacing) - 1;
4804           KMP_DEBUG_ASSERT(last >= first);
4805           if (first >= n_places) {
4806             if (masters_place) {
4807               first -= n_places;
4808               last -= n_places;
4809               if (first == (masters_place + 1)) {
4810                 KMP_DEBUG_ASSERT(f == n_th);
4811                 first--;
4812               }
4813               if (last == masters_place) {
4814                 KMP_DEBUG_ASSERT(f == (n_th - 1));
4815                 last--;
4816               }
4817             } else {
4818               KMP_DEBUG_ASSERT(f == n_th);
4819               first = 0;
4820               last = 0;
4821             }
4822           }
4823           if (last >= n_places) {
4824             last = (n_places - 1);
4825           }
4826           place = first;
4827           current += spacing;
4828           if (f < n_th) {
4829             KMP_DEBUG_ASSERT(0 <= first);
4830             KMP_DEBUG_ASSERT(n_places > first);
4831             KMP_DEBUG_ASSERT(0 <= last);
4832             KMP_DEBUG_ASSERT(n_places > last);
4833             KMP_DEBUG_ASSERT(last_place >= first_place);
4834             th = team->t.t_threads[f];
4835             KMP_DEBUG_ASSERT(th);
4836             th->th.th_first_place = first;
4837             th->th.th_new_place = place;
4838             th->th.th_last_place = last;
4839 #if OMP_50_ENABLED
4840             if (__kmp_display_affinity && place != th->th.th_current_place &&
4841                 team->t.t_display_affinity != 1) {
4842               team->t.t_display_affinity = 1;
4843             }
4844 #endif
4845             KA_TRACE(100,
4846                      ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4847                       "partition = [%d,%d], spacing = %.4f\n",
4848                       __kmp_gtid_from_thread(team->t.t_threads[f]),
4849                       team->t.t_id, f, th->th.th_new_place,
4850                       th->th.th_first_place, th->th.th_last_place, spacing));
4851           }
4852         }
4853       }
4854       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4855     } else {
4856       int S, rem, gap, s_count;
4857       S = n_th / n_places;
4858       s_count = 0;
4859       rem = n_th - (S * n_places);
4860       gap = rem > 0 ? n_places / rem : n_places;
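      // Same distribution arithmetic as the oversubscribed proc_bind_close
      // case above, except that each thread's partition is narrowed to its
      // single assigned place (th_first_place == th_last_place == place).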
4861       int place = masters_place;
4862       int gap_ct = gap;
4863       thidx = n_th;
4864       if (update_master_only == 1)
4865         thidx = 1;
4866       for (f = 0; f < thidx; f++) {
4867         kmp_info_t *th = team->t.t_threads[f];
4868         KMP_DEBUG_ASSERT(th != NULL);
4869 
4870         th->th.th_first_place = place;
4871         th->th.th_last_place = place;
4872         th->th.th_new_place = place;
4873 #if OMP_50_ENABLED
4874         if (__kmp_display_affinity && place != th->th.th_current_place &&
4875             team->t.t_display_affinity != 1) {
4876           team->t.t_display_affinity = 1;
4877         }
4878 #endif
4879         s_count++;
4880 
4881         if ((s_count == S) && rem && (gap_ct == gap)) {
4882           // do nothing, add an extra thread to place on next iteration
4883         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4884           // we added an extra thread to this place; move on to next place
4885           if (place == last_place) {
4886             place = first_place;
4887           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4888             place = 0;
4889           } else {
4890             place++;
4891           }
4892           s_count = 0;
4893           gap_ct = 1;
4894           rem--;
4895         } else if (s_count == S) { // place is full; don't add extra thread
4896           if (place == last_place) {
4897             place = first_place;
4898           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4899             place = 0;
4900           } else {
4901             place++;
4902           }
4903           gap_ct++;
4904           s_count = 0;
4905         }
4906 
4907         KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4908                        "partition = [%d,%d]\n",
4909                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4910                        team->t.t_id, f, th->th.th_new_place,
4911                        th->th.th_first_place, th->th.th_last_place));
4912       }
4913       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4914     }
4915   } break;
4916 
4917   default:
4918     break;
4919   }
4920 
4921   KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4922 }
4923 
4924 #endif /* OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED */
4925 
/* Allocate a new team data structure to use. Take one off of the free pool if
   available. */
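/* There are three paths through this routine: reuse of the root's (or a
   nested) "hot" team, reuse of a sufficiently large team taken from
   __kmp_team_pool, and allocation of a brand new team as a last resort. */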
4928 kmp_team_t *
4929 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4930 #if OMPT_SUPPORT
4931                     ompt_data_t ompt_parallel_data,
4932 #endif
4933 #if OMP_40_ENABLED
4934                     kmp_proc_bind_t new_proc_bind,
4935 #endif
4936                     kmp_internal_control_t *new_icvs,
4937                     int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4938   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4939   int f;
4940   kmp_team_t *team;
4941   int use_hot_team = !root->r.r_active;
4942   int level = 0;
4943 
4944   KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4945   KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4946   KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4947   KMP_MB();
4948 
4949 #if KMP_NESTED_HOT_TEAMS
4950   kmp_hot_team_ptr_t *hot_teams;
4951   if (master) {
4952     team = master->th.th_team;
4953     level = team->t.t_active_level;
4954     if (master->th.th_teams_microtask) { // in teams construct?
4955       if (master->th.th_teams_size.nteams > 1 &&
4956           ( // #teams > 1
4957               team->t.t_pkfn ==
4958                   (microtask_t)__kmp_teams_master || // inner fork of the teams
4959               master->th.th_teams_level <
4960                   team->t.t_level)) { // or nested parallel inside the teams
        ++level; // don't increment if #teams==1 or for the outer fork of the
        // teams; increment otherwise
4963       }
4964     }
4965     hot_teams = master->th.th_hot_teams;
4966     if (level < __kmp_hot_teams_max_level && hot_teams &&
4967         hot_teams[level]
4968             .hot_team) { // hot team has already been allocated for given level
4969       use_hot_team = 1;
4970     } else {
4971       use_hot_team = 0;
4972     }
4973   }
4974 #endif
4975   // Optimization to use a "hot" team
4976   if (use_hot_team && new_nproc > 1) {
4977     KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
4978 #if KMP_NESTED_HOT_TEAMS
4979     team = hot_teams[level].hot_team;
4980 #else
4981     team = root->r.r_hot_team;
4982 #endif
4983 #if KMP_DEBUG
4984     if (__kmp_tasking_mode != tskm_immediate_exec) {
4985       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
4986                     "task_team[1] = %p before reinit\n",
4987                     team->t.t_task_team[0], team->t.t_task_team[1]));
4988     }
4989 #endif
4990 
4991     // Has the number of threads changed?
4992     /* Let's assume the most common case is that the number of threads is
4993        unchanged, and put that case first. */
4994     if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
4995       KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
4996       // This case can mean that omp_set_num_threads() was called and the hot
4997       // team size was already reduced, so we check the special flag
4998       if (team->t.t_size_changed == -1) {
4999         team->t.t_size_changed = 1;
5000       } else {
5001         KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5002       }
5003 
5004       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5005       kmp_r_sched_t new_sched = new_icvs->sched;
5006       // set master's schedule as new run-time schedule
5007       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5008 
5009       __kmp_reinitialize_team(team, new_icvs,
5010                               root->r.r_uber_thread->th.th_ident);
5011 
5012       KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5013                     team->t.t_threads[0], team));
5014       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5015 
5016 #if OMP_40_ENABLED
5017 #if KMP_AFFINITY_SUPPORTED
5018       if ((team->t.t_size_changed == 0) &&
5019           (team->t.t_proc_bind == new_proc_bind)) {
5020         if (new_proc_bind == proc_bind_spread) {
5021           __kmp_partition_places(
5022               team, 1); // add flag to update only master for spread
5023         }
5024         KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5025                        "proc_bind = %d, partition = [%d,%d]\n",
5026                        team->t.t_id, new_proc_bind, team->t.t_first_place,
5027                        team->t.t_last_place));
5028       } else {
5029         KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5030         __kmp_partition_places(team);
5031       }
5032 #else
5033       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5034 #endif /* KMP_AFFINITY_SUPPORTED */
5035 #endif /* OMP_40_ENABLED */
5036     } else if (team->t.t_nproc > new_nproc) {
5037       KA_TRACE(20,
5038                ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5039                 new_nproc));
5040 
5041       team->t.t_size_changed = 1;
5042 #if KMP_NESTED_HOT_TEAMS
5043       if (__kmp_hot_teams_mode == 0) {
        // AC: the saved number of threads should correspond to the team's
        // value in this mode; it can be bigger in mode 1, when the hot team
        // has threads in reserve.
5046         KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5047         hot_teams[level].hot_team_nth = new_nproc;
5048 #endif // KMP_NESTED_HOT_TEAMS
5049         /* release the extra threads we don't need any more */
5050         for (f = new_nproc; f < team->t.t_nproc; f++) {
5051           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5052           if (__kmp_tasking_mode != tskm_immediate_exec) {
5053             // When decreasing team size, threads no longer in the team should
5054             // unref task team.
5055             team->t.t_threads[f]->th.th_task_team = NULL;
5056           }
5057           __kmp_free_thread(team->t.t_threads[f]);
5058           team->t.t_threads[f] = NULL;
5059         }
5060 #if KMP_NESTED_HOT_TEAMS
5061       } // (__kmp_hot_teams_mode == 0)
5062       else {
5063         // When keeping extra threads in team, switch threads to wait on own
5064         // b_go flag
5065         for (f = new_nproc; f < team->t.t_nproc; ++f) {
5066           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5067           kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5068           for (int b = 0; b < bs_last_barrier; ++b) {
5069             if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5070               balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5071             }
5072             KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5073           }
5074         }
5075       }
5076 #endif // KMP_NESTED_HOT_TEAMS
5077       team->t.t_nproc = new_nproc;
5078       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5079       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5080       __kmp_reinitialize_team(team, new_icvs,
5081                               root->r.r_uber_thread->th.th_ident);
5082 
5083       // Update remaining threads
5084       for (f = 0; f < new_nproc; ++f) {
5085         team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5086       }
5087 
5088       // restore the current task state of the master thread: should be the
5089       // implicit task
5090       KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5091                     team->t.t_threads[0], team));
5092 
5093       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5094 
5095 #ifdef KMP_DEBUG
5096       for (f = 0; f < team->t.t_nproc; f++) {
5097         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5098                          team->t.t_threads[f]->th.th_team_nproc ==
5099                              team->t.t_nproc);
5100       }
5101 #endif
5102 
5103 #if OMP_40_ENABLED
5104       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5105 #if KMP_AFFINITY_SUPPORTED
5106       __kmp_partition_places(team);
5107 #endif
5108 #endif
5109     } else { // team->t.t_nproc < new_nproc
5110 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5111       kmp_affin_mask_t *old_mask;
5112       if (KMP_AFFINITY_CAPABLE()) {
5113         KMP_CPU_ALLOC(old_mask);
5114       }
5115 #endif
5116 
5117       KA_TRACE(20,
5118                ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5119                 new_nproc));
5120 
5121       team->t.t_size_changed = 1;
5122 
5123 #if KMP_NESTED_HOT_TEAMS
5124       int avail_threads = hot_teams[level].hot_team_nth;
5125       if (new_nproc < avail_threads)
5126         avail_threads = new_nproc;
5127       kmp_info_t **other_threads = team->t.t_threads;
5128       for (f = team->t.t_nproc; f < avail_threads; ++f) {
5129         // Adjust barrier data of reserved threads (if any) of the team
5130         // Other data will be set in __kmp_initialize_info() below.
5131         int b;
5132         kmp_balign_t *balign = other_threads[f]->th.th_bar;
5133         for (b = 0; b < bs_last_barrier; ++b) {
5134           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5135           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5136 #if USE_DEBUGGER
5137           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5138 #endif
5139         }
5140       }
5141       if (hot_teams[level].hot_team_nth >= new_nproc) {
        // We have all needed threads in reserve, so there is no need to
        // allocate any. This is only possible in mode 1; we cannot have
        // reserved threads in mode 0.
5144         KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5145         team->t.t_nproc = new_nproc; // just get reserved threads involved
5146       } else {
5147         // we may have some threads in reserve, but not enough
5148         team->t.t_nproc =
5149             hot_teams[level]
5150                 .hot_team_nth; // get reserved threads involved if any
5151         hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5152 #endif // KMP_NESTED_HOT_TEAMS
5153         if (team->t.t_max_nproc < new_nproc) {
5154           /* reallocate larger arrays */
5155           __kmp_reallocate_team_arrays(team, new_nproc);
5156           __kmp_reinitialize_team(team, new_icvs, NULL);
5157         }
5158 
5159 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
        /* Temporarily set the full mask for the master thread before creating
           the workers. The reason is that workers inherit the affinity from
           the master, so if a lot of workers are created on a single core
           quickly, they don't get a chance to set their own affinity for a
           long time. */
5164         __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5165 #endif
5166 
5167         /* allocate new threads for the hot team */
5168         for (f = team->t.t_nproc; f < new_nproc; f++) {
5169           kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5170           KMP_DEBUG_ASSERT(new_worker);
5171           team->t.t_threads[f] = new_worker;
5172 
5173           KA_TRACE(20,
5174                    ("__kmp_allocate_team: team %d init T#%d arrived: "
5175                     "join=%llu, plain=%llu\n",
5176                     team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5177                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5178                     team->t.t_bar[bs_plain_barrier].b_arrived));
5179 
5180           { // Initialize barrier data for new threads.
5181             int b;
5182             kmp_balign_t *balign = new_worker->th.th_bar;
5183             for (b = 0; b < bs_last_barrier; ++b) {
5184               balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5185               KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5186                                KMP_BARRIER_PARENT_FLAG);
5187 #if USE_DEBUGGER
5188               balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5189 #endif
5190             }
5191           }
5192         }
5193 
5194 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5195         if (KMP_AFFINITY_CAPABLE()) {
5196           /* Restore initial master thread's affinity mask */
5197           __kmp_set_system_affinity(old_mask, TRUE);
5198           KMP_CPU_FREE(old_mask);
5199         }
5200 #endif
5201 #if KMP_NESTED_HOT_TEAMS
5202       } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5203 #endif // KMP_NESTED_HOT_TEAMS
      /* make sure everyone is synchronized */
5205       int old_nproc = team->t.t_nproc; // save old value and use to update only
5206       // new threads below
5207       __kmp_initialize_team(team, new_nproc, new_icvs,
5208                             root->r.r_uber_thread->th.th_ident);
5209 
5210       /* reinitialize the threads */
5211       KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5212       for (f = 0; f < team->t.t_nproc; ++f)
5213         __kmp_initialize_info(team->t.t_threads[f], team, f,
5214                               __kmp_gtid_from_tid(f, team));
5215 
5216       if (level) { // set th_task_state for new threads in nested hot team
5217         // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5218         // only need to set the th_task_state for the new threads. th_task_state
5219         // for master thread will not be accurate until after this in
5220         // __kmp_fork_call(), so we look to the master's memo_stack to get the
5221         // correct value.
5222         for (f = old_nproc; f < team->t.t_nproc; ++f)
5223           team->t.t_threads[f]->th.th_task_state =
5224               team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5225       } else { // set th_task_state for new threads in non-nested hot team
5226         int old_state =
5227             team->t.t_threads[0]->th.th_task_state; // copy master's state
5228         for (f = old_nproc; f < team->t.t_nproc; ++f)
5229           team->t.t_threads[f]->th.th_task_state = old_state;
5230       }
5231 
5232 #ifdef KMP_DEBUG
5233       for (f = 0; f < team->t.t_nproc; ++f) {
5234         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5235                          team->t.t_threads[f]->th.th_team_nproc ==
5236                              team->t.t_nproc);
5237       }
5238 #endif
5239 
5240 #if OMP_40_ENABLED
5241       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5242 #if KMP_AFFINITY_SUPPORTED
5243       __kmp_partition_places(team);
5244 #endif
5245 #endif
5246     } // Check changes in number of threads
5247 
5248 #if OMP_40_ENABLED
5249     kmp_info_t *master = team->t.t_threads[0];
5250     if (master->th.th_teams_microtask) {
5251       for (f = 1; f < new_nproc; ++f) {
5252         // propagate teams construct specific info to workers
5253         kmp_info_t *thr = team->t.t_threads[f];
5254         thr->th.th_teams_microtask = master->th.th_teams_microtask;
5255         thr->th.th_teams_level = master->th.th_teams_level;
5256         thr->th.th_teams_size = master->th.th_teams_size;
5257       }
5258     }
5259 #endif /* OMP_40_ENABLED */
5260 #if KMP_NESTED_HOT_TEAMS
5261     if (level) {
5262       // Sync barrier state for nested hot teams, not needed for outermost hot
5263       // team.
5264       for (f = 1; f < new_nproc; ++f) {
5265         kmp_info_t *thr = team->t.t_threads[f];
5266         int b;
5267         kmp_balign_t *balign = thr->th.th_bar;
5268         for (b = 0; b < bs_last_barrier; ++b) {
5269           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5270           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5271 #if USE_DEBUGGER
5272           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5273 #endif
5274         }
5275       }
5276     }
5277 #endif // KMP_NESTED_HOT_TEAMS
5278 
5279     /* reallocate space for arguments if necessary */
5280     __kmp_alloc_argv_entries(argc, team, TRUE);
5281     KMP_CHECK_UPDATE(team->t.t_argc, argc);
5282     // The hot team re-uses the previous task team,
5283     // if untouched during the previous release->gather phase.
5284 
5285     KF_TRACE(10, (" hot_team = %p\n", team));
5286 
5287 #if KMP_DEBUG
5288     if (__kmp_tasking_mode != tskm_immediate_exec) {
5289       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5290                     "task_team[1] = %p after reinit\n",
5291                     team->t.t_task_team[0], team->t.t_task_team[1]));
5292     }
5293 #endif
5294 
5295 #if OMPT_SUPPORT
5296     __ompt_team_assign_id(team, ompt_parallel_data);
5297 #endif
5298 
5299     KMP_MB();
5300 
5301     return team;
5302   }
5303 
5304   /* next, let's try to take one from the team pool */
5305   KMP_MB();
5306   for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5307     /* TODO: consider resizing undersized teams instead of reaping them, now
5308        that we have a resizing mechanism */
5309     if (team->t.t_max_nproc >= max_nproc) {
5310       /* take this team from the team pool */
5311       __kmp_team_pool = team->t.t_next_pool;
5312 
5313       /* setup the team for fresh use */
5314       __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5315 
5316       KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5317                     "task_team[1] %p to NULL\n",
5318                     &team->t.t_task_team[0], &team->t.t_task_team[1]));
5319       team->t.t_task_team[0] = NULL;
5320       team->t.t_task_team[1] = NULL;
5321 
5322       /* reallocate space for arguments if necessary */
5323       __kmp_alloc_argv_entries(argc, team, TRUE);
5324       KMP_CHECK_UPDATE(team->t.t_argc, argc);
5325 
5326       KA_TRACE(
5327           20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5328                team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5329       { // Initialize barrier data.
5330         int b;
5331         for (b = 0; b < bs_last_barrier; ++b) {
5332           team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5333 #if USE_DEBUGGER
5334           team->t.t_bar[b].b_master_arrived = 0;
5335           team->t.t_bar[b].b_team_arrived = 0;
5336 #endif
5337         }
5338       }
5339 
5340 #if OMP_40_ENABLED
5341       team->t.t_proc_bind = new_proc_bind;
5342 #endif
5343 
5344       KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5345                     team->t.t_id));
5346 
5347 #if OMPT_SUPPORT
5348       __ompt_team_assign_id(team, ompt_parallel_data);
5349 #endif
5350 
5351       KMP_MB();
5352 
5353       return team;
5354     }
5355 
5356     /* reap team if it is too small, then loop back and check the next one */
    // Not sure if this is wise, but it will be redone during the hot-teams
    // rewrite.
5359     /* TODO: Use technique to find the right size hot-team, don't reap them */
5360     team = __kmp_reap_team(team);
5361     __kmp_team_pool = team;
5362   }
5363 
5364   /* nothing available in the pool, no matter, make a new team! */
5365   KMP_MB();
5366   team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5367 
5368   /* and set it up */
5369   team->t.t_max_nproc = max_nproc;
  /* NOTE well: for some reason, allocating one big buffer and dividing it up
     seems to really hurt performance a lot on the P4, so let's not do that. */
5372   __kmp_allocate_team_arrays(team, max_nproc);
5373 
5374   KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5375   __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5376 
5377   KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5378                 "%p to NULL\n",
5379                 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5380   team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5381   // memory, no need to duplicate
5382   team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5383   // memory, no need to duplicate
5384 
5385   if (__kmp_storage_map) {
5386     __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5387   }
5388 
5389   /* allocate space for arguments */
5390   __kmp_alloc_argv_entries(argc, team, FALSE);
5391   team->t.t_argc = argc;
5392 
5393   KA_TRACE(20,
5394            ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5395             team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5396   { // Initialize barrier data.
5397     int b;
5398     for (b = 0; b < bs_last_barrier; ++b) {
5399       team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5400 #if USE_DEBUGGER
5401       team->t.t_bar[b].b_master_arrived = 0;
5402       team->t.t_bar[b].b_team_arrived = 0;
5403 #endif
5404     }
5405   }
5406 
5407 #if OMP_40_ENABLED
5408   team->t.t_proc_bind = new_proc_bind;
5409 #endif
5410 
5411 #if OMPT_SUPPORT
5412   __ompt_team_assign_id(team, ompt_parallel_data);
5413   team->t.ompt_serialized_team_info = NULL;
5414 #endif
5415 
5416   KMP_MB();
5417 
5418   KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5419                 team->t.t_id));
5420 
5421   return team;
5422 }
5423 
5424 /* TODO implement hot-teams at all levels */
5425 /* TODO implement lazy thread release on demand (disband request) */
5426 
5427 /* free the team.  return it to the team pool.  release all the threads
5428  * associated with it */
5429 void __kmp_free_team(kmp_root_t *root,
5430                      kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5431   int f;
5432   KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5433                 team->t.t_id));
5434 
5435   /* verify state */
5436   KMP_DEBUG_ASSERT(root);
5437   KMP_DEBUG_ASSERT(team);
5438   KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5439   KMP_DEBUG_ASSERT(team->t.t_threads);
5440 
5441   int use_hot_team = team == root->r.r_hot_team;
5442 #if KMP_NESTED_HOT_TEAMS
5443   int level;
5444   kmp_hot_team_ptr_t *hot_teams;
5445   if (master) {
5446     level = team->t.t_active_level - 1;
5447     if (master->th.th_teams_microtask) { // in teams construct?
5448       if (master->th.th_teams_size.nteams > 1) {
5449         ++level; // level was not increased in teams construct for
5450         // team_of_masters
5451       }
5452       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5453           master->th.th_teams_level == team->t.t_level) {
5454         ++level; // level was not increased in teams construct for
5455         // team_of_workers before the parallel
5456       } // team->t.t_level will be increased inside parallel
5457     }
5458     hot_teams = master->th.th_hot_teams;
5459     if (level < __kmp_hot_teams_max_level) {
5460       KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5461       use_hot_team = 1;
5462     }
5463   }
5464 #endif // KMP_NESTED_HOT_TEAMS
5465 
5466   /* team is done working */
5467   TCW_SYNC_PTR(team->t.t_pkfn,
5468                NULL); // Important for Debugging Support Library.
5469 #if KMP_OS_WINDOWS
5470   team->t.t_copyin_counter = 0; // init counter for possible reuse
5471 #endif
5472   // Do not reset pointer to parent team to NULL for hot teams.
5473 
5474   /* if we are non-hot team, release our threads */
5475   if (!use_hot_team) {
5476     if (__kmp_tasking_mode != tskm_immediate_exec) {
5477       // Wait for threads to reach reapable state
5478       for (f = 1; f < team->t.t_nproc; ++f) {
5479         KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5480         kmp_info_t *th = team->t.t_threads[f];
5481         volatile kmp_uint32 *state = &th->th.th_reap_state;
5482         while (*state != KMP_SAFE_TO_REAP) {
5483 #if KMP_OS_WINDOWS
5484           // On Windows a thread can be killed at any time, check this
5485           DWORD ecode;
5486           if (!__kmp_is_thread_alive(th, &ecode)) {
5487             *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5488             break;
5489           }
5490 #endif
5491           // first check if thread is sleeping
5492           kmp_flag_64 fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5493           if (fl.is_sleeping())
5494             fl.resume(__kmp_gtid_from_thread(th));
5495           KMP_CPU_PAUSE();
5496         }
5497       }
5498 
5499       // Delete task teams
5500       int tt_idx;
5501       for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5502         kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5503         if (task_team != NULL) {
5504           for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5505             KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5506             team->t.t_threads[f]->th.th_task_team = NULL;
5507           }
5508           KA_TRACE(
5509               20,
5510               ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5511                __kmp_get_gtid(), task_team, team->t.t_id));
5512 #if KMP_NESTED_HOT_TEAMS
5513           __kmp_free_task_team(master, task_team);
5514 #endif
5515           team->t.t_task_team[tt_idx] = NULL;
5516         }
5517       }
5518     }
5519 
5520     // Reset pointer to parent team only for non-hot teams.
5521     team->t.t_parent = NULL;
5522     team->t.t_level = 0;
5523     team->t.t_active_level = 0;
5524 
5525     /* free the worker threads */
5526     for (f = 1; f < team->t.t_nproc; ++f) {
5527       KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5528       __kmp_free_thread(team->t.t_threads[f]);
5529       team->t.t_threads[f] = NULL;
5530     }
5531 
5532     /* put the team back in the team pool */
5533     /* TODO limit size of team pool, call reap_team if pool too large */
5534     team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5535     __kmp_team_pool = (volatile kmp_team_t *)team;
5536   } else { // Check if team was created for the masters in a teams construct
5537     // See if first worker is a CG root
5538     KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5539                      team->t.t_threads[1]->th.th_cg_roots);
5540     if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5541       // Clean up the CG root nodes on workers so that this team can be re-used
5542       for (f = 1; f < team->t.t_nproc; ++f) {
5543         kmp_info_t *thr = team->t.t_threads[f];
5544         KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5545                          thr->th.th_cg_roots->cg_root == thr);
5546         // Pop current CG root off list
5547         kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5548         thr->th.th_cg_roots = tmp->up;
5549         KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5550                        " up to node %p. cg_nthreads was %d\n",
5551                        thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5552         __kmp_free(tmp);
5553         // Restore current task's thread_limit from CG root
5554         if (thr->th.th_cg_roots)
5555           thr->th.th_current_task->td_icvs.thread_limit =
5556               thr->th.th_cg_roots->cg_thread_limit;
5557       }
5558     }
5559   }
5560 
5561   KMP_MB();
5562 }
5563 
5564 /* reap the team.  destroy it, reclaim all its resources and free its memory */
5565 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5566   kmp_team_t *next_pool = team->t.t_next_pool;
5567 
5568   KMP_DEBUG_ASSERT(team);
5569   KMP_DEBUG_ASSERT(team->t.t_dispatch);
5570   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5571   KMP_DEBUG_ASSERT(team->t.t_threads);
5572   KMP_DEBUG_ASSERT(team->t.t_argv);
5573 
5574   /* TODO clean the threads that are a part of this? */
5575 
5576   /* free stuff */
5577   __kmp_free_team_arrays(team);
5578   if (team->t.t_argv != &team->t.t_inline_argv[0])
5579     __kmp_free((void *)team->t.t_argv);
5580   __kmp_free(team);
5581 
5582   KMP_MB();
5583   return next_pool;
5584 }
5585 
5586 // Free the thread.  Don't reap it, just place it on the pool of available
5587 // threads.
5588 //
5589 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5590 // binding for the affinity mechanism to be useful.
5591 //
5592 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5593 // However, we want to avoid a potential performance problem by always
5594 // scanning through the list to find the correct point at which to insert
5595 // the thread (potential N**2 behavior).  To do this we keep track of the
5596 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5597 // With single-level parallelism, threads will always be added to the tail
5598 // of the list, kept track of by __kmp_thread_pool_insert_pt.  With nested
5599 // parallelism, all bets are off and we may need to scan through the entire
5600 // free list.
5601 //
5602 // This change also has a potentially large performance benefit, for some
5603 // applications.  Previously, as threads were freed from the hot team, they
5604 // would be placed back on the free list in inverse order.  If the hot team
// grew back to its original size, then the freed threads would be placed
5606 // back on the hot team in reverse order.  This could cause bad cache
5607 // locality problems on programs where the size of the hot team regularly
5608 // grew and shrunk.
5609 //
// Now, for single-level parallelism, the OMP tid is always == gtid.
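//
// Illustrative sketch only (hypothetical gtids, not executed): with T#2, T#3
// and T#5 already in the pool and __kmp_thread_pool_insert_pt pointing at T#3,
// freeing T#4 starts the scan at the insert point instead of the list head:
//
//   __kmp_thread_pool -> T#2 -> T#3 -> [T#4 inserted here] -> T#5 -> NULL
//                                ^ __kmp_thread_pool_insert_pt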
5611 void __kmp_free_thread(kmp_info_t *this_th) {
5612   int gtid;
5613   kmp_info_t **scan;
5614 
5615   KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5616                 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5617 
5618   KMP_DEBUG_ASSERT(this_th);
5619 
  // When moving the thread to the pool, switch it to wait on its own b_go
  // flag and clear its barrier team pointers (NULL team).
5622   int b;
5623   kmp_balign_t *balign = this_th->th.th_bar;
5624   for (b = 0; b < bs_last_barrier; ++b) {
5625     if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5626       balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5627     balign[b].bb.team = NULL;
5628     balign[b].bb.leaf_kids = 0;
5629   }
5630   this_th->th.th_task_state = 0;
5631   this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
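  // Mark this thread as safe to reap while it sits in the pool:
  // __kmp_free_team spins on th_reap_state before tearing down task teams, and
  // __kmp_internal_end asserts KMP_SAFE_TO_REAP when draining the pool at
  // shutdown.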
5632 
5633   /* put thread back on the free pool */
5634   TCW_PTR(this_th->th.th_team, NULL);
5635   TCW_PTR(this_th->th.th_root, NULL);
5636   TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5637 
5638   while (this_th->th.th_cg_roots) {
5639     this_th->th.th_cg_roots->cg_nthreads--;
5640     KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5641                    " %p of thread  %p to %d\n",
5642                    this_th, this_th->th.th_cg_roots,
5643                    this_th->th.th_cg_roots->cg_root,
5644                    this_th->th.th_cg_roots->cg_nthreads));
5645     kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5646     if (tmp->cg_root == this_th) { // Thread is a cg_root
5647       KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5648       KA_TRACE(
5649           5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5650       this_th->th.th_cg_roots = tmp->up;
5651       __kmp_free(tmp);
5652     } else { // Worker thread
5653       this_th->th.th_cg_roots = NULL;
5654       break;
5655     }
5656   }
5657 
5658   /* If the implicit task assigned to this thread can be used by other threads
5659    * -> multiple threads can share the data and try to free the task at
5660    * __kmp_reap_thread at exit. This duplicate use of the task data can happen
 * with higher probability when the hot team is disabled but can occur even when
5662    * the hot team is enabled */
5663   __kmp_free_implicit_task(this_th);
5664   this_th->th.th_current_task = NULL;
5665 
5666   // If the __kmp_thread_pool_insert_pt is already past the new insert
5667   // point, then we need to re-scan the entire list.
5668   gtid = this_th->th.th_info.ds.ds_gtid;
5669   if (__kmp_thread_pool_insert_pt != NULL) {
5670     KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5671     if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5672       __kmp_thread_pool_insert_pt = NULL;
5673     }
5674   }
5675 
5676   // Scan down the list to find the place to insert the thread.
5677   // scan is the address of a link in the list, possibly the address of
5678   // __kmp_thread_pool itself.
5679   //
  // In the absence of nested parallelism, the for loop will have 0 iterations.
5681   if (__kmp_thread_pool_insert_pt != NULL) {
5682     scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5683   } else {
5684     scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5685   }
5686   for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5687        scan = &((*scan)->th.th_next_pool))
5688     ;
5689 
5690   // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5691   // to its address.
5692   TCW_PTR(this_th->th.th_next_pool, *scan);
5693   __kmp_thread_pool_insert_pt = *scan = this_th;
5694   KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5695                    (this_th->th.th_info.ds.ds_gtid <
5696                     this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5697   TCW_4(this_th->th.th_in_pool, TRUE);
5698   __kmp_thread_pool_nth++;
5699 
5700   TCW_4(__kmp_nth, __kmp_nth - 1);
5701 
5702 #ifdef KMP_ADJUST_BLOCKTIME
5703   /* Adjust blocktime back to user setting or default if necessary */
5704   /* Middle initialization might never have occurred                */
5705   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5706     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5707     if (__kmp_nth <= __kmp_avail_proc) {
5708       __kmp_zero_bt = FALSE;
5709     }
5710   }
5711 #endif /* KMP_ADJUST_BLOCKTIME */
5712 
5713   KMP_MB();
5714 }
5715 
5716 /* ------------------------------------------------------------------------ */
5717 
5718 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5719   int gtid = this_thr->th.th_info.ds.ds_gtid;
5720   /*    void                 *stack_data;*/
5721   kmp_team_t *(*volatile pteam);
5722 
5723   KMP_MB();
5724   KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5725 
5726   if (__kmp_env_consistency_check) {
5727     this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5728   }
5729 
5730 #if OMPT_SUPPORT
5731   ompt_data_t *thread_data;
5732   if (ompt_enabled.enabled) {
5733     thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5734     *thread_data = ompt_data_none;
5735 
5736     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5737     this_thr->th.ompt_thread_info.wait_id = 0;
5738     this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5739     if (ompt_enabled.ompt_callback_thread_begin) {
5740       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5741           ompt_thread_worker, thread_data);
5742     }
5743   }
5744 #endif
5745 
5746 #if OMPT_SUPPORT
5747   if (ompt_enabled.enabled) {
5748     this_thr->th.ompt_thread_info.state = ompt_state_idle;
5749   }
5750 #endif
5751   /* This is the place where threads wait for work */
5752   while (!TCR_4(__kmp_global.g.g_done)) {
5753     KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5754     KMP_MB();
5755 
5756     /* wait for work to do */
5757     KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5758 
5759     /* No tid yet since not part of a team */
5760     __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5761 
5762 #if OMPT_SUPPORT
5763     if (ompt_enabled.enabled) {
5764       this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5765     }
5766 #endif
5767 
5768     pteam = (kmp_team_t * (*))(&this_thr->th.th_team);
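    // pteam aliases this thread's th_team pointer so the checks below can
    // observe, via TCR_SYNC_PTR, whether a team (and a microtask) has been
    // assigned to this thread.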
5769 
5770     /* have we been allocated? */
5771     if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5772       /* we were just woken up, so run our new task */
5773       if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5774         int rc;
5775         KA_TRACE(20,
5776                  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5777                   gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5778                   (*pteam)->t.t_pkfn));
5779 
5780         updateHWFPControl(*pteam);
5781 
5782 #if OMPT_SUPPORT
5783         if (ompt_enabled.enabled) {
5784           this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5785         }
5786 #endif
5787 
5788         rc = (*pteam)->t.t_invoke(gtid);
5789         KMP_ASSERT(rc);
5790 
5791         KMP_MB();
5792         KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5793                       gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5794                       (*pteam)->t.t_pkfn));
5795       }
5796 #if OMPT_SUPPORT
5797       if (ompt_enabled.enabled) {
5798         /* no frame set while outside task */
5799         __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
5800 
5801         this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5802       }
5803 #endif
5804       /* join barrier after parallel region */
5805       __kmp_join_barrier(gtid);
5806     }
5807   }
5808   TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5809 
5810 #if OMPT_SUPPORT
5811   if (ompt_enabled.ompt_callback_thread_end) {
5812     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5813   }
5814 #endif
5815 
5816   this_thr->th.th_task_team = NULL;
5817   /* run the destructors for the threadprivate data for this thread */
5818   __kmp_common_destroy_gtid(gtid);
5819 
5820   KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5821   KMP_MB();
5822   return this_thr;
5823 }
5824 
5825 /* ------------------------------------------------------------------------ */
5826 
5827 void __kmp_internal_end_dest(void *specific_gtid) {
5828 #if KMP_COMPILER_ICC
5829 #pragma warning(push)
5830 #pragma warning(disable : 810) // conversion from "void *" to "int" may lose
5831 // significant bits
5832 #endif
5833   // Make sure no significant bits are lost
5834   int gtid = (kmp_intptr_t)specific_gtid - 1;
5835 #if KMP_COMPILER_ICC
5836 #pragma warning(pop)
5837 #endif
5838 
5839   KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
  /* NOTE: the gtid is stored as gtid+1 in the thread-local-storage
5841    * this is because 0 is reserved for the nothing-stored case */
5842 
5843   /* josh: One reason for setting the gtid specific data even when it is being
5844      destroyed by pthread is to allow gtid lookup through thread specific data
5845      (__kmp_gtid_get_specific).  Some of the code, especially stat code,
5846      that gets executed in the call to __kmp_internal_end_thread, actually
5847      gets the gtid through the thread specific data.  Setting it here seems
5848      rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread
5849      to run smoothly.
5850      todo: get rid of this after we remove the dependence on
5851      __kmp_gtid_get_specific  */
5852   if (gtid >= 0 && KMP_UBER_GTID(gtid))
5853     __kmp_gtid_set_specific(gtid);
5854 #ifdef KMP_TDATA_GTID
5855   __kmp_gtid = gtid;
5856 #endif
5857   __kmp_internal_end_thread(gtid);
5858 }
5859 
5860 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5861 
// 2009-09-08 (lev): It looks like the destructor does not work. In simple test
// cases destructors work perfectly, but in the real libomp.so I have no
// evidence it is ever called. However, the -fini linker option in makefile.mk
// works fine.
5865 
5866 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5867   __kmp_internal_end_atexit();
5868 }
5869 
5870 void __kmp_internal_end_fini(void) { __kmp_internal_end_atexit(); }
5871 
5872 #endif
5873 
5874 /* [Windows] josh: when the atexit handler is called, there may still be more
5875    than one thread alive */
5876 void __kmp_internal_end_atexit(void) {
5877   KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5878   /* [Windows]
5879      josh: ideally, we want to completely shutdown the library in this atexit
5880      handler, but stat code that depends on thread specific data for gtid fails
5881      because that data becomes unavailable at some point during the shutdown, so
5882      we call __kmp_internal_end_thread instead. We should eventually remove the
5883      dependency on __kmp_get_specific_gtid in the stat code and use
5884      __kmp_internal_end_library to cleanly shutdown the library.
5885 
5886      // TODO: Can some of this comment about GVS be removed?
5887      I suspect that the offending stat code is executed when the calling thread
5888      tries to clean up a dead root thread's data structures, resulting in GVS
5889      code trying to close the GVS structures for that thread, but since the stat
5890      code uses __kmp_get_specific_gtid to get the gtid with the assumption that
     the calling thread is cleaning up itself instead of another thread, it
     gets confused. This happens because allowing a thread to unregister and
     clean up another thread is a recent modification for addressing an issue.
5894      Based on the current design (20050722), a thread may end up
5895      trying to unregister another thread only if thread death does not trigger
5896      the calling of __kmp_internal_end_thread.  For Linux* OS, there is the
5897      thread specific data destructor function to detect thread death. For
5898      Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
     is nothing.  Thus, the workaround is applicable only for the Windows
     static library. */
5901   __kmp_internal_end_library(-1);
5902 #if KMP_OS_WINDOWS
5903   __kmp_close_console();
5904 #endif
5905 }
5906 
5907 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5908   // It is assumed __kmp_forkjoin_lock is acquired.
5909 
5910   int gtid;
5911 
5912   KMP_DEBUG_ASSERT(thread != NULL);
5913 
5914   gtid = thread->th.th_info.ds.ds_gtid;
5915 
5916   if (!is_root) {
5917     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5918       /* Assume the threads are at the fork barrier here */
5919       KA_TRACE(
5920           20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5921                gtid));
5922       /* Need release fence here to prevent seg faults for tree forkjoin barrier
5923        * (GEH) */
5924       ANNOTATE_HAPPENS_BEFORE(thread);
5925       kmp_flag_64 flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
5926       __kmp_release_64(&flag);
5927     }
5928 
5929     // Terminate OS thread.
5930     __kmp_reap_worker(thread);
5931 
5932     // The thread was killed asynchronously.  If it was actively
5933     // spinning in the thread pool, decrement the global count.
5934     //
5935     // There is a small timing hole here - if the worker thread was just waking
    // up after sleeping in the pool, had reset its th_active_in_pool flag but
5937     // not decremented the global counter __kmp_thread_pool_active_nth yet, then
5938     // the global counter might not get updated.
5939     //
5940     // Currently, this can only happen as the library is unloaded,
5941     // so there are no harmful side effects.
5942     if (thread->th.th_active_in_pool) {
5943       thread->th.th_active_in_pool = FALSE;
5944       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
5945       KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
5946     }
5947 
5948     // Decrement # of [worker] threads in the pool.
5949     KMP_DEBUG_ASSERT(__kmp_thread_pool_nth > 0);
5950     --__kmp_thread_pool_nth;
5951   }
5952 
5953   __kmp_free_implicit_task(thread);
5954 
5955 // Free the fast memory for tasking
5956 #if USE_FAST_MEMORY
5957   __kmp_free_fast_memory(thread);
5958 #endif /* USE_FAST_MEMORY */
5959 
5960   __kmp_suspend_uninitialize_thread(thread);
5961 
5962   KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
5963   TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5964 
5965   --__kmp_all_nth;
5966 // __kmp_nth was decremented when thread is added to the pool.
5967 
5968 #ifdef KMP_ADJUST_BLOCKTIME
5969   /* Adjust blocktime back to user setting or default if necessary */
5970   /* Middle initialization might never have occurred                */
5971   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5972     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5973     if (__kmp_nth <= __kmp_avail_proc) {
5974       __kmp_zero_bt = FALSE;
5975     }
5976   }
5977 #endif /* KMP_ADJUST_BLOCKTIME */
5978 
5979   /* free the memory being used */
5980   if (__kmp_env_consistency_check) {
5981     if (thread->th.th_cons) {
5982       __kmp_free_cons_stack(thread->th.th_cons);
5983       thread->th.th_cons = NULL;
5984     }
5985   }
5986 
5987   if (thread->th.th_pri_common != NULL) {
5988     __kmp_free(thread->th.th_pri_common);
5989     thread->th.th_pri_common = NULL;
5990   }
5991 
5992   if (thread->th.th_task_state_memo_stack != NULL) {
5993     __kmp_free(thread->th.th_task_state_memo_stack);
5994     thread->th.th_task_state_memo_stack = NULL;
5995   }
5996 
5997 #if KMP_USE_BGET
5998   if (thread->th.th_local.bget_data != NULL) {
5999     __kmp_finalize_bget(thread);
6000   }
6001 #endif
6002 
6003 #if KMP_AFFINITY_SUPPORTED
6004   if (thread->th.th_affin_mask != NULL) {
6005     KMP_CPU_FREE(thread->th.th_affin_mask);
6006     thread->th.th_affin_mask = NULL;
6007   }
6008 #endif /* KMP_AFFINITY_SUPPORTED */
6009 
6010 #if KMP_USE_HIER_SCHED
6011   if (thread->th.th_hier_bar_data != NULL) {
6012     __kmp_free(thread->th.th_hier_bar_data);
6013     thread->th.th_hier_bar_data = NULL;
6014   }
6015 #endif
6016 
6017   __kmp_reap_team(thread->th.th_serial_team);
6018   thread->th.th_serial_team = NULL;
6019   __kmp_free(thread);
6020 
6021   KMP_MB();
6022 
6023 } // __kmp_reap_thread
6024 
6025 static void __kmp_internal_end(void) {
6026   int i;
6027 
6028   /* First, unregister the library */
6029   __kmp_unregister_library();
6030 
6031 #if KMP_OS_WINDOWS
6032   /* In Win static library, we can't tell when a root actually dies, so we
6033      reclaim the data structures for any root threads that have died but not
6034      unregistered themselves, in order to shut down cleanly.
6035      In Win dynamic library we also can't tell when a thread dies.  */
6036   __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6037 // dead roots
6038 #endif
6039 
6040   for (i = 0; i < __kmp_threads_capacity; i++)
6041     if (__kmp_root[i])
6042       if (__kmp_root[i]->r.r_active)
6043         break;
6044   KMP_MB(); /* Flush all pending memory write invalidates.  */
6045   TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
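  // Setting g_done signals workers spinning in __kmp_launch_thread to exit
  // their wait loop.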
6046 
6047   if (i < __kmp_threads_capacity) {
6048 #if KMP_USE_MONITOR
6049     // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6050     KMP_MB(); /* Flush all pending memory write invalidates.  */
6051 
6052     // Need to check that monitor was initialized before reaping it. If we are
    // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6054     // __kmp_monitor will appear to contain valid data, but it is only valid in
6055     // the parent process, not the child.
6056     // New behavior (201008): instead of keying off of the flag
6057     // __kmp_init_parallel, the monitor thread creation is keyed off
6058     // of the new flag __kmp_init_monitor.
6059     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6060     if (TCR_4(__kmp_init_monitor)) {
6061       __kmp_reap_monitor(&__kmp_monitor);
6062       TCW_4(__kmp_init_monitor, 0);
6063     }
6064     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6065     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6066 #endif // KMP_USE_MONITOR
6067   } else {
6068 /* TODO move this to cleanup code */
6069 #ifdef KMP_DEBUG
6070     /* make sure that everything has properly ended */
6071     for (i = 0; i < __kmp_threads_capacity; i++) {
6072       if (__kmp_root[i]) {
6073         //                    KMP_ASSERT( ! KMP_UBER_GTID( i ) );         // AC:
6074         //                    there can be uber threads alive here
6075         KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6076       }
6077     }
6078 #endif
6079 
6080     KMP_MB();
6081 
6082     // Reap the worker threads.
6083     // This is valid for now, but be careful if threads are reaped sooner.
6084     while (__kmp_thread_pool != NULL) { // Loop thru all the thread in the pool.
6085       // Get the next thread from the pool.
6086       kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6087       __kmp_thread_pool = thread->th.th_next_pool;
6088       // Reap it.
6089       KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6090       thread->th.th_next_pool = NULL;
6091       thread->th.th_in_pool = FALSE;
6092       __kmp_reap_thread(thread, 0);
6093     }
6094     __kmp_thread_pool_insert_pt = NULL;
6095 
6096     // Reap teams.
6097     while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
6098       // Get the next team from the pool.
6099       kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6100       __kmp_team_pool = team->t.t_next_pool;
6101       // Reap it.
6102       team->t.t_next_pool = NULL;
6103       __kmp_reap_team(team);
6104     }
6105 
6106     __kmp_reap_task_teams();
6107 
6108 #if KMP_OS_UNIX
6109     // Threads that are not reaped should not access any resources since they
6110     // are going to be deallocated soon, so the shutdown sequence should wait
6111     // until all threads either exit the final spin-waiting loop or begin
6112     // sleeping after the given blocktime.
6113     for (i = 0; i < __kmp_threads_capacity; i++) {
6114       kmp_info_t *thr = __kmp_threads[i];
6115       while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6116         KMP_CPU_PAUSE();
6117     }
6118 #endif
6119 
6120     for (i = 0; i < __kmp_threads_capacity; ++i) {
6121       // TBD: Add some checking...
6122       // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6123     }
6124 
6125     /* Make sure all threadprivate destructors get run by joining with all
6126        worker threads before resetting this flag */
6127     TCW_SYNC_4(__kmp_init_common, FALSE);
6128 
6129     KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6130     KMP_MB();
6131 
6132 #if KMP_USE_MONITOR
6133     // See note above: One of the possible fixes for CQ138434 / CQ140126
6134     //
6135     // FIXME: push both code fragments down and CSE them?
6136     // push them into __kmp_cleanup() ?
6137     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6138     if (TCR_4(__kmp_init_monitor)) {
6139       __kmp_reap_monitor(&__kmp_monitor);
6140       TCW_4(__kmp_init_monitor, 0);
6141     }
6142     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6143     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6144 #endif
6145   } /* else !__kmp_global.t_active */
6146   TCW_4(__kmp_init_gtid, FALSE);
6147   KMP_MB(); /* Flush all pending memory write invalidates.  */
6148 
6149   __kmp_cleanup();
6150 #if OMPT_SUPPORT
6151   ompt_fini();
6152 #endif
6153 }
6154 
6155 void __kmp_internal_end_library(int gtid_req) {
6156   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6157   /* this shouldn't be a race condition because __kmp_internal_end() is the
6158      only place to clear __kmp_serial_init */
6159   /* we'll check this later too, after we get the lock */
6160   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
  // redundant, because the next check will work in any case.
6162   if (__kmp_global.g.g_abort) {
6163     KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6164     /* TODO abort? */
6165     return;
6166   }
6167   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6168     KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6169     return;
6170   }
6171 
6172   KMP_MB(); /* Flush all pending memory write invalidates.  */
6173 
6174   /* find out who we are and what we should do */
6175   {
6176     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6177     KA_TRACE(
6178         10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req));
6179     if (gtid == KMP_GTID_SHUTDOWN) {
6180       KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6181                     "already shutdown\n"));
6182       return;
6183     } else if (gtid == KMP_GTID_MONITOR) {
6184       KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6185                     "registered, or system shutdown\n"));
6186       return;
6187     } else if (gtid == KMP_GTID_DNE) {
6188       KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6189                     "shutdown\n"));
6190       /* we don't know who we are, but we may still shutdown the library */
6191     } else if (KMP_UBER_GTID(gtid)) {
6192       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6193       if (__kmp_root[gtid]->r.r_active) {
6194         __kmp_global.g.g_abort = -1;
6195         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6196         KA_TRACE(10,
6197                  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6198                   gtid));
6199         return;
6200       } else {
6201         KA_TRACE(
6202             10,
6203             ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6204         __kmp_unregister_root_current_thread(gtid);
6205       }
6206     } else {
6207 /* worker threads may call this function through the atexit handler, if they
6208  * call exit() */
6209 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6210    TODO: do a thorough shutdown instead */
6211 #ifdef DUMP_DEBUG_ON_EXIT
6212       if (__kmp_debug_buf)
6213         __kmp_dump_debug_buffer();
6214 #endif
6215       return;
6216     }
6217   }
6218   /* synchronize the termination process */
6219   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6220 
6221   /* have we already finished */
6222   if (__kmp_global.g.g_abort) {
6223     KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6224     /* TODO abort? */
6225     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6226     return;
6227   }
6228   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6229     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6230     return;
6231   }
6232 
6233   /* We need this lock to enforce mutex between this reading of
6234      __kmp_threads_capacity and the writing by __kmp_register_root.
6235      Alternatively, we can use a counter of roots that is atomically updated by
6236      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6237      __kmp_internal_end_*.  */
6238   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6239 
6240   /* now we can safely conduct the actual termination */
6241   __kmp_internal_end();
6242 
6243   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6244   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6245 
6246   KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6247 
6248 #ifdef DUMP_DEBUG_ON_EXIT
6249   if (__kmp_debug_buf)
6250     __kmp_dump_debug_buffer();
6251 #endif
6252 
6253 #if KMP_OS_WINDOWS
6254   __kmp_close_console();
6255 #endif
6256 
6257   __kmp_fini_allocator();
6258 
6259 } // __kmp_internal_end_library
6260 
6261 void __kmp_internal_end_thread(int gtid_req) {
6262   int i;
6263 
6264   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6265   /* this shouldn't be a race condition because __kmp_internal_end() is the
6266    * only place to clear __kmp_serial_init */
6267   /* we'll check this later too, after we get the lock */
6268   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6269   // redundant, because the next check will work in any case.
6270   if (__kmp_global.g.g_abort) {
6271     KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6272     /* TODO abort? */
6273     return;
6274   }
6275   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6276     KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6277     return;
6278   }
6279 
6280   KMP_MB(); /* Flush all pending memory write invalidates.  */
6281 
6282   /* find out who we are and what we should do */
6283   {
6284     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6285     KA_TRACE(10,
6286              ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req));
6287     if (gtid == KMP_GTID_SHUTDOWN) {
6288       KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6289                     "already shutdown\n"));
6290       return;
6291     } else if (gtid == KMP_GTID_MONITOR) {
6292       KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6293                     "registered, or system shutdown\n"));
6294       return;
6295     } else if (gtid == KMP_GTID_DNE) {
6296       KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6297                     "shutdown\n"));
6298       return;
6299       /* we don't know who we are */
6300     } else if (KMP_UBER_GTID(gtid)) {
6301       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6302       if (__kmp_root[gtid]->r.r_active) {
6303         __kmp_global.g.g_abort = -1;
6304         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6305         KA_TRACE(10,
6306                  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6307                   gtid));
6308         return;
6309       } else {
6310         KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6311                       gtid));
6312         __kmp_unregister_root_current_thread(gtid);
6313       }
6314     } else {
6315       /* just a worker thread, let's leave */
6316       KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6317 
6318       if (gtid >= 0) {
6319         __kmp_threads[gtid]->th.th_task_team = NULL;
6320       }
6321 
6322       KA_TRACE(10,
6323                ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6324                 gtid));
6325       return;
6326     }
6327   }
6328 #if KMP_DYNAMIC_LIB
  // AC: let's not shut down the Linux* OS dynamic library at the exit of an
  // uber thread, because it is better to shut down later, in the library
  // destructor. The reason for this change is a performance problem seen when
  // a non-OpenMP thread repeatedly forks and joins many OpenMP threads in a
  // loop. We can save a lot of time by keeping worker threads alive until the
  // program shuts down.
6334   // OM: Removed Linux* OS restriction to fix the crash on OS X* (DPD200239966)
6335   // and Windows(DPD200287443) that occurs when using critical sections from
6336   // foreign threads.
6337   if (__kmp_pause_status != kmp_hard_paused) {
6338     KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6339     return;
6340   }
6341 #endif
6342   /* synchronize the termination process */
6343   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6344 
6345   /* have we already finished */
6346   if (__kmp_global.g.g_abort) {
6347     KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6348     /* TODO abort? */
6349     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6350     return;
6351   }
6352   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6353     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6354     return;
6355   }
6356 
6357   /* We need this lock to enforce mutex between this reading of
6358      __kmp_threads_capacity and the writing by __kmp_register_root.
6359      Alternatively, we can use a counter of roots that is atomically updated by
6360      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6361      __kmp_internal_end_*.  */
6362 
6363   /* should we finish the run-time?  are all siblings done? */
6364   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6365 
6366   for (i = 0; i < __kmp_threads_capacity; ++i) {
6367     if (KMP_UBER_GTID(i)) {
6368       KA_TRACE(
6369           10,
6370           ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6371       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6372       __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6373       return;
6374     }
6375   }
6376 
6377   /* now we can safely conduct the actual termination */
6378 
6379   __kmp_internal_end();
6380 
6381   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6382   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6383 
6384   KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6385 
6386 #ifdef DUMP_DEBUG_ON_EXIT
6387   if (__kmp_debug_buf)
6388     __kmp_dump_debug_buffer();
6389 #endif
6390 } // __kmp_internal_end_thread
6391 
6392 // -----------------------------------------------------------------------------
6393 // Library registration stuff.
6394 
6395 static long __kmp_registration_flag = 0;
6396 // Random value used to indicate library initialization.
6397 static char *__kmp_registration_str = NULL;
6398 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6399 
6400 static inline char *__kmp_reg_status_name() {
6401   /* On RHEL 3u5 if linked statically, getpid() returns different values in
6402      each thread. If registration and unregistration go in different threads
6403      (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env
     env var cannot be found, because the name will contain a different pid. */
6405   return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
} // __kmp_reg_status_name
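// For example (hypothetical pid), a process with pid 12345 uses the variable
// name "__KMP_REGISTERED_LIB_12345".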
6407 
6408 void __kmp_register_library_startup(void) {
6409 
6410   char *name = __kmp_reg_status_name(); // Name of the environment variable.
6411   int done = 0;
6412   union {
6413     double dtime;
6414     long ltime;
6415   } time;
6416 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6417   __kmp_initialize_system_tick();
6418 #endif
6419   __kmp_read_system_time(&time.dtime);
6420   __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
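  // The flag combines a fixed 0xCAFE marker in the upper half-word with the
  // low 16 bits of the current time, so each run gets an essentially unique,
  // non-zero value.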
6421   __kmp_registration_str =
6422       __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6423                        __kmp_registration_flag, KMP_LIBRARY_FILE);
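  // The registration string has the form
  // "<flag address>-<flag value in hex>-<library file>", e.g. (hypothetical
  // values) "0x7f12345678-cafe0042-libomp.so".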
6424 
6425   KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6426                 __kmp_registration_str));
6427 
6428   while (!done) {
6429 
6430     char *value = NULL; // Actual value of the environment variable.
6431 
    // Set the environment variable, but do not overwrite it if it already exists.
6433     __kmp_env_set(name, __kmp_registration_str, 0);
    // Check that the variable was actually written.
6435     value = __kmp_env_get(name);
6436     if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6437 
6438       done = 1; // Ok, environment variable set successfully, exit the loop.
6439 
6440     } else {
6441 
      // Oops. The write failed. Another copy of the OpenMP RTL is in memory.
      // Check whether it is alive or dead.
6444       int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
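      // The value left by the other copy of the runtime has the same
      // "<flag address>-<flag value>-<library file>" layout produced above,
      // so split it on '-' and try to validate the pieces.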
6445       char *tail = value;
6446       char *flag_addr_str = NULL;
6447       char *flag_val_str = NULL;
6448       char const *file_name = NULL;
6449       __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6450       __kmp_str_split(tail, '-', &flag_val_str, &tail);
6451       file_name = tail;
6452       if (tail != NULL) {
6453         long *flag_addr = 0;
6454         long flag_val = 0;
6455         KMP_SSCANF(flag_addr_str, "%p", RCAST(void**, &flag_addr));
6456         KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6457         if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6458           // First, check whether environment-encoded address is mapped into
6459           // addr space.
6460           // If so, dereference it to see if it still has the right value.
6461           if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6462             neighbor = 1;
6463           } else {
6464             // If not, then we know the other copy of the library is no longer
6465             // running.
6466             neighbor = 2;
6467           }
6468         }
6469       }
6470       switch (neighbor) {
6471       case 0: // Cannot parse environment variable -- neighbor status unknown.
        // Assume it is the incompatible format of a future version of the
        // library, and assume the other library is alive.
6474         // WARN( ... ); // TODO: Issue a warning.
6475         file_name = "unknown library";
6476         KMP_FALLTHROUGH();
      // Attention! Falling through to the next case. That's intentional.
6478       case 1: { // Neighbor is alive.
6479         // Check it is allowed.
6480         char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6481         if (!__kmp_str_match_true(duplicate_ok)) {
6482           // That's not allowed. Issue fatal error.
6483           __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6484                       KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6485         }
6486         KMP_INTERNAL_FREE(duplicate_ok);
6487         __kmp_duplicate_library_ok = 1;
6488         done = 1; // Exit the loop.
6489       } break;
6490       case 2: { // Neighbor is dead.
6491         // Clear the variable and try to register library again.
6492         __kmp_env_unset(name);
6493       } break;
6494       default: { KMP_DEBUG_ASSERT(0); } break;
6495       }
6496     }
6497     KMP_INTERNAL_FREE((void *)value);
6498   }
6499   KMP_INTERNAL_FREE((void *)name);
6500 
6501 } // func __kmp_register_library_startup
6502 
6503 void __kmp_unregister_library(void) {
6504 
6505   char *name = __kmp_reg_status_name();
6506   char *value = __kmp_env_get(name);
6507 
6508   KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6509   KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6510   if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6511     // Ok, this is our variable. Delete it.
6512     __kmp_env_unset(name);
6513   }
6514 
6515   KMP_INTERNAL_FREE(__kmp_registration_str);
6516   KMP_INTERNAL_FREE(value);
6517   KMP_INTERNAL_FREE(name);
6518 
6519   __kmp_registration_flag = 0;
6520   __kmp_registration_str = NULL;
6521 
6522 } // __kmp_unregister_library
6523 
6524 // End of Library registration stuff.
6525 // -----------------------------------------------------------------------------
6526 
6527 #if KMP_MIC_SUPPORTED
6528 
6529 static void __kmp_check_mic_type() {
6530   kmp_cpuid_t cpuid_state = {0};
6531   kmp_cpuid_t *cs_p = &cpuid_state;
6532   __kmp_x86_cpuid(1, 0, cs_p);
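  // CPUID leaf 1 returns family/model/stepping in EAX; the masks below drop
  // the stepping bits (and, for the second test, also keep the extended-model
  // bits) to identify the MIC generation.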
6533   // We don't support mic1 at the moment
6534   if ((cs_p->eax & 0xff0) == 0xB10) {
6535     __kmp_mic_type = mic2;
6536   } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6537     __kmp_mic_type = mic3;
6538   } else {
6539     __kmp_mic_type = non_mic;
6540   }
6541 }
6542 
6543 #endif /* KMP_MIC_SUPPORTED */
6544 
6545 static void __kmp_do_serial_initialize(void) {
6546   int i, gtid;
6547   int size;
6548 
6549   KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6550 
6551   KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6552   KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6553   KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6554   KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6555   KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6556 
6557 #if OMPT_SUPPORT
6558   ompt_pre_init();
6559 #endif
6560 
6561   __kmp_validate_locks();
6562 
6563   /* Initialize internal memory allocator */
6564   __kmp_init_allocator();
6565 
6566   /* Register the library startup via an environment variable and check to see
6567      whether another copy of the library is already registered. */
6568 
6569   __kmp_register_library_startup();
6570 
6571   /* TODO reinitialization of library */
6572   if (TCR_4(__kmp_global.g.g_done)) {
6573     KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6574   }
6575 
6576   __kmp_global.g.g_abort = 0;
6577   TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6578 
6579 /* initialize the locks */
6580 #if KMP_USE_ADAPTIVE_LOCKS
6581 #if KMP_DEBUG_ADAPTIVE_LOCKS
6582   __kmp_init_speculative_stats();
6583 #endif
6584 #endif
6585 #if KMP_STATS_ENABLED
6586   __kmp_stats_init();
6587 #endif
6588   __kmp_init_lock(&__kmp_global_lock);
6589   __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6590   __kmp_init_lock(&__kmp_debug_lock);
6591   __kmp_init_atomic_lock(&__kmp_atomic_lock);
6592   __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6593   __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6594   __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6595   __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6596   __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6597   __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6598   __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6599   __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6600   __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6601   __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6602   __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6603   __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6604   __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6605   __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6606 #if KMP_USE_MONITOR
6607   __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6608 #endif
6609   __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6610 
6611   /* conduct initialization and initial setup of configuration */
6612 
6613   __kmp_runtime_initialize();
6614 
6615 #if KMP_MIC_SUPPORTED
6616   __kmp_check_mic_type();
6617 #endif
6618 
6619 // Some global variable initialization moved here from kmp_env_initialize()
6620 #ifdef KMP_DEBUG
6621   kmp_diag = 0;
6622 #endif
6623   __kmp_abort_delay = 0;
6624 
6625   // From __kmp_init_dflt_team_nth()
6626   /* assume the entire machine will be used */
6627   __kmp_dflt_team_nth_ub = __kmp_xproc;
6628   if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6629     __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6630   }
6631   if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6632     __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6633   }
6634   __kmp_max_nth = __kmp_sys_max_nth;
6635   __kmp_cg_max_nth = __kmp_sys_max_nth;
6636   __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6637   if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6638     __kmp_teams_max_nth = __kmp_sys_max_nth;
6639   }
6640 
6641   // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6642   // part
6643   __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6644 #if KMP_USE_MONITOR
6645   __kmp_monitor_wakeups =
6646       KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6647   __kmp_bt_intervals =
6648       KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6649 #endif
6650   // From "KMP_LIBRARY" part of __kmp_env_initialize()
6651   __kmp_library = library_throughput;
6652   // From KMP_SCHEDULE initialization
6653   __kmp_static = kmp_sch_static_balanced;
6654 // AC: do not use analytical here, because it is non-monotonous
6655 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6656 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6657 // need to repeat assignment
6658 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6659 // bit control and barrier method control parts
6660 #if KMP_FAST_REDUCTION_BARRIER
6661 #define kmp_reduction_barrier_gather_bb ((int)1)
6662 #define kmp_reduction_barrier_release_bb ((int)1)
6663 #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6664 #define kmp_reduction_barrier_release_pat bp_hyper_bar
6665 #endif // KMP_FAST_REDUCTION_BARRIER
6666   for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6667     __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6668     __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6669     __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6670     __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6671 #if KMP_FAST_REDUCTION_BARRIER
6672     if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
6673       // lin_64 ): hyper,1
6674       __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6675       __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6676       __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6677       __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6678     }
6679 #endif // KMP_FAST_REDUCTION_BARRIER
6680   }
6681 #if KMP_FAST_REDUCTION_BARRIER
6682 #undef kmp_reduction_barrier_release_pat
6683 #undef kmp_reduction_barrier_gather_pat
6684 #undef kmp_reduction_barrier_release_bb
6685 #undef kmp_reduction_barrier_gather_bb
6686 #endif // KMP_FAST_REDUCTION_BARRIER
6687 #if KMP_MIC_SUPPORTED
6688   if (__kmp_mic_type == mic2) { // KNC
6689     // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6690     __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6691     __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6692         1; // forkjoin release
6693     __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6694     __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6695   }
6696 #if KMP_FAST_REDUCTION_BARRIER
6697   if (__kmp_mic_type == mic2) { // KNC
6698     __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6699     __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6700   }
6701 #endif // KMP_FAST_REDUCTION_BARRIER
6702 #endif // KMP_MIC_SUPPORTED
6703 
6704 // From KMP_CHECKS initialization
6705 #ifdef KMP_DEBUG
6706   __kmp_env_checks = TRUE; /* development versions have the extra checks */
6707 #else
6708   __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6709 #endif
6710 
6711   // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6712   __kmp_foreign_tp = TRUE;
6713 
6714   __kmp_global.g.g_dynamic = FALSE;
6715   __kmp_global.g.g_dynamic_mode = dynamic_default;
6716 
6717   __kmp_env_initialize(NULL);
6718 
6719 // Print all messages in message catalog for testing purposes.
6720 #ifdef KMP_DEBUG
6721   char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6722   if (__kmp_str_match_true(val)) {
6723     kmp_str_buf_t buffer;
6724     __kmp_str_buf_init(&buffer);
6725     __kmp_i18n_dump_catalog(&buffer);
6726     __kmp_printf("%s", buffer.str);
6727     __kmp_str_buf_free(&buffer);
6728   }
6729   __kmp_env_free(&val);
6730 #endif
6731 
6732   __kmp_threads_capacity =
6733       __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6734   // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6735   __kmp_tp_capacity = __kmp_default_tp_capacity(
6736       __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6737 
6738   // If the library is shut down properly, both pools must be NULL. Just in
6739   // case, set them to NULL -- some memory may leak, but subsequent code will
6740   // work even if pools are not freed.
6741   KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6742   KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6743   KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6744   __kmp_thread_pool = NULL;
6745   __kmp_thread_pool_insert_pt = NULL;
6746   __kmp_team_pool = NULL;
6747 
6748   /* Allocate all of the variable sized records */
6749   /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6750    * expandable */
6751   /* Since allocation is cache-aligned, just add extra padding at the end */
6752   size =
6753       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6754       CACHE_LINE;
6755   __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6756   __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6757                                sizeof(kmp_info_t *) * __kmp_threads_capacity);
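  // __kmp_threads and __kmp_root share a single cache-aligned allocation;
  // __kmp_root simply points just past the __kmp_threads portion.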
6758 
6759   /* init thread counts */
6760   KMP_DEBUG_ASSERT(__kmp_all_nth ==
6761                    0); // Asserts fail if the library is reinitializing and
6762   KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
6763   __kmp_all_nth = 0;
6764   __kmp_nth = 0;
6765 
6766   /* setup the uber master thread and hierarchy */
6767   gtid = __kmp_register_root(TRUE);
6768   KA_TRACE(10, ("__kmp_do_serial_initialize  T#%d\n", gtid));
6769   KMP_ASSERT(KMP_UBER_GTID(gtid));
6770   KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6771 
6772   KMP_MB(); /* Flush all pending memory write invalidates.  */
6773 
6774   __kmp_common_initialize();
6775 
6776 #if KMP_OS_UNIX
6777   /* invoke the child fork handler */
6778   __kmp_register_atfork();
6779 #endif
6780 
6781 #if !KMP_DYNAMIC_LIB
6782   {
6783     /* Invoke the exit handler when the program finishes, only for static
6784        library. For dynamic library, we already have _fini and DllMain. */
6785     int rc = atexit(__kmp_internal_end_atexit);
6786     if (rc != 0) {
6787       __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6788                   __kmp_msg_null);
6789     }
6790   }
6791 #endif
6792 
6793 #if KMP_HANDLE_SIGNALS
6794 #if KMP_OS_UNIX
  /* NOTE: make sure that this is called before the user installs their own
     signal handlers so that the user handlers are called first. This way they
     can return false, not call our handler, avoid terminating the library, and
     continue execution where they left off. */
6799   __kmp_install_signals(FALSE);
6800 #endif /* KMP_OS_UNIX */
6801 #if KMP_OS_WINDOWS
6802   __kmp_install_signals(TRUE);
6803 #endif /* KMP_OS_WINDOWS */
6804 #endif
6805 
6806   /* we have finished the serial initialization */
6807   __kmp_init_counter++;
6808 
6809   __kmp_init_serial = TRUE;
6810 
6811   if (__kmp_settings) {
6812     __kmp_env_print();
6813   }
6814 
6815 #if OMP_40_ENABLED
6816   if (__kmp_display_env || __kmp_display_env_verbose) {
6817     __kmp_env_print_2();
6818   }
6819 #endif // OMP_40_ENABLED
6820 
6821 #if OMPT_SUPPORT
6822   ompt_post_init();
6823 #endif
6824 
6825   KMP_MB();
6826 
6827   KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
6828 }
6829 
6830 void __kmp_serial_initialize(void) {
6831   if (__kmp_init_serial) {
6832     return;
6833   }
6834   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6835   if (__kmp_init_serial) {
6836     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6837     return;
6838   }
6839   __kmp_do_serial_initialize();
6840   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6841 }
6842 
6843 static void __kmp_do_middle_initialize(void) {
6844   int i, j;
6845   int prev_dflt_team_nth;
6846 
6847   if (!__kmp_init_serial) {
6848     __kmp_do_serial_initialize();
6849   }
6850 
6851   KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
6852 
6853   // Save the previous value for the __kmp_dflt_team_nth so that
6854   // we can avoid some reinitialization if it hasn't changed.
6855   prev_dflt_team_nth = __kmp_dflt_team_nth;
6856 
6857 #if KMP_AFFINITY_SUPPORTED
6858   // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6859   // number of cores on the machine.
6860   __kmp_affinity_initialize();
6861 
6862   // Run through the __kmp_threads array and set the affinity mask
6863   // for each root thread that is currently registered with the RTL.
6864   for (i = 0; i < __kmp_threads_capacity; i++) {
6865     if (TCR_PTR(__kmp_threads[i]) != NULL) {
6866       __kmp_affinity_set_init_mask(i, TRUE);
6867     }
6868   }
6869 #endif /* KMP_AFFINITY_SUPPORTED */
6870 
6871   KMP_ASSERT(__kmp_xproc > 0);
6872   if (__kmp_avail_proc == 0) {
6873     __kmp_avail_proc = __kmp_xproc;
6874   }
6875 
6876   // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
6877   // correct them now
6878   j = 0;
6879   while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
6880     __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
6881         __kmp_avail_proc;
6882     j++;
6883   }
6884 
6885   if (__kmp_dflt_team_nth == 0) {
6886 #ifdef KMP_DFLT_NTH_CORES
6887     // Default #threads = #cores
6888     __kmp_dflt_team_nth = __kmp_ncores;
6889     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6890                   "__kmp_ncores (%d)\n",
6891                   __kmp_dflt_team_nth));
6892 #else
6893     // Default #threads = #available OS procs
6894     __kmp_dflt_team_nth = __kmp_avail_proc;
6895     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6896                   "__kmp_avail_proc(%d)\n",
6897                   __kmp_dflt_team_nth));
6898 #endif /* KMP_DFLT_NTH_CORES */
6899   }
6900 
6901   if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
6902     __kmp_dflt_team_nth = KMP_MIN_NTH;
6903   }
6904   if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
6905     __kmp_dflt_team_nth = __kmp_sys_max_nth;
6906   }
6907 
6908   // There's no harm in continuing if the following check fails,
6909   // but it indicates an error in the previous logic.
6910   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
6911 
6912   if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
6913     // Run through the __kmp_threads array and set the num threads icv for each
6914     // root thread that is currently registered with the RTL (which has not
6915     // already explicitly set its nthreads-var with a call to
6916     // omp_set_num_threads()).
6917     for (i = 0; i < __kmp_threads_capacity; i++) {
6918       kmp_info_t *thread = __kmp_threads[i];
6919       if (thread == NULL)
6920         continue;
6921       if (thread->th.th_current_task->td_icvs.nproc != 0)
6922         continue;
6923 
6924       set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
6925     }
6926   }
6927   KA_TRACE(
6928       20,
6929       ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
6930        __kmp_dflt_team_nth));
6931 
6932 #ifdef KMP_ADJUST_BLOCKTIME
6933   /* Adjust blocktime to zero if necessary  now that __kmp_avail_proc is set */
6934   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6935     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6936     if (__kmp_nth > __kmp_avail_proc) {
6937       __kmp_zero_bt = TRUE;
6938     }
6939   }
6940 #endif /* KMP_ADJUST_BLOCKTIME */
6941 
6942   /* we have finished middle initialization */
6943   TCW_SYNC_4(__kmp_init_middle, TRUE);
6944 
6945   KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
6946 }
6947 
6948 void __kmp_middle_initialize(void) {
6949   if (__kmp_init_middle) {
6950     return;
6951   }
6952   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6953   if (__kmp_init_middle) {
6954     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6955     return;
6956   }
6957   __kmp_do_middle_initialize();
6958   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6959 }
6960 
6961 void __kmp_parallel_initialize(void) {
6962   int gtid = __kmp_entry_gtid(); // this might be a new root
6963 
6964   /* synchronize parallel initialization (for sibling) */
6965   if (TCR_4(__kmp_init_parallel))
6966     return;
6967   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6968   if (TCR_4(__kmp_init_parallel)) {
6969     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6970     return;
6971   }
6972 
6973   /* TODO reinitialization after we have already shut down */
6974   if (TCR_4(__kmp_global.g.g_done)) {
6975     KA_TRACE(
6976         10,
6977         ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
6978     __kmp_infinite_loop();
6979   }
6980 
  /* jc: The lock __kmp_initz_lock is already held, so calling
     __kmp_serial_initialize or __kmp_middle_initialize would deadlock.  So we
     call __kmp_do_middle_initialize (which runs the serial initialization
     first, if needed) directly. */
6984   if (!__kmp_init_middle) {
6985     __kmp_do_middle_initialize();
6986   }
6987 
6988 #if OMP_50_ENABLED
6989   __kmp_resume_if_hard_paused();
6990 #endif
6991 
6992   /* begin initialization */
6993   KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
6994   KMP_ASSERT(KMP_UBER_GTID(gtid));
6995 
6996 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6997   // Save the FP control regs.
6998   // Worker threads will set theirs to these values at thread startup.
6999   __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7000   __kmp_store_mxcsr(&__kmp_init_mxcsr);
7001   __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7002 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7003 
7004 #if KMP_OS_UNIX
7005 #if KMP_HANDLE_SIGNALS
7006   /*  must be after __kmp_serial_initialize  */
7007   __kmp_install_signals(TRUE);
7008 #endif
7009 #endif
7010 
7011   __kmp_suspend_initialize();
7012 
7013 #if defined(USE_LOAD_BALANCE)
7014   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7015     __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7016   }
7017 #else
7018   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7019     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7020   }
7021 #endif
7022 
7023   if (__kmp_version) {
7024     __kmp_print_version_2();
7025   }
7026 
7027   /* we have finished parallel initialization */
7028   TCW_SYNC_4(__kmp_init_parallel, TRUE);
7029 
7030   KMP_MB();
7031   KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7032 
7033   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7034 }
7035 
7036 /* ------------------------------------------------------------------------ */
7037 
7038 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7039                                    kmp_team_t *team) {
7040   kmp_disp_t *dispatch;
7041 
7042   KMP_MB();
7043 
7044   /* none of the threads have encountered any constructs, yet. */
7045   this_thr->th.th_local.this_construct = 0;
7046 #if KMP_CACHE_MANAGE
7047   KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7048 #endif /* KMP_CACHE_MANAGE */
7049   dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7050   KMP_DEBUG_ASSERT(dispatch);
7051   KMP_DEBUG_ASSERT(team->t.t_dispatch);
7052   // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7053   // this_thr->th.th_info.ds.ds_tid ] );
7054 
7055   dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7056 #if OMP_45_ENABLED
7057   dispatch->th_doacross_buf_idx =
7058       0; /* reset the doacross dispatch buffer counter */
7059 #endif
7060   if (__kmp_env_consistency_check)
7061     __kmp_push_parallel(gtid, team->t.t_ident);
7062 
7063   KMP_MB(); /* Flush all pending memory write invalidates.  */
7064 }
7065 
7066 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7067                                   kmp_team_t *team) {
7068   if (__kmp_env_consistency_check)
7069     __kmp_pop_parallel(gtid, team->t.t_ident);
7070 
7071   __kmp_finish_implicit_task(this_thr);
7072 }
7073 
7074 int __kmp_invoke_task_func(int gtid) {
7075   int rc;
7076   int tid = __kmp_tid_from_gtid(gtid);
7077   kmp_info_t *this_thr = __kmp_threads[gtid];
7078   kmp_team_t *team = this_thr->th.th_team;
7079 
7080   __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7081 #if USE_ITT_BUILD
7082   if (__itt_stack_caller_create_ptr) {
7083     __kmp_itt_stack_callee_enter(
7084         (__itt_caller)
7085             team->t.t_stack_id); // inform ittnotify about entering user's code
7086   }
7087 #endif /* USE_ITT_BUILD */
7088 #if INCLUDE_SSC_MARKS
7089   SSC_MARK_INVOKING();
7090 #endif
7091 
7092 #if OMPT_SUPPORT
7093   void *dummy;
7094   void **exit_runtime_p;
7095   ompt_data_t *my_task_data;
7096   ompt_data_t *my_parallel_data;
7097   int ompt_team_size;
7098 
7099   if (ompt_enabled.enabled) {
7100     exit_runtime_p = &(
7101         team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_frame.ptr);
7102   } else {
7103     exit_runtime_p = &dummy;
7104   }
7105 
7106   my_task_data =
7107       &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7108   my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7109   if (ompt_enabled.ompt_callback_implicit_task) {
7110     ompt_team_size = team->t.t_nproc;
7111     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7112         ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7113         __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7114     OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7115   }
7116 #endif
7117 
7118   {
7119     KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
7120     KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
7121     rc =
7122         __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7123                                tid, (int)team->t.t_argc, (void **)team->t.t_argv
7124 #if OMPT_SUPPORT
7125                                ,
7126                                exit_runtime_p
7127 #endif
7128                                );
7129 #if OMPT_SUPPORT
7130     *exit_runtime_p = NULL;
7131 #endif
7132   }
7133 
7134 #if USE_ITT_BUILD
7135   if (__itt_stack_caller_create_ptr) {
7136     __kmp_itt_stack_callee_leave(
7137         (__itt_caller)
7138             team->t.t_stack_id); // inform ittnotify about leaving user's code
7139   }
7140 #endif /* USE_ITT_BUILD */
7141   __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7142 
7143   return rc;
7144 }
7145 
7146 #if OMP_40_ENABLED
7147 void __kmp_teams_master(int gtid) {
  // This routine is called by all master threads in a teams construct
7149   kmp_info_t *thr = __kmp_threads[gtid];
7150   kmp_team_t *team = thr->th.th_team;
7151   ident_t *loc = team->t.t_ident;
7152   thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7153   KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7154   KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7155   KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7156                 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7157 
7158   // This thread is a new CG root.  Set up the proper variables.
7159   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7160   tmp->cg_root = thr; // Make thr the CG root
7161   // Init to thread limit that was stored when league masters were forked
7162   tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7163   tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7164   KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7165                  " cg_threads to 1\n",
7166                  thr, tmp));
7167   tmp->up = thr->th.th_cg_roots;
7168   thr->th.th_cg_roots = tmp;
7169 
// Launch the league of teams now, but do not let the workers execute
// (they hang on the fork barrier until the next parallel region)
7172 #if INCLUDE_SSC_MARKS
7173   SSC_MARK_FORKING();
7174 #endif
7175   __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7176                   (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7177                   VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7178 #if INCLUDE_SSC_MARKS
7179   SSC_MARK_JOINING();
7180 #endif
7181   // If the team size was reduced from the limit, set it to the new size
7182   if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7183     thr->th.th_teams_size.nth = thr->th.th_team_nproc;
  // AC: the last parameter "1" eliminates the join barrier, which won't work
  // because the worker threads are waiting at the fork barrier for more
  // parallel regions
7186   __kmp_join_call(loc, gtid
7187 #if OMPT_SUPPORT
7188                   ,
7189                   fork_context_intel
7190 #endif
7191                   ,
7192                   1);
7193 }
7194 
7195 int __kmp_invoke_teams_master(int gtid) {
7196   kmp_info_t *this_thr = __kmp_threads[gtid];
7197   kmp_team_t *team = this_thr->th.th_team;
7198 #if KMP_DEBUG
7199   if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7200     KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7201                      (void *)__kmp_teams_master);
7202 #endif
7203   __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7204   __kmp_teams_master(gtid);
7205   __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7206   return 1;
7207 }
7208 #endif /* OMP_40_ENABLED */
7209 
/* This sets the requested number of threads for the next parallel region
   encountered by this team. Since this should be enclosed in the forkjoin
   critical section, it avoids race conditions with asymmetric nested
   parallelism. */
7214 
7215 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7216   kmp_info_t *thr = __kmp_threads[gtid];
7217 
7218   if (num_threads > 0)
7219     thr->th.th_set_nproc = num_threads;
7220 }
7221 
7222 #if OMP_40_ENABLED
7223 
7224 /* this sets the requested number of teams for the teams region and/or
7225    the number of threads for the next parallel region encountered  */
7226 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7227                           int num_threads) {
7228   kmp_info_t *thr = __kmp_threads[gtid];
7229   KMP_DEBUG_ASSERT(num_teams >= 0);
7230   KMP_DEBUG_ASSERT(num_threads >= 0);
7231 
7232   if (num_teams == 0)
7233     num_teams = 1; // default number of teams is 1.
  if (num_teams > __kmp_teams_max_nth) { // if too many teams were requested
7235     if (!__kmp_reserve_warn) {
7236       __kmp_reserve_warn = 1;
7237       __kmp_msg(kmp_ms_warning,
7238                 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7239                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7240     }
7241     num_teams = __kmp_teams_max_nth;
7242   }
7243   // Set number of teams (number of threads in the outer "parallel" of the
7244   // teams)
7245   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7246 
7247   // Remember the number of threads for inner parallel regions
7248   if (num_threads == 0) {
7249     if (!TCR_4(__kmp_init_middle))
7250       __kmp_middle_initialize(); // get __kmp_avail_proc calculated
7251     num_threads = __kmp_avail_proc / num_teams;
7252     if (num_teams * num_threads > __kmp_teams_max_nth) {
      // adjust num_threads w/o warning as it is not a user setting
7254       num_threads = __kmp_teams_max_nth / num_teams;
7255     }
7256   } else {
7257     // This thread will be the master of the league masters
7258     // Store new thread limit; old limit is saved in th_cg_roots list
7259     thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7260 
7261     if (num_teams * num_threads > __kmp_teams_max_nth) {
7262       int new_threads = __kmp_teams_max_nth / num_teams;
7263       if (!__kmp_reserve_warn) { // user asked for too many threads
7264         __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7265         __kmp_msg(kmp_ms_warning,
7266                   KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7267                   KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7268       }
7269       num_threads = new_threads;
7270     }
7271   }
7272   thr->th.th_teams_size.nth = num_threads;
7273 }
7274 
7275 // Set the proc_bind var to use in the following parallel region.
7276 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7277   kmp_info_t *thr = __kmp_threads[gtid];
7278   thr->th.th_set_proc_bind = proc_bind;
7279 }
7280 
7281 #endif /* OMP_40_ENABLED */
7282 
7283 /* Launch the worker threads into the microtask. */
7284 
7285 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7286   kmp_info_t *this_thr = __kmp_threads[gtid];
7287 
7288 #ifdef KMP_DEBUG
7289   int f;
7290 #endif /* KMP_DEBUG */
7291 
7292   KMP_DEBUG_ASSERT(team);
7293   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7294   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7295   KMP_MB(); /* Flush all pending memory write invalidates.  */
7296 
7297   team->t.t_construct = 0; /* no single directives seen yet */
7298   team->t.t_ordered.dt.t_value =
7299       0; /* thread 0 enters the ordered section first */
7300 
7301   /* Reset the identifiers on the dispatch buffer */
7302   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7303   if (team->t.t_max_nproc > 1) {
7304     int i;
7305     for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7306       team->t.t_disp_buffer[i].buffer_index = i;
7307 #if OMP_45_ENABLED
7308       team->t.t_disp_buffer[i].doacross_buf_idx = i;
7309 #endif
7310     }
7311   } else {
7312     team->t.t_disp_buffer[0].buffer_index = 0;
7313 #if OMP_45_ENABLED
7314     team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7315 #endif
7316   }
7317 
7318   KMP_MB(); /* Flush all pending memory write invalidates.  */
7319   KMP_ASSERT(this_thr->th.th_team == team);
7320 
7321 #ifdef KMP_DEBUG
7322   for (f = 0; f < team->t.t_nproc; f++) {
7323     KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7324                      team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7325   }
7326 #endif /* KMP_DEBUG */
7327 
7328   /* release the worker threads so they may begin working */
7329   __kmp_fork_barrier(gtid, 0);
7330 }
7331 
7332 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7333   kmp_info_t *this_thr = __kmp_threads[gtid];
7334 
7335   KMP_DEBUG_ASSERT(team);
7336   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7337   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7338   KMP_MB(); /* Flush all pending memory write invalidates.  */
7339 
7340 /* Join barrier after fork */
7341 
7342 #ifdef KMP_DEBUG
7343   if (__kmp_threads[gtid] &&
7344       __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7345     __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7346                  __kmp_threads[gtid]);
7347     __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7348                  "team->t.t_nproc=%d\n",
7349                  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7350                  team->t.t_nproc);
7351     __kmp_print_structure();
7352   }
7353   KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7354                    __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7355 #endif /* KMP_DEBUG */
7356 
7357   __kmp_join_barrier(gtid); /* wait for everyone */
7358 #if OMPT_SUPPORT
7359   if (ompt_enabled.enabled &&
7360       this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
7361     int ds_tid = this_thr->th.th_info.ds.ds_tid;
7362     ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7363     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
7364 #if OMPT_OPTIONAL
7365     void *codeptr = NULL;
7366     if (KMP_MASTER_TID(ds_tid) &&
7367         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7368          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7369       codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7370 
7371     if (ompt_enabled.ompt_callback_sync_region_wait) {
7372       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7373           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7374           codeptr);
7375     }
7376     if (ompt_enabled.ompt_callback_sync_region) {
7377       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7378           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7379           codeptr);
7380     }
7381 #endif
7382     if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7383       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7384           ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7385     }
7386   }
7387 #endif
7388 
7389   KMP_MB(); /* Flush all pending memory write invalidates.  */
7390   KMP_ASSERT(this_thr->th.th_team == team);
7391 }
7392 
7393 /* ------------------------------------------------------------------------ */
7394 
7395 #ifdef USE_LOAD_BALANCE
7396 
// Return the number of worker threads actively spinning in the hot team if we
// are at the outermost level of parallelism.  Otherwise, return 0.
7399 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7400   int i;
7401   int retval;
7402   kmp_team_t *hot_team;
7403 
7404   if (root->r.r_active) {
7405     return 0;
7406   }
7407   hot_team = root->r.r_hot_team;
7408   if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7409     return hot_team->t.t_nproc - 1; // Don't count master thread
7410   }
7411 
7412   // Skip the master thread - it is accounted for elsewhere.
7413   retval = 0;
7414   for (i = 1; i < hot_team->t.t_nproc; i++) {
7415     if (hot_team->t.t_threads[i]->th.th_active) {
7416       retval++;
7417     }
7418   }
7419   return retval;
7420 }
7421 
7422 // Perform an automatic adjustment to the number of
7423 // threads used by the next parallel region.
7424 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7425   int retval;
7426   int pool_active;
7427   int hot_team_active;
7428   int team_curr_active;
7429   int system_active;
7430 
7431   KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7432                 set_nproc));
7433   KMP_DEBUG_ASSERT(root);
7434   KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7435                        ->th.th_current_task->td_icvs.dynamic == TRUE);
7436   KMP_DEBUG_ASSERT(set_nproc > 1);
7437 
7438   if (set_nproc == 1) {
7439     KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7440     return 1;
7441   }
7442 
7443   // Threads that are active in the thread pool, active in the hot team for this
7444   // particular root (if we are at the outer par level), and the currently
7445   // executing thread (to become the master) are available to add to the new
7446   // team, but are currently contributing to the system load, and must be
7447   // accounted for.
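  // Worked example (illustrative numbers): with __kmp_avail_proc = 8, two
  // threads idle in the pool and one extra worker active in the hot team,
  // team_curr_active = 2 + 1 + 1 = 4.  If the system then reports 6 active
  // threads overall, the code below computes retval = 8 - 6 + 4 = 6 and
  // clamps it to [KMP_MIN_NTH, set_nproc].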
7448   pool_active = __kmp_thread_pool_active_nth;
7449   hot_team_active = __kmp_active_hot_team_nproc(root);
7450   team_curr_active = pool_active + hot_team_active + 1;
7451 
7452   // Check the system load.
7453   system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7454   KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7455                 "hot team active = %d\n",
7456                 system_active, pool_active, hot_team_active));
7457 
7458   if (system_active < 0) {
7459     // There was an error reading the necessary info from /proc, so use the
7460     // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7461     // = dynamic_thread_limit, we shouldn't wind up getting back here.
7462     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7463     KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7464 
7465     // Make this call behave like the thread limit algorithm.
7466     retval = __kmp_avail_proc - __kmp_nth +
7467              (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7468     if (retval > set_nproc) {
7469       retval = set_nproc;
7470     }
7471     if (retval < KMP_MIN_NTH) {
7472       retval = KMP_MIN_NTH;
7473     }
7474 
7475     KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7476                   retval));
7477     return retval;
7478   }
7479 
  // There is a slight delay in the load balance algorithm in detecting new
  // running procs. The real system load at this instant should be at least as
  // large as the number of active OMP threads available to add to the team.
7483   if (system_active < team_curr_active) {
7484     system_active = team_curr_active;
7485   }
7486   retval = __kmp_avail_proc - system_active + team_curr_active;
7487   if (retval > set_nproc) {
7488     retval = set_nproc;
7489   }
7490   if (retval < KMP_MIN_NTH) {
7491     retval = KMP_MIN_NTH;
7492   }
7493 
7494   KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7495   return retval;
7496 } // __kmp_load_balance_nproc()
7497 
7498 #endif /* USE_LOAD_BALANCE */
7499 
7500 /* ------------------------------------------------------------------------ */
7501 
7502 /* NOTE: this is called with the __kmp_init_lock held */
7503 void __kmp_cleanup(void) {
7504   int f;
7505 
7506   KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7507 
7508   if (TCR_4(__kmp_init_parallel)) {
7509 #if KMP_HANDLE_SIGNALS
7510     __kmp_remove_signals();
7511 #endif
7512     TCW_4(__kmp_init_parallel, FALSE);
7513   }
7514 
7515   if (TCR_4(__kmp_init_middle)) {
7516 #if KMP_AFFINITY_SUPPORTED
7517     __kmp_affinity_uninitialize();
7518 #endif /* KMP_AFFINITY_SUPPORTED */
7519     __kmp_cleanup_hierarchy();
7520     TCW_4(__kmp_init_middle, FALSE);
7521   }
7522 
7523   KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7524 
7525   if (__kmp_init_serial) {
7526     __kmp_runtime_destroy();
7527     __kmp_init_serial = FALSE;
7528   }
7529 
7530   __kmp_cleanup_threadprivate_caches();
7531 
7532   for (f = 0; f < __kmp_threads_capacity; f++) {
7533     if (__kmp_root[f] != NULL) {
7534       __kmp_free(__kmp_root[f]);
7535       __kmp_root[f] = NULL;
7536     }
7537   }
7538   __kmp_free(__kmp_threads);
  // __kmp_threads and __kmp_root were allocated at once, as a single block,
  // so there is no need to free __kmp_root separately.
7541   __kmp_threads = NULL;
7542   __kmp_root = NULL;
7543   __kmp_threads_capacity = 0;
7544 
7545 #if KMP_USE_DYNAMIC_LOCK
7546   __kmp_cleanup_indirect_user_locks();
7547 #else
7548   __kmp_cleanup_user_locks();
7549 #endif
7550 
7551 #if KMP_AFFINITY_SUPPORTED
7552   KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7553   __kmp_cpuinfo_file = NULL;
7554 #endif /* KMP_AFFINITY_SUPPORTED */
7555 
7556 #if KMP_USE_ADAPTIVE_LOCKS
7557 #if KMP_DEBUG_ADAPTIVE_LOCKS
7558   __kmp_print_speculative_stats();
7559 #endif
7560 #endif
7561   KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7562   __kmp_nested_nth.nth = NULL;
7563   __kmp_nested_nth.size = 0;
7564   __kmp_nested_nth.used = 0;
7565   KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7566   __kmp_nested_proc_bind.bind_types = NULL;
7567   __kmp_nested_proc_bind.size = 0;
7568   __kmp_nested_proc_bind.used = 0;
7569 #if OMP_50_ENABLED
7570   if (__kmp_affinity_format) {
7571     KMP_INTERNAL_FREE(__kmp_affinity_format);
7572     __kmp_affinity_format = NULL;
7573   }
7574 #endif
7575 
7576   __kmp_i18n_catclose();
7577 
7578 #if KMP_USE_HIER_SCHED
7579   __kmp_hier_scheds.deallocate();
7580 #endif
7581 
7582 #if KMP_STATS_ENABLED
7583   __kmp_stats_fini();
7584 #endif
7585 
7586   KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7587 }
7588 
7589 /* ------------------------------------------------------------------------ */
7590 
7591 int __kmp_ignore_mppbeg(void) {
7592   char *env;
7593 
7594   if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7595     if (__kmp_str_match_false(env))
7596       return FALSE;
7597   }
7598   // By default __kmpc_begin() is no-op.
7599   return TRUE;
7600 }
7601 
7602 int __kmp_ignore_mppend(void) {
7603   char *env;
7604 
7605   if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7606     if (__kmp_str_match_false(env))
7607       return FALSE;
7608   }
7609   // By default __kmpc_end() is no-op.
7610   return TRUE;
7611 }
7612 
7613 void __kmp_internal_begin(void) {
7614   int gtid;
7615   kmp_root_t *root;
7616 
  /* This is a very important step as it registers new sibling threads and
     assigns each new uber thread a gtid. */
7619   gtid = __kmp_entry_gtid();
7620   root = __kmp_threads[gtid]->th.th_root;
7621   KMP_ASSERT(KMP_UBER_GTID(gtid));
7622 
7623   if (root->r.r_begin)
7624     return;
7625   __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7626   if (root->r.r_begin) {
7627     __kmp_release_lock(&root->r.r_begin_lock, gtid);
7628     return;
7629   }
7630 
7631   root->r.r_begin = TRUE;
7632 
7633   __kmp_release_lock(&root->r.r_begin_lock, gtid);
7634 }
7635 
7636 /* ------------------------------------------------------------------------ */
7637 
7638 void __kmp_user_set_library(enum library_type arg) {
7639   int gtid;
7640   kmp_root_t *root;
7641   kmp_info_t *thread;
7642 
7643   /* first, make sure we are initialized so we can get our gtid */
7644 
7645   gtid = __kmp_entry_gtid();
7646   thread = __kmp_threads[gtid];
7647 
7648   root = thread->th.th_root;
7649 
7650   KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7651                 library_serial));
7652   if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7653                                   thread */
7654     KMP_WARNING(SetLibraryIncorrectCall);
7655     return;
7656   }
7657 
7658   switch (arg) {
7659   case library_serial:
7660     thread->th.th_set_nproc = 0;
7661     set__nproc(thread, 1);
7662     break;
7663   case library_turnaround:
7664     thread->th.th_set_nproc = 0;
7665     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7666                                            : __kmp_dflt_team_nth_ub);
7667     break;
7668   case library_throughput:
7669     thread->th.th_set_nproc = 0;
7670     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7671                                            : __kmp_dflt_team_nth_ub);
7672     break;
7673   default:
7674     KMP_FATAL(UnknownLibraryType, arg);
7675   }
7676 
7677   __kmp_aux_set_library(arg);
7678 }
7679 
7680 void __kmp_aux_set_stacksize(size_t arg) {
7681   if (!__kmp_init_serial)
7682     __kmp_serial_initialize();
7683 
7684 #if KMP_OS_DARWIN
7685   if (arg & (0x1000 - 1)) {
7686     arg &= ~(0x1000 - 1);
7687     if (arg + 0x1000) /* check for overflow if we round up */
7688       arg += 0x1000;
7689   }
7690 #endif
7691   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7692 
7693   /* only change the default stacksize before the first parallel region */
7694   if (!TCR_4(__kmp_init_parallel)) {
7695     size_t value = arg; /* argument is in bytes */
7696 
7697     if (value < __kmp_sys_min_stksize)
7698       value = __kmp_sys_min_stksize;
7699     else if (value > KMP_MAX_STKSIZE)
7700       value = KMP_MAX_STKSIZE;
7701 
7702     __kmp_stksize = value;
7703 
7704     __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
7705   }
7706 
7707   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7708 }
7709 
7710 /* set the behaviour of the runtime library */
7711 /* TODO this can cause some odd behaviour with sibling parallelism... */
7712 void __kmp_aux_set_library(enum library_type arg) {
7713   __kmp_library = arg;
7714 
7715   switch (__kmp_library) {
7716   case library_serial: {
7717     KMP_INFORM(LibraryIsSerial);
7718   } break;
7719   case library_turnaround:
7720     if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
7721       __kmp_use_yield = 2; // only yield when oversubscribed
7722     break;
7723   case library_throughput:
7724     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
7725       __kmp_dflt_blocktime = 200;
7726     break;
7727   default:
7728     KMP_FATAL(UnknownLibraryType, arg);
7729   }
7730 }
7731 
/* Get team information common to all teams API routines. */
// Returns NULL if not in a teams construct
7734 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
7735   kmp_info_t *thr = __kmp_entry_thread();
7736   teams_serialized = 0;
7737   if (thr->th.th_teams_microtask) {
7738     kmp_team_t *team = thr->th.th_team;
7739     int tlevel = thr->th.th_teams_level; // the level of the teams construct
7740     int ii = team->t.t_level;
7741     teams_serialized = team->t.t_serialized;
7742     int level = tlevel + 1;
7743     KMP_DEBUG_ASSERT(ii >= tlevel);
7744     while (ii > level) {
7745       for (teams_serialized = team->t.t_serialized;
7746            (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
7747       }
7748       if (team->t.t_serialized && (!teams_serialized)) {
7749         team = team->t.t_parent;
7750         continue;
7751       }
7752       if (ii > level) {
7753         team = team->t.t_parent;
7754         ii--;
7755       }
7756     }
7757     return team;
7758   }
7759   return NULL;
7760 }
7761 
7762 int __kmp_aux_get_team_num() {
7763   int serialized;
7764   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7765   if (team) {
7766     if (serialized > 1) {
7767       return 0; // teams region is serialized ( 1 team of 1 thread ).
7768     } else {
7769       return team->t.t_master_tid;
7770     }
7771   }
7772   return 0;
7773 }
7774 
7775 int __kmp_aux_get_num_teams() {
7776   int serialized;
7777   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7778   if (team) {
7779     if (serialized > 1) {
7780       return 1;
7781     } else {
7782       return team->t.t_parent->t.t_nproc;
7783     }
7784   }
7785   return 1;
7786 }
7787 
7788 /* ------------------------------------------------------------------------ */
7789 
7790 #if OMP_50_ENABLED
7791 /*
7792  * Affinity Format Parser
7793  *
7794  * Field is in form of: %[[[0].]size]type
7795  * % and type are required (%% means print a literal '%')
7796  * type is either single char or long name surrounded by {},
7797  * e.g., N or {num_threads}
7798  * 0 => leading zeros
7799  * . => right justified when size is specified
7800  * by default output is left justified
7801  * size is the *minimum* field length
7802  * All other characters are printed as is
7803  *
7804  * Available field types:
 * t {team_num}          - omp_get_team_num()
 * T {num_teams}         - omp_get_num_teams()
 * L {nesting_level}     - omp_get_level()
 * n {thread_num}        - omp_get_thread_num()
 * N {num_threads}       - omp_get_num_threads()
 * a {ancestor_tnum}     - omp_get_ancestor_thread_num(omp_get_level()-1)
 * H {host}              - name of host machine
 * P {process_id}        - process id (integer)
 * i {native_thread_id}  - native thread identifier (integer)
 * A {thread_affinity}   - comma separated list of integers or integer ranges
 *                         (values of affinity mask)
7814  *
7815  * Implementation-specific field types can be added
7816  * If a type is unknown, print "undefined"
7817 */
7818 
// Structure holding the short name, long name, and corresponding data type
// for snprintf.  A table of these represents the entire set of valid keyword
// field types.
7822 typedef struct kmp_affinity_format_field_t {
7823   char short_name; // from spec e.g., L -> thread level
7824   const char *long_name; // from spec thread_level -> thread level
7825   char field_format; // data type for snprintf (typically 'd' or 's'
7826   // for integer or string)
7827 } kmp_affinity_format_field_t;
7828 
7829 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
7830 #if KMP_AFFINITY_SUPPORTED
7831     {'A', "thread_affinity", 's'},
7832 #endif
7833     {'t', "team_num", 'd'},
7834     {'T', "num_teams", 'd'},
7835     {'L', "nesting_level", 'd'},
7836     {'n', "thread_num", 'd'},
7837     {'N', "num_threads", 'd'},
7838     {'a', "ancestor_tnum", 'd'},
7839     {'H', "host", 's'},
7840     {'P', "process_id", 'd'},
7841     {'i', "native_thread_id", 'd'}};
7842 
// Return the number of characters it takes to hold the field
7844 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
7845                                             const char **ptr,
7846                                             kmp_str_buf_t *field_buffer) {
7847   int rc, format_index, field_value;
7848   const char *width_left, *width_right;
7849   bool pad_zeros, right_justify, parse_long_name, found_valid_name;
7850   static const int FORMAT_SIZE = 20;
7851   char format[FORMAT_SIZE] = {0};
7852   char absolute_short_name = 0;
7853 
7854   KMP_DEBUG_ASSERT(gtid >= 0);
7855   KMP_DEBUG_ASSERT(th);
7856   KMP_DEBUG_ASSERT(**ptr == '%');
7857   KMP_DEBUG_ASSERT(field_buffer);
7858 
7859   __kmp_str_buf_clear(field_buffer);
7860 
7861   // Skip the initial %
7862   (*ptr)++;
7863 
7864   // Check for %% first
7865   if (**ptr == '%') {
7866     __kmp_str_buf_cat(field_buffer, "%", 1);
7867     (*ptr)++; // skip over the second %
7868     return 1;
7869   }
7870 
7871   // Parse field modifiers if they are present
7872   pad_zeros = false;
7873   if (**ptr == '0') {
7874     pad_zeros = true;
7875     (*ptr)++; // skip over 0
7876   }
7877   right_justify = false;
7878   if (**ptr == '.') {
7879     right_justify = true;
7880     (*ptr)++; // skip over .
7881   }
7882   // Parse width of field: [width_left, width_right)
7883   width_left = width_right = NULL;
7884   if (**ptr >= '0' && **ptr <= '9') {
7885     width_left = *ptr;
7886     SKIP_DIGITS(*ptr);
7887     width_right = *ptr;
7888   }
7889 
7890   // Create the format for KMP_SNPRINTF based on flags parsed above
7891   format_index = 0;
7892   format[format_index++] = '%';
7893   if (!right_justify)
7894     format[format_index++] = '-';
7895   if (pad_zeros)
7896     format[format_index++] = '0';
7897   if (width_left && width_right) {
7898     int i = 0;
7899     // Only allow 8 digit number widths.
7900     // This also prevents overflowing format variable
7901     while (i < 8 && width_left < width_right) {
7902       format[format_index++] = *width_left;
7903       width_left++;
7904       i++;
7905     }
7906   }
7907 
7908   // Parse a name (long or short)
7909   // Canonicalize the name into absolute_short_name
7910   found_valid_name = false;
7911   parse_long_name = (**ptr == '{');
7912   if (parse_long_name)
7913     (*ptr)++; // skip initial left brace
7914   for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
7915                              sizeof(__kmp_affinity_format_table[0]);
7916        ++i) {
7917     char short_name = __kmp_affinity_format_table[i].short_name;
7918     const char *long_name = __kmp_affinity_format_table[i].long_name;
7919     char field_format = __kmp_affinity_format_table[i].field_format;
7920     if (parse_long_name) {
7921       int length = KMP_STRLEN(long_name);
7922       if (strncmp(*ptr, long_name, length) == 0) {
7923         found_valid_name = true;
7924         (*ptr) += length; // skip the long name
7925       }
7926     } else if (**ptr == short_name) {
7927       found_valid_name = true;
7928       (*ptr)++; // skip the short name
7929     }
7930     if (found_valid_name) {
7931       format[format_index++] = field_format;
7932       format[format_index++] = '\0';
7933       absolute_short_name = short_name;
7934       break;
7935     }
7936   }
7937   if (parse_long_name) {
7938     if (**ptr != '}') {
7939       absolute_short_name = 0;
7940     } else {
7941       (*ptr)++; // skip over the right brace
7942     }
7943   }
7944 
7945   // Attempt to fill the buffer with the requested
7946   // value using snprintf within __kmp_str_buf_print()
7947   switch (absolute_short_name) {
7948   case 't':
7949     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
7950     break;
7951   case 'T':
7952     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
7953     break;
7954   case 'L':
7955     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
7956     break;
7957   case 'n':
7958     rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
7959     break;
7960   case 'H': {
7961     static const int BUFFER_SIZE = 256;
7962     char buf[BUFFER_SIZE];
7963     __kmp_expand_host_name(buf, BUFFER_SIZE);
7964     rc = __kmp_str_buf_print(field_buffer, format, buf);
7965   } break;
7966   case 'P':
7967     rc = __kmp_str_buf_print(field_buffer, format, getpid());
7968     break;
7969   case 'i':
7970     rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
7971     break;
7972   case 'N':
7973     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
7974     break;
7975   case 'a':
7976     field_value =
7977         __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
7978     rc = __kmp_str_buf_print(field_buffer, format, field_value);
7979     break;
7980 #if KMP_AFFINITY_SUPPORTED
7981   case 'A': {
7982     kmp_str_buf_t buf;
7983     __kmp_str_buf_init(&buf);
7984     __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
7985     rc = __kmp_str_buf_print(field_buffer, format, buf.str);
7986     __kmp_str_buf_free(&buf);
7987   } break;
7988 #endif
7989   default:
    // According to the spec, if an implementation does not have info for a
    // field type, then "undefined" is printed
7992     rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
7993     // Skip the field
7994     if (parse_long_name) {
7995       SKIP_TOKEN(*ptr);
7996       if (**ptr == '}')
7997         (*ptr)++;
7998     } else {
7999       (*ptr)++;
8000     }
8001   }
8002 
8003   KMP_ASSERT(format_index <= FORMAT_SIZE);
8004   return rc;
8005 }
8006 
8007 /*
 * Return the number of characters needed to hold the affinity string
 * (not including the terminating null byte).
 * The resulting string is printed to buffer, which the caller can then
 * handle afterwards.
8012 */
8013 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8014                                   kmp_str_buf_t *buffer) {
8015   const char *parse_ptr;
8016   size_t retval;
8017   const kmp_info_t *th;
8018   kmp_str_buf_t field;
8019 
8020   KMP_DEBUG_ASSERT(buffer);
8021   KMP_DEBUG_ASSERT(gtid >= 0);
8022 
8023   __kmp_str_buf_init(&field);
8024   __kmp_str_buf_clear(buffer);
8025 
8026   th = __kmp_threads[gtid];
8027   retval = 0;
8028 
8029   // If format is NULL or zero-length string, then we use
8030   // affinity-format-var ICV
8031   parse_ptr = format;
8032   if (parse_ptr == NULL || *parse_ptr == '\0') {
8033     parse_ptr = __kmp_affinity_format;
8034   }
8035   KMP_DEBUG_ASSERT(parse_ptr);
8036 
8037   while (*parse_ptr != '\0') {
8038     // Parse a field
8039     if (*parse_ptr == '%') {
8040       // Put field in the buffer
8041       int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8042       __kmp_str_buf_catbuf(buffer, &field);
8043       retval += rc;
8044     } else {
8045       // Put literal character in buffer
8046       __kmp_str_buf_cat(buffer, parse_ptr, 1);
8047       retval++;
8048       parse_ptr++;
8049     }
8050   }
8051   __kmp_str_buf_free(&field);
8052   return retval;
8053 }
8054 
8055 // Displays the affinity string to stdout
8056 void __kmp_aux_display_affinity(int gtid, const char *format) {
8057   kmp_str_buf_t buf;
8058   __kmp_str_buf_init(&buf);
8059   __kmp_aux_capture_affinity(gtid, format, &buf);
8060   __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8061   __kmp_str_buf_free(&buf);
8062 }
8063 #endif // OMP_50_ENABLED
8064 
8065 /* ------------------------------------------------------------------------ */
8066 
8067 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8068   int blocktime = arg; /* argument is in milliseconds */
8069 #if KMP_USE_MONITOR
8070   int bt_intervals;
8071 #endif
8072   int bt_set;
8073 
8074   __kmp_save_internal_controls(thread);
8075 
8076   /* Normalize and set blocktime for the teams */
8077   if (blocktime < KMP_MIN_BLOCKTIME)
8078     blocktime = KMP_MIN_BLOCKTIME;
8079   else if (blocktime > KMP_MAX_BLOCKTIME)
8080     blocktime = KMP_MAX_BLOCKTIME;
8081 
8082   set__blocktime_team(thread->th.th_team, tid, blocktime);
8083   set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8084 
8085 #if KMP_USE_MONITOR
8086   /* Calculate and set blocktime intervals for the teams */
8087   bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8088 
8089   set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8090   set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8091 #endif
8092 
  /* Record that blocktime has been explicitly set (bt_set = TRUE) */
8094   bt_set = TRUE;
8095 
8096   set__bt_set_team(thread->th.th_team, tid, bt_set);
8097   set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8098 #if KMP_USE_MONITOR
8099   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8100                 "bt_intervals=%d, monitor_updates=%d\n",
8101                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8102                 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8103                 __kmp_monitor_wakeups));
8104 #else
8105   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8106                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8107                 thread->th.th_team->t.t_id, tid, blocktime));
8108 #endif
8109 }
8110 
8111 void __kmp_aux_set_defaults(char const *str, int len) {
8112   if (!__kmp_init_serial) {
8113     __kmp_serial_initialize();
8114   }
8115   __kmp_env_initialize(str);
8116 
8117   if (__kmp_settings
8118 #if OMP_40_ENABLED
8119       || __kmp_display_env || __kmp_display_env_verbose
8120 #endif // OMP_40_ENABLED
8121       ) {
8122     __kmp_env_print();
8123   }
8124 } // __kmp_aux_set_defaults
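
// Illustrative sketch: kmp_set_defaults("KMP_BLOCKTIME=0") is assumed to call
// this routine, which re-runs __kmp_env_initialize() on the given string so
// the setting takes effect as if it had come from the environment (and the
// settings are printed when KMP_SETTINGS or OMP_DISPLAY_ENV requests it).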
8125 
8126 /* ------------------------------------------------------------------------ */
8127 /* internal fast reduction routines */
8128 
8129 PACKED_REDUCTION_METHOD_T
8130 __kmp_determine_reduction_method(
8131     ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8132     void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8133     kmp_critical_name *lck) {
8134 
  // Default reduction method: critical construct (lck != NULL, as in current
  // PAROPT).
  // If (reduce_data != NULL && reduce_func != NULL): the tree-reduction method
  // can be selected by the RTL.
  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
  // can be selected by the RTL.
  // Finally, it is up to the OpenMP RTL to decide which method to select among
  // those generated by PAROPT.
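
  // Worked example (illustrative): for
  //   #pragma omp parallel for reduction(+ : sum)
  // with a team of 16 threads on x86_64 Linux, the atomic and tree-reduction
  // methods are typically both available; since 16 exceeds the teamsize_cutoff
  // below (4, or 8 on MIC), the tree method with the reduction barrier is
  // selected.  A team of one thread always gets empty_reduce_block.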
8143 
8144   PACKED_REDUCTION_METHOD_T retval;
8145 
8146   int team_size;
8147 
8148   KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
8149   KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8150 
8151 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
8152   ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
8153 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8154 
8155   retval = critical_reduce_block;
8156 
  // another way to get the team size (with one dynamic dereference) is slower
8158   team_size = __kmp_get_team_num_threads(global_tid);
8159   if (team_size == 1) {
8160 
8161     retval = empty_reduce_block;
8162 
8163   } else {
8164 
8165     int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8166 
8167 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64
8168 
8169 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
8170     KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8171 
8172     int teamsize_cutoff = 4;
8173 
8174 #if KMP_MIC_SUPPORTED
8175     if (__kmp_mic_type != non_mic) {
8176       teamsize_cutoff = 8;
8177     }
8178 #endif
8179     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8180     if (tree_available) {
8181       if (team_size <= teamsize_cutoff) {
8182         if (atomic_available) {
8183           retval = atomic_reduce_block;
8184         }
8185       } else {
8186         retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8187       }
8188     } else if (atomic_available) {
8189       retval = atomic_reduce_block;
8190     }
8191 #else
8192 #error "Unknown or unsupported OS"
8193 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8194        // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8195 
8196 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8197 
8198 #if KMP_OS_LINUX || KMP_OS_WINDOWS || KMP_OS_HURD
8199 
8200     // basic tuning
8201 
8202     if (atomic_available) {
8203       if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8204         retval = atomic_reduce_block;
8205       }
8206     } // otherwise: use critical section
8207 
8208 #elif KMP_OS_DARWIN
8209 
8210     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8211     if (atomic_available && (num_vars <= 3)) {
8212       retval = atomic_reduce_block;
8213     } else if (tree_available) {
8214       if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8215           (reduce_size < (2000 * sizeof(kmp_real64)))) {
8216         retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8217       }
8218     } // otherwise: use critical section
8219 
8220 #else
8221 #error "Unknown or unsupported OS"
8222 #endif
8223 
8224 #else
8225 #error "Unknown or unsupported architecture"
8226 #endif
8227   }
8228 
8229   // KMP_FORCE_REDUCTION
8230 
8231   // If the team is serialized (team_size == 1), ignore the forced reduction
8232   // method and stay with the unsynchronized method (empty_reduce_block)
8233   if (__kmp_force_reduction_method != reduction_method_not_defined &&
8234       team_size != 1) {
8235 
8236     PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8237 
8238     int atomic_available, tree_available;
8239 
8240     switch ((forced_retval = __kmp_force_reduction_method)) {
8241     case critical_reduce_block:
8242       KMP_ASSERT(lck); // lck should be != 0
8243       break;
8244 
8245     case atomic_reduce_block:
8246       atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8247       if (!atomic_available) {
8248         KMP_WARNING(RedMethodNotSupported, "atomic");
8249         forced_retval = critical_reduce_block;
8250       }
8251       break;
8252 
8253     case tree_reduce_block:
8254       tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8255       if (!tree_available) {
8256         KMP_WARNING(RedMethodNotSupported, "tree");
8257         forced_retval = critical_reduce_block;
8258       } else {
8259 #if KMP_FAST_REDUCTION_BARRIER
8260         forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8261 #endif
8262       }
8263       break;
8264 
8265     default:
8266       KMP_ASSERT(0); // "unsupported method specified"
8267     }
8268 
8269     retval = forced_retval;
8270   }
8271 
8272   KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8273 
8274 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
8275 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8276 
8277   return (retval);
8278 }
8279 
8280 // this function is for testing set/get/determine reduce method
8281 kmp_int32 __kmp_get_reduce_method(void) {
8282   return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8283 }
8284 
8285 #if OMP_50_ENABLED
8286 
8287 // Soft pause sets up threads to ignore blocktime and just go to sleep.
8288 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
8289 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8290 
8291 // Hard pause shuts down the runtime completely.  Resume happens naturally when
8292 // OpenMP is used subsequently.
8293 void __kmp_hard_pause() {
8294   __kmp_pause_status = kmp_hard_paused;
8295   __kmp_internal_end_thread(-1);
8296 }
8297 
// Soft resume sets __kmp_pause_status to not paused and wakes up all threads.
8299 void __kmp_resume_if_soft_paused() {
8300   if (__kmp_pause_status == kmp_soft_paused) {
8301     __kmp_pause_status = kmp_not_paused;
8302 
8303     for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8304       kmp_info_t *thread = __kmp_threads[gtid];
8305       if (thread) { // Wake it if sleeping
8306         kmp_flag_64 fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
8307         if (fl.is_sleeping())
8308           fl.resume(gtid);
8309         else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8310           __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8311         } else { // thread holds the lock and may sleep soon
8312           do { // until either the thread sleeps, or we can get the lock
8313             if (fl.is_sleeping()) {
8314               fl.resume(gtid);
8315               break;
8316             } else if (__kmp_try_suspend_mx(thread)) {
8317               __kmp_unlock_suspend_mx(thread);
8318               break;
8319             }
8320           } while (1);
8321         }
8322       }
8323     }
8324   }
8325 }
8326 
8327 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
8328 // TODO: add warning messages
8329 int __kmp_pause_resource(kmp_pause_status_t level) {
8330   if (level == kmp_not_paused) { // requesting resume
8331     if (__kmp_pause_status == kmp_not_paused) {
8332       // error message about runtime not being paused, so can't resume
8333       return 1;
8334     } else {
8335       KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
8336                        __kmp_pause_status == kmp_hard_paused);
8337       __kmp_pause_status = kmp_not_paused;
8338       return 0;
8339     }
8340   } else if (level == kmp_soft_paused) { // requesting soft pause
8341     if (__kmp_pause_status != kmp_not_paused) {
8342       // error message about already being paused
8343       return 1;
8344     } else {
8345       __kmp_soft_pause();
8346       return 0;
8347     }
8348   } else if (level == kmp_hard_paused) { // requesting hard pause
8349     if (__kmp_pause_status != kmp_not_paused) {
8350       // error message about already being paused
8351       return 1;
8352     } else {
8353       __kmp_hard_pause();
8354       return 0;
8355     }
8356   } else {
8357     // error message about invalid level
8358     return 1;
8359   }
8360 }
8361 
8362 #endif // OMP_50_ENABLED
8363