1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #if KMP_USE_HIER_SCHED
28 #include "kmp_dispatch_hier.h"
29 #endif
30 
31 #if OMPT_SUPPORT
32 #include "ompt-specific.h"
33 #endif
34 
35 /* these are temporary issues to be dealt with */
36 #define KMP_USE_PRCTL 0
37 
38 #if KMP_OS_WINDOWS
39 #include <process.h>
40 #endif
41 
42 #include "tsan_annotations.h"
43 
44 #if defined(KMP_GOMP_COMPAT)
45 char const __kmp_version_alt_comp[] =
46     KMP_VERSION_PREFIX "alternative compiler support: yes";
47 #endif /* defined(KMP_GOMP_COMPAT) */
48 
49 char const __kmp_version_omp_api[] = KMP_VERSION_PREFIX "API version: "
50 #if OMP_50_ENABLED
51                                                         "5.0 (201611)";
52 #elif OMP_45_ENABLED
53                                                         "4.5 (201511)";
54 #elif OMP_40_ENABLED
55                                                         "4.0 (201307)";
56 #else
57                                                         "3.1 (201107)";
58 #endif
59 
60 #ifdef KMP_DEBUG
61 char const __kmp_version_lock[] =
62     KMP_VERSION_PREFIX "lock type: run time selectable";
63 #endif /* KMP_DEBUG */
64 
65 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
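// Note: KMP_MIN may evaluate its arguments more than once, so they must not
// have side effects.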
66 
67 /* ------------------------------------------------------------------------ */
68 
69 #if KMP_USE_MONITOR
70 kmp_info_t __kmp_monitor;
71 #endif
72 
73 /* Forward declarations */
74 
75 void __kmp_cleanup(void);
76 
77 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
78                                   int gtid);
79 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
80                                   kmp_internal_control_t *new_icvs,
81                                   ident_t *loc);
82 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
83 static void __kmp_partition_places(kmp_team_t *team,
84                                    int update_master_only = 0);
85 #endif
86 static void __kmp_do_serial_initialize(void);
87 void __kmp_fork_barrier(int gtid, int tid);
88 void __kmp_join_barrier(int gtid);
89 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
90                           kmp_internal_control_t *new_icvs, ident_t *loc);
91 
92 #ifdef USE_LOAD_BALANCE
93 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
94 #endif
95 
96 static int __kmp_expand_threads(int nNeed);
97 #if KMP_OS_WINDOWS
98 static int __kmp_unregister_root_other_thread(int gtid);
99 #endif
100 static void __kmp_unregister_library(void); // called by __kmp_internal_end()
101 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
102 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
103 
104 /* Calculate the identifier of the current thread */
/* A fast (and somewhat portable) way to get a unique identifier for the
   executing thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid
   yet. */
107 int __kmp_get_global_thread_id() {
108   int i;
109   kmp_info_t **other_threads;
110   size_t stack_data;
111   char *stack_addr;
112   size_t stack_size;
113   char *stack_base;
114 
115   KA_TRACE(
116       1000,
117       ("*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
118        __kmp_nth, __kmp_all_nth));
119 
  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior
     to a parallel region, this was made to return KMP_GTID_DNE to force
     serial_initialize by the caller. KMP_GTID_DNE then has to be handled at
     all call sites, or else __kmp_init_gtid must be guaranteed, for this to
     work. */
124 
125   if (!TCR_4(__kmp_init_gtid))
126     return KMP_GTID_DNE;
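  /* Three strategies are used below, in decreasing order of preference:
     1. __kmp_gtid_mode >= 3: read the gtid from native TLS data (__kmp_gtid);
     2. __kmp_gtid_mode >= 2: read it from keyed (pthread-style) TLS;
     3. otherwise: search the registered threads' recorded stack windows for
        the one containing the address of a local variable of this function. */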
127 
128 #ifdef KMP_TDATA_GTID
129   if (TCR_4(__kmp_gtid_mode) >= 3) {
130     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
131     return __kmp_gtid;
132   }
133 #endif
134   if (TCR_4(__kmp_gtid_mode) >= 2) {
135     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
136     return __kmp_gtid_get_specific();
137   }
138   KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
139 
140   stack_addr = (char *)&stack_data;
141   other_threads = __kmp_threads;
142 
  /* ATT: The code below is a source of potential bugs due to unsynchronized
     access to the __kmp_threads array. For example:
145      1. Current thread loads other_threads[i] to thr and checks it, it is
146         non-NULL.
147      2. Current thread is suspended by OS.
148      3. Another thread unregisters and finishes (debug versions of free()
149         may fill memory with something like 0xEF).
150      4. Current thread is resumed.
151      5. Current thread reads junk from *thr.
152      TODO: Fix it.  --ln  */
153 
154   for (i = 0; i < __kmp_threads_capacity; i++) {
155 
156     kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
157     if (!thr)
158       continue;
159 
160     stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
161     stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
162 
163     /* stack grows down -- search through all of the active threads */
164 
165     if (stack_addr <= stack_base) {
166       size_t stack_diff = stack_base - stack_addr;
167 
168       if (stack_diff <= stack_size) {
169         /* The only way we can be closer than the allocated */
170         /* stack size is if we are running on this thread. */
171         KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
172         return i;
173       }
174     }
175   }
176 
  /* fall back to __kmp_gtid_get_specific to try to determine our gtid */
178   KA_TRACE(1000,
179            ("*** __kmp_get_global_thread_id: internal alg. failed to find "
180             "thread, using TLS\n"));
181   i = __kmp_gtid_get_specific();
182 
183   /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */
184 
  /* if we haven't been assigned a gtid, then return the code as-is */
186   if (i < 0)
187     return i;
188 
189   /* dynamically updated stack window for uber threads to avoid get_specific
190      call */
191   if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
192     KMP_FATAL(StackOverflow, i);
193   }
194 
195   stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
196   if (stack_addr > stack_base) {
197     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
198     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
199             other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
200                 stack_base);
201   } else {
202     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
203             stack_base - stack_addr);
204   }
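  /* In either branch above, the recorded stack window of the uber thread has
     been grown just enough to cover stack_addr, so later lookups can resolve
     this thread via the stack search without a get_specific call. */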
205 
206   /* Reprint stack bounds for ubermaster since they have been refined */
207   if (__kmp_storage_map) {
208     char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
209     char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
210     __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
211                                  other_threads[i]->th.th_info.ds.ds_stacksize,
212                                  "th_%d stack (refinement)", i);
213   }
214   return i;
215 }
216 
217 int __kmp_get_global_thread_id_reg() {
218   int gtid;
219 
220   if (!__kmp_init_serial) {
221     gtid = KMP_GTID_DNE;
222   } else
223 #ifdef KMP_TDATA_GTID
224       if (TCR_4(__kmp_gtid_mode) >= 3) {
225     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
226     gtid = __kmp_gtid;
227   } else
228 #endif
229       if (TCR_4(__kmp_gtid_mode) >= 2) {
230     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
231     gtid = __kmp_gtid_get_specific();
232   } else {
233     KA_TRACE(1000,
234              ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
235     gtid = __kmp_get_global_thread_id();
236   }
237 
238   /* we must be a new uber master sibling thread */
239   if (gtid == KMP_GTID_DNE) {
240     KA_TRACE(10,
241              ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
242               "Registering a new gtid.\n"));
243     __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
244     if (!__kmp_init_serial) {
245       __kmp_do_serial_initialize();
246       gtid = __kmp_gtid_get_specific();
247     } else {
248       gtid = __kmp_register_root(FALSE);
249     }
250     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
251     /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
252   }
253 
254   KMP_DEBUG_ASSERT(gtid >= 0);
255 
256   return gtid;
257 }
258 
259 /* caller must hold forkjoin_lock */
260 void __kmp_check_stack_overlap(kmp_info_t *th) {
261   int f;
262   char *stack_beg = NULL;
263   char *stack_end = NULL;
264   int gtid;
265 
266   KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
267   if (__kmp_storage_map) {
268     stack_end = (char *)th->th.th_info.ds.ds_stackbase;
269     stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
270 
271     gtid = __kmp_gtid_from_thread(th);
272 
273     if (gtid == KMP_GTID_MONITOR) {
274       __kmp_print_storage_map_gtid(
275           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
276           "th_%s stack (%s)", "mon",
277           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
278     } else {
279       __kmp_print_storage_map_gtid(
280           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
281           "th_%d stack (%s)", gtid,
282           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
283     }
284   }
285 
286   /* No point in checking ubermaster threads since they use refinement and
287    * cannot overlap */
288   gtid = __kmp_gtid_from_thread(th);
289   if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
290     KA_TRACE(10,
291              ("__kmp_check_stack_overlap: performing extensive checking\n"));
292     if (stack_beg == NULL) {
293       stack_end = (char *)th->th.th_info.ds.ds_stackbase;
294       stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
295     }
296 
297     for (f = 0; f < __kmp_threads_capacity; f++) {
298       kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
299 
300       if (f_th && f_th != th) {
301         char *other_stack_end =
302             (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
303         char *other_stack_beg =
304             other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
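        // The stacks overlap if either end of this thread's stack falls
        // strictly between the other thread's stack bounds.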
305         if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
306             (stack_end > other_stack_beg && stack_end < other_stack_end)) {
307 
308           /* Print the other stack values before the abort */
309           if (__kmp_storage_map)
310             __kmp_print_storage_map_gtid(
311                 -1, other_stack_beg, other_stack_end,
312                 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
313                 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
314 
315           __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
316                       __kmp_msg_null);
317         }
318       }
319     }
320   }
321   KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
322 }
323 
324 /* ------------------------------------------------------------------------ */
325 
326 void __kmp_infinite_loop(void) {
327   static int done = FALSE;
328 
329   while (!done) {
330     KMP_YIELD(1);
331   }
332 }
333 
334 #define MAX_MESSAGE 512
335 
336 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
337                                   char const *format, ...) {
338   char buffer[MAX_MESSAGE];
339   va_list ap;
340 
341   va_start(ap, format);
342   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
343                p2, (unsigned long)size, format);
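  // The caller-supplied format string is spliced (via %s) into the prefix
  // above; the resulting buffer is then used as the format for __kmp_vprintf,
  // so the caller's varargs are substituted in that second pass.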
344   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
345   __kmp_vprintf(kmp_err, buffer, ap);
346 #if KMP_PRINT_DATA_PLACEMENT
347   int node;
348   if (gtid >= 0) {
349     if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
350       if (__kmp_storage_map_verbose) {
351         node = __kmp_get_host_node(p1);
352         if (node < 0) /* doesn't work, so don't try this next time */
353           __kmp_storage_map_verbose = FALSE;
354         else {
355           char *last;
356           int lastNode;
357           int localProc = __kmp_get_cpu_from_gtid(gtid);
358 
359           const int page_size = KMP_GET_PAGE_SIZE();
360 
361           p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
362           p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
363           if (localProc >= 0)
364             __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
365                                  localProc >> 1);
366           else
367             __kmp_printf_no_lock("  GTID %d\n", gtid);
368 #if KMP_USE_PRCTL
369           /* The more elaborate format is disabled for now because of the prctl
370            * hanging bug. */
371           do {
372             last = p1;
373             lastNode = node;
374             /* This loop collates adjacent pages with the same host node. */
375             do {
376               (char *)p1 += page_size;
377             } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
378             __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
379                                  lastNode);
380           } while (p1 <= p2);
381 #else
382           __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
383                                (char *)p1 + (page_size - 1),
384                                __kmp_get_host_node(p1));
385           if (p1 < p2) {
386             __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
387                                  (char *)p2 + (page_size - 1),
388                                  __kmp_get_host_node(p2));
389           }
390 #endif
391         }
392       }
393     } else
394       __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
395   }
396 #endif /* KMP_PRINT_DATA_PLACEMENT */
397   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
398 }
399 
400 void __kmp_warn(char const *format, ...) {
401   char buffer[MAX_MESSAGE];
402   va_list ap;
403 
404   if (__kmp_generate_warnings == kmp_warnings_off) {
405     return;
406   }
407 
408   va_start(ap, format);
409 
410   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
411   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
412   __kmp_vprintf(kmp_err, buffer, ap);
413   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
414 
415   va_end(ap);
416 }
417 
418 void __kmp_abort_process() {
419   // Later threads may stall here, but that's ok because abort() will kill them.
420   __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
421 
422   if (__kmp_debug_buf) {
423     __kmp_dump_debug_buffer();
424   }
425 
426   if (KMP_OS_WINDOWS) {
427     // Let other threads know of abnormal termination and prevent deadlock
428     // if abort happened during library initialization or shutdown
429     __kmp_global.g.g_abort = SIGABRT;
430 
    /* On Windows* OS, by default abort() causes a pop-up error box, which
       stalls nightly testing. Unfortunately, we cannot reliably suppress
       pop-up error boxes. _set_abort_behavior() works well, but this function
       is not available in VS7 (this is not a problem for the DLL, but it is a
       problem for the static OpenMP RTL). SetErrorMode (and so, the timelimit
       utility) does not help, at least in some versions of the MS C RTL.

       It seems the following sequence is the only way to simulate abort() and
       avoid the pop-up error box. */
440     raise(SIGABRT);
441     _exit(3); // Just in case, if signal ignored, exit anyway.
442   } else {
443     abort();
444   }
445 
446   __kmp_infinite_loop();
447   __kmp_release_bootstrap_lock(&__kmp_exit_lock);
448 
449 } // __kmp_abort_process
450 
451 void __kmp_abort_thread(void) {
452   // TODO: Eliminate g_abort global variable and this function.
453   // In case of abort just call abort(), it will kill all the threads.
454   __kmp_infinite_loop();
455 } // __kmp_abort_thread
456 
457 /* Print out the storage map for the major kmp_info_t thread data structures
458    that are allocated together. */
459 
460 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
461   __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
462                                gtid);
463 
464   __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
465                                sizeof(kmp_desc_t), "th_%d.th_info", gtid);
466 
467   __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
468                                sizeof(kmp_local_t), "th_%d.th_local", gtid);
469 
470   __kmp_print_storage_map_gtid(
471       gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
472       sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
473 
474   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
475                                &thr->th.th_bar[bs_plain_barrier + 1],
476                                sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
477                                gtid);
478 
479   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
480                                &thr->th.th_bar[bs_forkjoin_barrier + 1],
481                                sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
482                                gtid);
483 
484 #if KMP_FAST_REDUCTION_BARRIER
485   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
486                                &thr->th.th_bar[bs_reduction_barrier + 1],
487                                sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
488                                gtid);
489 #endif // KMP_FAST_REDUCTION_BARRIER
490 }
491 
492 /* Print out the storage map for the major kmp_team_t team data structures
493    that are allocated together. */
494 
495 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
496                                          int team_id, int num_thr) {
497   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
498   __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
499                                header, team_id);
500 
501   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
502                                &team->t.t_bar[bs_last_barrier],
503                                sizeof(kmp_balign_team_t) * bs_last_barrier,
504                                "%s_%d.t_bar", header, team_id);
505 
506   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
507                                &team->t.t_bar[bs_plain_barrier + 1],
508                                sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
509                                header, team_id);
510 
511   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
512                                &team->t.t_bar[bs_forkjoin_barrier + 1],
513                                sizeof(kmp_balign_team_t),
514                                "%s_%d.t_bar[forkjoin]", header, team_id);
515 
516 #if KMP_FAST_REDUCTION_BARRIER
517   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
518                                &team->t.t_bar[bs_reduction_barrier + 1],
519                                sizeof(kmp_balign_team_t),
520                                "%s_%d.t_bar[reduction]", header, team_id);
521 #endif // KMP_FAST_REDUCTION_BARRIER
522 
523   __kmp_print_storage_map_gtid(
524       -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
525       sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
526 
527   __kmp_print_storage_map_gtid(
528       -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
529       sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
530 
531   __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
532                                &team->t.t_disp_buffer[num_disp_buff],
533                                sizeof(dispatch_shared_info_t) * num_disp_buff,
534                                "%s_%d.t_disp_buffer", header, team_id);
535 
536   __kmp_print_storage_map_gtid(-1, &team->t.t_taskq, &team->t.t_copypriv_data,
537                                sizeof(kmp_taskq_t), "%s_%d.t_taskq", header,
538                                team_id);
539 }
540 
541 static void __kmp_init_allocator() {
542 #if OMP_50_ENABLED
543   __kmp_init_memkind();
544 #endif
545 }
546 static void __kmp_fini_allocator() {
547 #if OMP_50_ENABLED
548   __kmp_fini_memkind();
549 #endif
550 }
551 
552 /* ------------------------------------------------------------------------ */
553 
554 #if KMP_DYNAMIC_LIB
555 #if KMP_OS_WINDOWS
556 
557 static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
558   // TODO: Change to __kmp_break_bootstrap_lock().
559   __kmp_init_bootstrap_lock(lck); // make the lock released
560 }
561 
562 static void __kmp_reset_locks_on_process_detach(int gtid_req) {
563   int i;
564   int thread_count;
565 
  // PROCESS_DETACH is expected to be called by a thread that executes
  // ProcessExit() or FreeLibrary(). The OS terminates the other threads
  // (except the one calling ProcessExit or FreeLibrary), so it might seem safe
  // to access __kmp_threads[] without taking the forkjoin_lock. In fact,
  // however, some threads can still be alive here, although they are about to
  // be terminated; the entries in the array with ds_thread==0 are the most
  // suspicious. So it may not actually be safe to access __kmp_threads[].
573 
574   // TODO: does it make sense to check __kmp_roots[] ?
575 
  // Check that no other live threads are registered with the OpenMP library.
578   while (1) {
579     thread_count = 0;
580     for (i = 0; i < __kmp_threads_capacity; ++i) {
581       if (!__kmp_threads)
582         continue;
583       kmp_info_t *th = __kmp_threads[i];
584       if (th == NULL)
585         continue;
586       int gtid = th->th.th_info.ds.ds_gtid;
587       if (gtid == gtid_req)
588         continue;
589       if (gtid < 0)
590         continue;
591       DWORD exit_val;
592       int alive = __kmp_is_thread_alive(th, &exit_val);
593       if (alive) {
594         ++thread_count;
595       }
596     }
597     if (thread_count == 0)
598       break; // success
599   }
600 
601   // Assume that I'm alone. Now it might be safe to check and reset locks.
602   // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
603   __kmp_reset_lock(&__kmp_forkjoin_lock);
604 #ifdef KMP_DEBUG
605   __kmp_reset_lock(&__kmp_stdio_lock);
606 #endif // KMP_DEBUG
607 }
608 
609 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
610   //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
611 
612   switch (fdwReason) {
613 
614   case DLL_PROCESS_ATTACH:
615     KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
616 
617     return TRUE;
618 
619   case DLL_PROCESS_DETACH:
620     KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
621 
622     if (lpReserved != NULL) {
623       // lpReserved is used for telling the difference:
624       //   lpReserved == NULL when FreeLibrary() was called,
625       //   lpReserved != NULL when the process terminates.
      // When FreeLibrary() is called, worker threads remain alive. So they
      // will release the forkjoin lock by themselves. When the process
      // terminates, worker threads disappear, triggering the problem of an
      // unreleased forkjoin lock as described below.
630 
631       // A worker thread can take the forkjoin lock. The problem comes up if
632       // that worker thread becomes dead before it releases the forkjoin lock.
633       // The forkjoin lock remains taken, while the thread executing
634       // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try
635       // to take the forkjoin lock and will always fail, so that the application
      // will never finish [normally]. This scenario is possible if
      // __kmpc_end() has not been executed. This is not just a corner case;
      // it happens in common situations:
      // - the main function was compiled by an alternative compiler;
      // - the main function was compiled by icl but without /Qopenmp
      //   (application with plugins);
      // - the application terminates by calling C exit(), Fortran CALL EXIT()
      //   or Fortran STOP;
      // - an alive foreign thread prevented __kmpc_end from doing cleanup.
645       //
646       // This is a hack to work around the problem.
647       // TODO: !!! figure out something better.
648       __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
649     }
650 
651     __kmp_internal_end_library(__kmp_gtid_get_specific());
652 
653     return TRUE;
654 
655   case DLL_THREAD_ATTACH:
656     KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
657 
    /* if we want to register new siblings all the time, call
     * __kmp_get_gtid() here */
660     return TRUE;
661 
662   case DLL_THREAD_DETACH:
663     KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
664 
665     __kmp_internal_end_thread(__kmp_gtid_get_specific());
666     return TRUE;
667   }
668 
669   return TRUE;
670 }
671 
672 #endif /* KMP_OS_WINDOWS */
673 #endif /* KMP_DYNAMIC_LIB */
674 
675 /* Change the library type to "status" and return the old type */
676 /* called from within initialization routines where __kmp_initz_lock is held */
677 int __kmp_change_library(int status) {
678   int old_status;
679 
680   old_status = __kmp_yield_init &
681                1; // check whether KMP_LIBRARY=throughput (even init count)
682 
683   if (status) {
684     __kmp_yield_init |= 1; // throughput => turnaround (odd init count)
685   } else {
686     __kmp_yield_init &= ~1; // turnaround => throughput (even init count)
687   }
688 
689   return old_status; // return previous setting of whether
690   // KMP_LIBRARY=throughput
691 }
692 
693 /* __kmp_parallel_deo -- Wait until it's our turn. */
694 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
695   int gtid = *gtid_ref;
696 #ifdef BUILD_PARALLEL_ORDERED
697   kmp_team_t *team = __kmp_team_from_gtid(gtid);
698 #endif /* BUILD_PARALLEL_ORDERED */
699 
700   if (__kmp_env_consistency_check) {
701     if (__kmp_threads[gtid]->th.th_root->r.r_active)
702 #if KMP_USE_DYNAMIC_LOCK
703       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
704 #else
705       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
706 #endif
707   }
708 #ifdef BUILD_PARALLEL_ORDERED
709   if (!team->t.t_serialized) {
710     KMP_MB();
711     KMP_WAIT_YIELD(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid),
712                    KMP_EQ, NULL);
713     KMP_MB();
714   }
715 #endif /* BUILD_PARALLEL_ORDERED */
716 }
717 
718 /* __kmp_parallel_dxo -- Signal the next task. */
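/* Together, __kmp_parallel_deo/__kmp_parallel_dxo implement the ordered
   construct with a simple turn counter: in __kmp_parallel_deo each thread
   spins until team->t.t_ordered.dt.t_value equals its own tid, and here the
   current owner passes the turn to the next tid, wrapping modulo t_nproc. */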
719 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
720   int gtid = *gtid_ref;
721 #ifdef BUILD_PARALLEL_ORDERED
722   int tid = __kmp_tid_from_gtid(gtid);
723   kmp_team_t *team = __kmp_team_from_gtid(gtid);
724 #endif /* BUILD_PARALLEL_ORDERED */
725 
726   if (__kmp_env_consistency_check) {
727     if (__kmp_threads[gtid]->th.th_root->r.r_active)
728       __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
729   }
730 #ifdef BUILD_PARALLEL_ORDERED
731   if (!team->t.t_serialized) {
732     KMP_MB(); /* Flush all pending memory write invalidates.  */
733 
734     /* use the tid of the next thread in this team */
735     /* TODO replace with general release procedure */
736     team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
737 
738     KMP_MB(); /* Flush all pending memory write invalidates.  */
739   }
740 #endif /* BUILD_PARALLEL_ORDERED */
741 }
742 
743 /* ------------------------------------------------------------------------ */
744 /* The BARRIER for a SINGLE process section is always explicit   */
745 
746 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
747   int status;
748   kmp_info_t *th;
749   kmp_team_t *team;
750 
751   if (!TCR_4(__kmp_init_parallel))
752     __kmp_parallel_initialize();
753 
754 #if OMP_50_ENABLED
755   __kmp_resume_if_soft_paused();
756 #endif
757 
758   th = __kmp_threads[gtid];
759   team = th->th.th_team;
760   status = 0;
761 
762   th->th.th_ident = id_ref;
763 
764   if (team->t.t_serialized) {
765     status = 1;
766   } else {
767     kmp_int32 old_this = th->th.th_local.this_construct;
768 
769     ++th->th.th_local.this_construct;
770     /* try to set team count to thread count--success means thread got the
771        single block */
772     /* TODO: Should this be acquire or release? */
773     if (team->t.t_construct == old_this) {
774       status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
775                                               th->th.th_local.this_construct);
776     }
777 #if USE_ITT_BUILD
778     if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
779         KMP_MASTER_GTID(gtid) &&
780 #if OMP_40_ENABLED
781         th->th.th_teams_microtask == NULL &&
782 #endif
783         team->t.t_active_level ==
784             1) { // Only report metadata by master of active team at level 1
785       __kmp_itt_metadata_single(id_ref);
786     }
787 #endif /* USE_ITT_BUILD */
788   }
789 
790   if (__kmp_env_consistency_check) {
791     if (status && push_ws) {
792       __kmp_push_workshare(gtid, ct_psingle, id_ref);
793     } else {
794       __kmp_check_workshare(gtid, ct_psingle, id_ref);
795     }
796   }
797 #if USE_ITT_BUILD
798   if (status) {
799     __kmp_itt_single_start(gtid);
800   }
801 #endif /* USE_ITT_BUILD */
802   return status;
803 }
804 
805 void __kmp_exit_single(int gtid) {
806 #if USE_ITT_BUILD
807   __kmp_itt_single_end(gtid);
808 #endif /* USE_ITT_BUILD */
809   if (__kmp_env_consistency_check)
810     __kmp_pop_workshare(gtid, ct_psingle, NULL);
811 }
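/* __kmp_enter_single/__kmp_exit_single implement the claiming protocol behind
   the OpenMP single construct: the first thread to advance team->t.t_construct
   via the compare-and-store above executes the single block, and the others
   skip it. As a simplified illustration (not taken from this file), user code
   such as

       #pragma omp single
       { body(); }

   is typically lowered into calls to the __kmpc_single/__kmpc_end_single
   entry points, which wrap these two routines. */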
812 
/* Determine whether we can go parallel or must use a serialized parallel
 * region, and how many threads we can use.
 * set_nthreads is the number of threads requested for the team.
 * Returns 1 if we should serialize or only use one thread,
 * otherwise the number of threads to use.
 * The forkjoin lock is held by the caller. */
819 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
820                                  int master_tid, int set_nthreads
821 #if OMP_40_ENABLED
822                                  ,
823                                  int enter_teams
824 #endif /* OMP_40_ENABLED */
825                                  ) {
826   int capacity;
827   int new_nthreads;
828   KMP_DEBUG_ASSERT(__kmp_init_serial);
829   KMP_DEBUG_ASSERT(root && parent_team);
830 
831   // If dyn-var is set, dynamically adjust the number of desired threads,
832   // according to the method specified by dynamic_mode.
833   new_nthreads = set_nthreads;
834   if (!get__dynamic_2(parent_team, master_tid)) {
835     ;
836   }
837 #ifdef USE_LOAD_BALANCE
838   else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
839     new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
840     if (new_nthreads == 1) {
841       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
842                     "reservation to 1 thread\n",
843                     master_tid));
844       return 1;
845     }
846     if (new_nthreads < set_nthreads) {
847       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
848                     "reservation to %d threads\n",
849                     master_tid, new_nthreads));
850     }
851   }
852 #endif /* USE_LOAD_BALANCE */
853   else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
854     new_nthreads = __kmp_avail_proc - __kmp_nth +
855                    (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
856     if (new_nthreads <= 1) {
857       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
858                     "reservation to 1 thread\n",
859                     master_tid));
860       return 1;
861     }
862     if (new_nthreads < set_nthreads) {
863       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
864                     "reservation to %d threads\n",
865                     master_tid, new_nthreads));
866     } else {
867       new_nthreads = set_nthreads;
868     }
869   } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
870     if (set_nthreads > 2) {
871       new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
872       new_nthreads = (new_nthreads % set_nthreads) + 1;
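      // The result lies in the range [1, set_nthreads].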
873       if (new_nthreads == 1) {
874         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
875                       "reservation to 1 thread\n",
876                       master_tid));
877         return 1;
878       }
879       if (new_nthreads < set_nthreads) {
880         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
881                       "reservation to %d threads\n",
882                       master_tid, new_nthreads));
883       }
884     }
885   } else {
886     KMP_ASSERT(0);
887   }
888 
889   // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
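  // The threads that will be reused rather than newly reserved (just the
  // master when the root is already active, otherwise the whole hot team) are
  // already counted in the current thread totals, so that count is subtracted
  // before comparing against each limit below.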
890   if (__kmp_nth + new_nthreads -
891           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
892       __kmp_max_nth) {
893     int tl_nthreads = __kmp_max_nth - __kmp_nth +
894                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
895     if (tl_nthreads <= 0) {
896       tl_nthreads = 1;
897     }
898 
899     // If dyn-var is false, emit a 1-time warning.
900     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
901       __kmp_reserve_warn = 1;
902       __kmp_msg(kmp_ms_warning,
903                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
904                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
905     }
906     if (tl_nthreads == 1) {
907       KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
908                     "reduced reservation to 1 thread\n",
909                     master_tid));
910       return 1;
911     }
912     KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
913                   "reservation to %d threads\n",
914                   master_tid, tl_nthreads));
915     new_nthreads = tl_nthreads;
916   }
917 
918   // Respect OMP_THREAD_LIMIT
919   if (root->r.r_cg_nthreads + new_nthreads -
920           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
921       __kmp_cg_max_nth) {
922     int tl_nthreads = __kmp_cg_max_nth - root->r.r_cg_nthreads +
923                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
924     if (tl_nthreads <= 0) {
925       tl_nthreads = 1;
926     }
927 
928     // If dyn-var is false, emit a 1-time warning.
929     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
930       __kmp_reserve_warn = 1;
931       __kmp_msg(kmp_ms_warning,
932                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
933                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
934     }
935     if (tl_nthreads == 1) {
936       KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
937                     "reduced reservation to 1 thread\n",
938                     master_tid));
939       return 1;
940     }
941     KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
942                   "reservation to %d threads\n",
943                   master_tid, tl_nthreads));
944     new_nthreads = tl_nthreads;
945   }
946 
947   // Check if the threads array is large enough, or needs expanding.
948   // See comment in __kmp_register_root() about the adjustment if
949   // __kmp_threads[0] == NULL.
950   capacity = __kmp_threads_capacity;
951   if (TCR_PTR(__kmp_threads[0]) == NULL) {
952     --capacity;
953   }
954   if (__kmp_nth + new_nthreads -
955           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
956       capacity) {
957     // Expand the threads array.
958     int slotsRequired = __kmp_nth + new_nthreads -
959                         (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
960                         capacity;
961     int slotsAdded = __kmp_expand_threads(slotsRequired);
962     if (slotsAdded < slotsRequired) {
963       // The threads array was not expanded enough.
964       new_nthreads -= (slotsRequired - slotsAdded);
965       KMP_ASSERT(new_nthreads >= 1);
966 
967       // If dyn-var is false, emit a 1-time warning.
968       if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
969         __kmp_reserve_warn = 1;
970         if (__kmp_tp_cached) {
971           __kmp_msg(kmp_ms_warning,
972                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
973                     KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
974                     KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
975         } else {
976           __kmp_msg(kmp_ms_warning,
977                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
978                     KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
979         }
980       }
981     }
982   }
983 
984 #ifdef KMP_DEBUG
985   if (new_nthreads == 1) {
986     KC_TRACE(10,
987              ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
988               "dead roots and rechecking; requested %d threads\n",
989               __kmp_get_gtid(), set_nthreads));
990   } else {
991     KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
992                   " %d threads\n",
993                   __kmp_get_gtid(), new_nthreads, set_nthreads));
994   }
995 #endif // KMP_DEBUG
996   return new_nthreads;
997 }
998 
/* Allocate threads from the thread pool and assign them to the new team. We
   are assured that there are enough threads available, because we checked on
   that earlier within the forkjoin critical section. */
1002 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
1003                                     kmp_info_t *master_th, int master_gtid) {
1004   int i;
1005   int use_hot_team;
1006 
1007   KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
1008   KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
1009   KMP_MB();
1010 
1011   /* first, let's setup the master thread */
1012   master_th->th.th_info.ds.ds_tid = 0;
1013   master_th->th.th_team = team;
1014   master_th->th.th_team_nproc = team->t.t_nproc;
1015   master_th->th.th_team_master = master_th;
1016   master_th->th.th_team_serialized = FALSE;
1017   master_th->th.th_dispatch = &team->t.t_dispatch[0];
1018 
1019 /* make sure we are not the optimized hot team */
1020 #if KMP_NESTED_HOT_TEAMS
1021   use_hot_team = 0;
1022   kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
1023   if (hot_teams) { // hot teams array is not allocated if
1024     // KMP_HOT_TEAMS_MAX_LEVEL=0
1025     int level = team->t.t_active_level - 1; // index in array of hot teams
1026     if (master_th->th.th_teams_microtask) { // are we inside the teams?
1027       if (master_th->th.th_teams_size.nteams > 1) {
1028         ++level; // level was not increased in teams construct for
1029         // team_of_masters
1030       }
1031       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1032           master_th->th.th_teams_level == team->t.t_level) {
1033         ++level; // level was not increased in teams construct for
1034         // team_of_workers before the parallel
1035       } // team->t.t_level will be increased inside parallel
1036     }
1037     if (level < __kmp_hot_teams_max_level) {
1038       if (hot_teams[level].hot_team) {
1039         // hot team has already been allocated for given level
1040         KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
1041         use_hot_team = 1; // the team is ready to use
1042       } else {
1043         use_hot_team = 0; // AC: threads are not allocated yet
1044         hot_teams[level].hot_team = team; // remember new hot team
1045         hot_teams[level].hot_team_nth = team->t.t_nproc;
1046       }
1047     } else {
1048       use_hot_team = 0;
1049     }
1050   }
1051 #else
1052   use_hot_team = team == root->r.r_hot_team;
1053 #endif
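  // If we are reusing the hot team, its workers are already allocated and
  // wired into t_threads, so the installation loop below is only needed for a
  // freshly allocated team.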
1054   if (!use_hot_team) {
1055 
1056     /* install the master thread */
1057     team->t.t_threads[0] = master_th;
1058     __kmp_initialize_info(master_th, team, 0, master_gtid);
1059 
1060     /* now, install the worker threads */
1061     for (i = 1; i < team->t.t_nproc; i++) {
1062 
1063       /* fork or reallocate a new thread and install it in team */
1064       kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1065       team->t.t_threads[i] = thr;
1066       KMP_DEBUG_ASSERT(thr);
1067       KMP_DEBUG_ASSERT(thr->th.th_team == team);
1068       /* align team and thread arrived states */
1069       KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1070                     "T#%d(%d:%d) join =%llu, plain=%llu\n",
1071                     __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1072                     __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1073                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1074                     team->t.t_bar[bs_plain_barrier].b_arrived));
1075 #if OMP_40_ENABLED
1076       thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1077       thr->th.th_teams_level = master_th->th.th_teams_level;
1078       thr->th.th_teams_size = master_th->th.th_teams_size;
1079 #endif
1080       { // Initialize threads' barrier data.
1081         int b;
1082         kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1083         for (b = 0; b < bs_last_barrier; ++b) {
1084           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1085           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1086 #if USE_DEBUGGER
1087           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1088 #endif
1089         }
1090       }
1091     }
1092 
1093 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1094     __kmp_partition_places(team);
1095 #endif
1096   }
1097 
1098 #if OMP_50_ENABLED
1099   if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1100     for (i = 0; i < team->t.t_nproc; i++) {
1101       kmp_info_t *thr = team->t.t_threads[i];
1102       if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1103           thr->th.th_prev_level != team->t.t_level) {
1104         team->t.t_display_affinity = 1;
1105         break;
1106       }
1107     }
1108   }
1109 #endif
1110 
1111   KMP_MB();
1112 }
1113 
1114 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
// Propagate any changes to the floating point control registers out to the
// team. We try to avoid unnecessary writes to the relevant cache line in the
// team structure, so we don't make changes unless they are needed.
1118 inline static void propagateFPControl(kmp_team_t *team) {
1119   if (__kmp_inherit_fp_control) {
1120     kmp_int16 x87_fpu_control_word;
1121     kmp_uint32 mxcsr;
1122 
1123     // Get master values of FPU control flags (both X87 and vector)
1124     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1125     __kmp_store_mxcsr(&mxcsr);
1126     mxcsr &= KMP_X86_MXCSR_MASK;
1127 
1128     // There is no point looking at t_fp_control_saved here.
1129     // If it is TRUE, we still have to update the values if they are different
1130     // from those we now have. If it is FALSE we didn't save anything yet, but
1131     // our objective is the same. We have to ensure that the values in the team
1132     // are the same as those we have.
1133     // So, this code achieves what we need whether or not t_fp_control_saved is
1134     // true. By checking whether the value needs updating we avoid unnecessary
1135     // writes that would put the cache-line into a written state, causing all
1136     // threads in the team to have to read it again.
1137     KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1138     KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1139     // Although we don't use this value, other code in the runtime wants to know
1140     // whether it should restore them. So we must ensure it is correct.
1141     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1142   } else {
1143     // Similarly here. Don't write to this cache-line in the team structure
1144     // unless we have to.
1145     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1146   }
1147 }
1148 
1149 // Do the opposite, setting the hardware registers to the updated values from
1150 // the team.
1151 inline static void updateHWFPControl(kmp_team_t *team) {
1152   if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
    // Only reset the fp control regs if they have been changed in the team,
    // i.e., by the parallel region that we are exiting.
1155     kmp_int16 x87_fpu_control_word;
1156     kmp_uint32 mxcsr;
1157     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1158     __kmp_store_mxcsr(&mxcsr);
1159     mxcsr &= KMP_X86_MXCSR_MASK;
1160 
1161     if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1162       __kmp_clear_x87_fpu_status_word();
1163       __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1164     }
1165 
1166     if (team->t.t_mxcsr != mxcsr) {
1167       __kmp_load_mxcsr(&team->t.t_mxcsr);
1168     }
1169   }
1170 }
1171 #else
1172 #define propagateFPControl(x) ((void)0)
1173 #define updateHWFPControl(x) ((void)0)
1174 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1175 
1176 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1177                                      int realloc); // forward declaration
1178 
/* Run a parallel region that has been serialized, so it runs only in a team
   of the single master thread. */
1181 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1182   kmp_info_t *this_thr;
1183   kmp_team_t *serial_team;
1184 
1185   KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1186 
1187   /* Skip all this code for autopar serialized loops since it results in
1188      unacceptable overhead */
1189   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1190     return;
1191 
1192   if (!TCR_4(__kmp_init_parallel))
1193     __kmp_parallel_initialize();
1194 
1195 #if OMP_50_ENABLED
1196   __kmp_resume_if_soft_paused();
1197 #endif
1198 
1199   this_thr = __kmp_threads[global_tid];
1200   serial_team = this_thr->th.th_serial_team;
1201 
1202   /* utilize the serialized team held by this thread */
1203   KMP_DEBUG_ASSERT(serial_team);
1204   KMP_MB();
1205 
1206   if (__kmp_tasking_mode != tskm_immediate_exec) {
1207     KMP_DEBUG_ASSERT(
1208         this_thr->th.th_task_team ==
1209         this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1210     KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1211                      NULL);
1212     KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1213                   "team %p, new task_team = NULL\n",
1214                   global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1215     this_thr->th.th_task_team = NULL;
1216   }
1217 
1218 #if OMP_40_ENABLED
1219   kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1220   if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1221     proc_bind = proc_bind_false;
1222   } else if (proc_bind == proc_bind_default) {
1223     // No proc_bind clause was specified, so use the current value
1224     // of proc-bind-var for this parallel region.
1225     proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1226   }
1227   // Reset for next parallel region
1228   this_thr->th.th_set_proc_bind = proc_bind_default;
1229 #endif /* OMP_40_ENABLED */
1230 
1231 #if OMPT_SUPPORT
1232   ompt_data_t ompt_parallel_data = ompt_data_none;
1233   ompt_data_t *implicit_task_data;
1234   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1235   if (ompt_enabled.enabled &&
1236       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1237 
1238     ompt_task_info_t *parent_task_info;
1239     parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1240 
1241     parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1242     if (ompt_enabled.ompt_callback_parallel_begin) {
1243       int team_size = 1;
1244 
1245       ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1246           &(parent_task_info->task_data), &(parent_task_info->frame),
1247           &ompt_parallel_data, team_size, ompt_parallel_invoker_program,
1248           codeptr);
1249     }
1250   }
1251 #endif // OMPT_SUPPORT
1252 
1253   if (this_thr->th.th_team != serial_team) {
1254     // Nested level will be an index in the nested nthreads array
1255     int level = this_thr->th.th_team->t.t_level;
1256 
1257     if (serial_team->t.t_serialized) {
      /* this serial team was already used
         TODO: increase performance by making these locks more specific */
1260       kmp_team_t *new_team;
1261 
1262       __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1263 
1264       new_team = __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1265 #if OMPT_SUPPORT
1266                                      ompt_parallel_data,
1267 #endif
1268 #if OMP_40_ENABLED
1269                                      proc_bind,
1270 #endif
1271                                      &this_thr->th.th_current_task->td_icvs,
1272                                      0 USE_NESTED_HOT_ARG(NULL));
1273       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1274       KMP_ASSERT(new_team);
1275 
1276       /* setup new serialized team and install it */
1277       new_team->t.t_threads[0] = this_thr;
1278       new_team->t.t_parent = this_thr->th.th_team;
1279       serial_team = new_team;
1280       this_thr->th.th_serial_team = serial_team;
1281 
1282       KF_TRACE(
1283           10,
1284           ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1285            global_tid, serial_team));
1286 
1287       /* TODO the above breaks the requirement that if we run out of resources,
1288          then we can still guarantee that serialized teams are ok, since we may
1289          need to allocate a new one */
1290     } else {
1291       KF_TRACE(
1292           10,
1293           ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1294            global_tid, serial_team));
1295     }
1296 
1297     /* we have to initialize this serial team */
1298     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1299     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1300     KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1301     serial_team->t.t_ident = loc;
1302     serial_team->t.t_serialized = 1;
1303     serial_team->t.t_nproc = 1;
1304     serial_team->t.t_parent = this_thr->th.th_team;
1305     serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1306     this_thr->th.th_team = serial_team;
1307     serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1308 
    KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1310                   this_thr->th.th_current_task));
1311     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1312     this_thr->th.th_current_task->td_flags.executing = 0;
1313 
1314     __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1315 
1316     /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1317        implicit task for each serialized task represented by
1318        team->t.t_serialized? */
1319     copy_icvs(&this_thr->th.th_current_task->td_icvs,
1320               &this_thr->th.th_current_task->td_parent->td_icvs);
1321 
1322     // Thread value exists in the nested nthreads array for the next nested
1323     // level
1324     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1325       this_thr->th.th_current_task->td_icvs.nproc =
1326           __kmp_nested_nth.nth[level + 1];
1327     }
1328 
1329 #if OMP_40_ENABLED
1330     if (__kmp_nested_proc_bind.used &&
1331         (level + 1 < __kmp_nested_proc_bind.used)) {
1332       this_thr->th.th_current_task->td_icvs.proc_bind =
1333           __kmp_nested_proc_bind.bind_types[level + 1];
1334     }
1335 #endif /* OMP_40_ENABLED */
1336 
1337 #if USE_DEBUGGER
1338     serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1339 #endif
1340     this_thr->th.th_info.ds.ds_tid = 0;
1341 
1342     /* set thread cache values */
1343     this_thr->th.th_team_nproc = 1;
1344     this_thr->th.th_team_master = this_thr;
1345     this_thr->th.th_team_serialized = 1;
1346 
1347     serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1348     serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1349 #if OMP_50_ENABLED
1350     serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1351 #endif
1352 
1353     propagateFPControl(serial_team);
1354 
1355     /* check if we need to allocate dispatch buffers stack */
1356     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1357     if (!serial_team->t.t_dispatch->th_disp_buffer) {
1358       serial_team->t.t_dispatch->th_disp_buffer =
1359           (dispatch_private_info_t *)__kmp_allocate(
1360               sizeof(dispatch_private_info_t));
1361     }
1362     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1363 
1364     KMP_MB();
1365 
1366   } else {
1367     /* this serialized team is already being used,
1368      * that's fine, just add another nested level */
1369     KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1370     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1371     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1372     ++serial_team->t.t_serialized;
1373     this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1374 
1375     // Nested level will be an index in the nested nthreads array
1376     int level = this_thr->th.th_team->t.t_level;
1377     // Thread value exists in the nested nthreads array for the next nested
1378     // level
1379     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1380       this_thr->th.th_current_task->td_icvs.nproc =
1381           __kmp_nested_nth.nth[level + 1];
1382     }
1383     serial_team->t.t_level++;
1384     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1385                   "of serial team %p to %d\n",
1386                   global_tid, serial_team, serial_team->t.t_level));
1387 
1388     /* allocate/push dispatch buffers stack */
1389     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1390     {
1391       dispatch_private_info_t *disp_buffer =
1392           (dispatch_private_info_t *)__kmp_allocate(
1393               sizeof(dispatch_private_info_t));
1394       disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1395       serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1396     }
1397     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1398 
1399     KMP_MB();
1400   }
1401 #if OMP_40_ENABLED
1402   KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1403 #endif
1404 
1405 #if OMP_50_ENABLED
1406   // Perform the display affinity functionality for
1407   // serialized parallel regions
1408   if (__kmp_display_affinity) {
1409     if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1410         this_thr->th.th_prev_num_threads != 1) {
1411       // NULL means use the affinity-format-var ICV
1412       __kmp_aux_display_affinity(global_tid, NULL);
1413       this_thr->th.th_prev_level = serial_team->t.t_level;
1414       this_thr->th.th_prev_num_threads = 1;
1415     }
1416   }
1417 #endif
1418 
1419   if (__kmp_env_consistency_check)
1420     __kmp_push_parallel(global_tid, NULL);
1421 #if OMPT_SUPPORT
1422   serial_team->t.ompt_team_info.master_return_address = codeptr;
1423   if (ompt_enabled.enabled &&
1424       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1425     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1426 
1427     ompt_lw_taskteam_t lw_taskteam;
1428     __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1429                             &ompt_parallel_data, codeptr);
1430 
1431     __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1432     // don't use lw_taskteam after linking; its content was swapped
1433 
1434     /* OMPT implicit task begin */
1435     implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
1436     if (ompt_enabled.ompt_callback_implicit_task) {
1437       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1438           ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1439           OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1440       OMPT_CUR_TASK_INFO(this_thr)
1441           ->thread_num = __kmp_tid_from_gtid(global_tid);
1442     }
1443 
1444     /* OMPT state */
1445     this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1446     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1447   }
1448 #endif
1449 }
1450 
1451 /* most of the work for a fork */
1452 /* return true if we really went parallel, false if serialized */
1453 int __kmp_fork_call(ident_t *loc, int gtid,
1454                     enum fork_context_e call_context, // Intel, GNU, ...
1455                     kmp_int32 argc, microtask_t microtask, launch_t invoker,
1456 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1457 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1458                     va_list *ap
1459 #else
1460                     va_list ap
1461 #endif
1462                     ) {
1463   void **argv;
1464   int i;
1465   int master_tid;
1466   int master_this_cons;
1467   kmp_team_t *team;
1468   kmp_team_t *parent_team;
1469   kmp_info_t *master_th;
1470   kmp_root_t *root;
1471   int nthreads;
1472   int master_active;
1473   int master_set_numthreads;
1474   int level;
1475 #if OMP_40_ENABLED
1476   int active_level;
1477   int teams_level;
1478 #endif
1479 #if KMP_NESTED_HOT_TEAMS
1480   kmp_hot_team_ptr_t **p_hot_teams;
1481 #endif
1482   { // KMP_TIME_BLOCK
1483     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1484     KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1485 
1486     KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1487     if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1488       /* Some systems prefer the stack for the root thread(s) to start with */
1489       /* some gap from the parent stack to prevent false sharing. */
1490       void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1491       /* These 2 lines below are so this does not get optimized out */
1492       if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1493         __kmp_stkpadding += (short)((kmp_int64)dummy);
1494     }
1495 
1496     /* initialize if needed */
1497     KMP_DEBUG_ASSERT(
1498         __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1499     if (!TCR_4(__kmp_init_parallel))
1500       __kmp_parallel_initialize();
1501 
1502 #if OMP_50_ENABLED
1503     __kmp_resume_if_soft_paused();
1504 #endif
1505 
1506     /* setup current data */
1507     master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1508     // shutdown
1509     parent_team = master_th->th.th_team;
1510     master_tid = master_th->th.th_info.ds.ds_tid;
1511     master_this_cons = master_th->th.th_local.this_construct;
1512     root = master_th->th.th_root;
1513     master_active = root->r.r_active;
1514     master_set_numthreads = master_th->th.th_set_nproc;
1515 
1516 #if OMPT_SUPPORT
1517     ompt_data_t ompt_parallel_data = ompt_data_none;
1518     ompt_data_t *parent_task_data;
1519     ompt_frame_t *ompt_frame;
1520     ompt_data_t *implicit_task_data;
1521     void *return_address = NULL;
1522 
1523     if (ompt_enabled.enabled) {
1524       __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1525                                     NULL, NULL);
1526       return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1527     }
1528 #endif
1529 
1530     // Nested level will be an index in the nested nthreads array
1531     level = parent_team->t.t_level;
1532     // used to launch non-serialized teams even if nesting is not allowed
1533     active_level = parent_team->t.t_active_level;
1534 #if OMP_40_ENABLED
1535     // needed to check for nesting inside the teams construct
1536     teams_level = master_th->th.th_teams_level;
1537 #endif
1538 #if KMP_NESTED_HOT_TEAMS
1539     p_hot_teams = &master_th->th.th_hot_teams;
1540     if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1541       *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1542           sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1543       (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1544       // it is either the actual hot team or not needed (when active_level > 0)
1545       (*p_hot_teams)[0].hot_team_nth = 1;
1546     }
1547 #endif
1548 
1549 #if OMPT_SUPPORT
1550     if (ompt_enabled.enabled) {
1551       if (ompt_enabled.ompt_callback_parallel_begin) {
1552         int team_size = master_set_numthreads
1553                             ? master_set_numthreads
1554                             : get__nproc_2(parent_team, master_tid);
1555         ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1556             parent_task_data, ompt_frame, &ompt_parallel_data, team_size,
1557             OMPT_INVOKER(call_context), return_address);
1558       }
1559       master_th->th.ompt_thread_info.state = ompt_state_overhead;
1560     }
1561 #endif
1562 
1563     master_th->th.th_ident = loc;
1564 
1565 #if OMP_40_ENABLED
1566     if (master_th->th.th_teams_microtask && ap &&
1567         microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1568       // AC: This is start of parallel that is nested inside teams construct.
1569       // The team is actual (hot), all workers are ready at the fork barrier.
1570       // No lock needed to initialize the team a bit, then free workers.
1571       parent_team->t.t_ident = loc;
1572       __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1573       parent_team->t.t_argc = argc;
1574       argv = (void **)parent_team->t.t_argv;
1575       for (i = argc - 1; i >= 0; --i)
1576 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1577 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1578         *argv++ = va_arg(*ap, void *);
1579 #else
1580         *argv++ = va_arg(ap, void *);
1581 #endif
1582       // Increment our nested depth level, but do not increase the serialization
1583       if (parent_team == master_th->th.th_serial_team) {
1584         // AC: we are in serialized parallel
1585         __kmpc_serialized_parallel(loc, gtid);
1586         KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1587         // AC: need this so that the enquiry functions work
1588         // correctly; will restore at join time
1589         parent_team->t.t_serialized--;
1590 #if OMPT_SUPPORT
1591         void *dummy;
1592         void **exit_runtime_p;
1593 
1594         ompt_lw_taskteam_t lw_taskteam;
1595 
1596         if (ompt_enabled.enabled) {
1597           __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1598                                   &ompt_parallel_data, return_address);
1599           exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1600 
1601           __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1602           // don't use lw_taskteam after linking; its content was swapped
1603 
1604           /* OMPT implicit task begin */
1605           implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1606           if (ompt_enabled.ompt_callback_implicit_task) {
1607             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1608                 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1609                 implicit_task_data, 1, __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1610             OMPT_CUR_TASK_INFO(master_th)
1611                 ->thread_num = __kmp_tid_from_gtid(gtid);
1612           }
1613 
1614           /* OMPT state */
1615           master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1616         } else {
1617           exit_runtime_p = &dummy;
1618         }
1619 #endif
1620 
1621         {
1622           KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1623           KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1624           __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1625 #if OMPT_SUPPORT
1626                                  ,
1627                                  exit_runtime_p
1628 #endif
1629                                  );
1630         }
1631 
1632 #if OMPT_SUPPORT
1633         *exit_runtime_p = NULL;
1634         if (ompt_enabled.enabled) {
1635           OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1636           if (ompt_enabled.ompt_callback_implicit_task) {
1637             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1638                 ompt_scope_end, NULL, implicit_task_data, 1,
1639                 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1640           }
1641           __ompt_lw_taskteam_unlink(master_th);
1642 
1643           if (ompt_enabled.ompt_callback_parallel_end) {
1644             ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1645                 OMPT_CUR_TEAM_DATA(master_th), OMPT_CUR_TASK_DATA(master_th),
1646                 OMPT_INVOKER(call_context), return_address);
1647           }
1648           master_th->th.ompt_thread_info.state = ompt_state_overhead;
1649         }
1650 #endif
1651         return TRUE;
1652       }
1653 
1654       parent_team->t.t_pkfn = microtask;
1655       parent_team->t.t_invoke = invoker;
1656       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1657       parent_team->t.t_active_level++;
1658       parent_team->t.t_level++;
1659 #if OMP_50_ENABLED
1660       parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1661 #endif
1662 
1663       /* Change number of threads in the team if requested */
1664       if (master_set_numthreads) { // The parallel has num_threads clause
1665         if (master_set_numthreads < master_th->th.th_teams_size.nth) {
1666           // AC: can only reduce the number of threads dynamically, can't increase
1667           kmp_info_t **other_threads = parent_team->t.t_threads;
1668           parent_team->t.t_nproc = master_set_numthreads;
1669           for (i = 0; i < master_set_numthreads; ++i) {
1670             other_threads[i]->th.th_team_nproc = master_set_numthreads;
1671           }
1672           // Keep extra threads hot in the team for possible next parallels
1673         }
1674         master_th->th.th_set_nproc = 0;
1675       }
1676 
1677 #if USE_DEBUGGER
1678       if (__kmp_debugging) { // Let debugger override number of threads.
1679         int nth = __kmp_omp_num_threads(loc);
1680         if (nth > 0) { // 0 means debugger doesn't want to change num threads
1681           master_set_numthreads = nth;
1682         }
1683       }
1684 #endif
1685 
1686       KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1687                     "master_th=%p, gtid=%d\n",
1688                     root, parent_team, master_th, gtid));
1689       __kmp_internal_fork(loc, gtid, parent_team);
1690       KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1691                     "master_th=%p, gtid=%d\n",
1692                     root, parent_team, master_th, gtid));
1693 
1694       /* Invoke microtask for MASTER thread */
1695       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1696                     parent_team->t.t_id, parent_team->t.t_pkfn));
1697 
1698       if (!parent_team->t.t_invoke(gtid)) {
1699         KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
1700       }
1701       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1702                     parent_team->t.t_id, parent_team->t.t_pkfn));
1703       KMP_MB(); /* Flush all pending memory write invalidates.  */
1704 
1705       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1706 
1707       return TRUE;
1708     } // Parallel closely nested in teams construct
1709 #endif /* OMP_40_ENABLED */
1710 
1711 #if KMP_DEBUG
1712     if (__kmp_tasking_mode != tskm_immediate_exec) {
1713       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1714                        parent_team->t.t_task_team[master_th->th.th_task_state]);
1715     }
1716 #endif
1717 
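         // Decide how many threads will execute the upcoming region: serialize
         // if the max-active-levels limit has been reached; otherwise honor a
         // num_threads clause when present, else the nproc ICV, and reserve the
         // threads under the forkjoin lock when more than one is requested.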
1718     if (parent_team->t.t_active_level >=
1719         master_th->th.th_current_task->td_icvs.max_active_levels) {
1720       nthreads = 1;
1721     } else {
1722 #if OMP_40_ENABLED
1723       int enter_teams = ((ap == NULL && active_level == 0) ||
1724                          (ap && teams_level > 0 && teams_level == level));
1725 #endif
1726       nthreads =
1727           master_set_numthreads
1728               ? master_set_numthreads
1729               : get__nproc_2(
1730                     parent_team,
1731                     master_tid); // TODO: get nproc directly from current task
1732 
1733       // Check if we need to take the forkjoin lock (no need for a serialized
1734       // parallel outside of a teams construct). This code was moved here from
1735       // __kmp_reserve_threads() to speed up nested serialized parallels.
1736       if (nthreads > 1) {
1737         if ((!get__nested(master_th) && (root->r.r_in_parallel
1738 #if OMP_40_ENABLED
1739                                          && !enter_teams
1740 #endif /* OMP_40_ENABLED */
1741                                          )) ||
1742             (__kmp_library == library_serial)) {
1743           KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1744                         " threads\n",
1745                         gtid, nthreads));
1746           nthreads = 1;
1747         }
1748       }
1749       if (nthreads > 1) {
1750         /* determine how many new threads we can use */
1751         __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1752         nthreads = __kmp_reserve_threads(
1753             root, parent_team, master_tid, nthreads
1754 #if OMP_40_ENABLED
1755             /* AC: If we execute teams from a parallel region (on the host),
1756                then teams should be created, but each can only have 1 thread
1757                if nesting is disabled. If teams is called from a serial region,
1758                then teams and their threads should be created regardless of
1759                the nesting setting. */
1760             ,
1761             enter_teams
1762 #endif /* OMP_40_ENABLED */
1763             );
1764         if (nthreads == 1) {
1765           // Free lock for single thread execution here; for multi-thread
1766           // execution it will be freed later after team of threads created
1767           // and initialized
1768           __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1769         }
1770       }
1771     }
1772     KMP_DEBUG_ASSERT(nthreads > 0);
1773 
1774     // If we temporarily changed the set number of threads then restore it now
1775     master_th->th.th_set_nproc = 0;
1776 
1777     /* create a serialized parallel region? */
1778     if (nthreads == 1) {
1779 /* josh todo: hypothetical question: what do we do for OS X*? */
1780 #if KMP_OS_LINUX &&                                                            \
1781     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1782       void *args[argc];
1783 #else
1784       void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1785 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1786           KMP_ARCH_AARCH64) */
1787 
1788       KA_TRACE(20,
1789                ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1790 
1791       __kmpc_serialized_parallel(loc, gtid);
1792 
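           // For the Intel entry point the runtime invokes the microtask itself
           // on the (now serialized) master thread; for the GNU entry point we
           // return FALSE below so the caller runs the region body directly.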
1793       if (call_context == fork_context_intel) {
1794         /* TODO: this is clumsy; use the compiler itself to pass the args! :) */
1795         master_th->th.th_serial_team->t.t_ident = loc;
1796 #if OMP_40_ENABLED
1797         if (!ap) {
1798           // revert change made in __kmpc_serialized_parallel()
1799           master_th->th.th_serial_team->t.t_level--;
1800 // Get args from parent team for teams construct
1801 
1802 #if OMPT_SUPPORT
1803           void *dummy;
1804           void **exit_runtime_p;
1805           ompt_task_info_t *task_info;
1806 
1807           ompt_lw_taskteam_t lw_taskteam;
1808 
1809           if (ompt_enabled.enabled) {
1810             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1811                                     &ompt_parallel_data, return_address);
1812 
1813             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1814             // don't use lw_taskteam after linking; its content was swapped
1815 
1816             task_info = OMPT_CUR_TASK_INFO(master_th);
1817             exit_runtime_p = &(task_info->frame.exit_frame.ptr);
1818             if (ompt_enabled.ompt_callback_implicit_task) {
1819               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1820                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1821                   &(task_info->task_data), 1, __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1822               OMPT_CUR_TASK_INFO(master_th)
1823                   ->thread_num = __kmp_tid_from_gtid(gtid);
1824             }
1825 
1826             /* OMPT state */
1827             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1828           } else {
1829             exit_runtime_p = &dummy;
1830           }
1831 #endif
1832 
1833           {
1834             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1835             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1836             __kmp_invoke_microtask(microtask, gtid, 0, argc,
1837                                    parent_team->t.t_argv
1838 #if OMPT_SUPPORT
1839                                    ,
1840                                    exit_runtime_p
1841 #endif
1842                                    );
1843           }
1844 
1845 #if OMPT_SUPPORT
1846           if (ompt_enabled.enabled) {
1847             *exit_runtime_p = NULL; // clear the exit frame, as in the other branches
1848             if (ompt_enabled.ompt_callback_implicit_task) {
1849               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1850                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1851                   OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1852             }
1853 
1854             __ompt_lw_taskteam_unlink(master_th);
1855             if (ompt_enabled.ompt_callback_parallel_end) {
1856               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1857                   OMPT_CUR_TEAM_DATA(master_th), parent_task_data,
1858                   OMPT_INVOKER(call_context), return_address);
1859             }
1860             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1861           }
1862 #endif
1863         } else if (microtask == (microtask_t)__kmp_teams_master) {
1864           KMP_DEBUG_ASSERT(master_th->th.th_team ==
1865                            master_th->th.th_serial_team);
1866           team = master_th->th.th_team;
1867           // team->t.t_pkfn = microtask;
1868           team->t.t_invoke = invoker;
1869           __kmp_alloc_argv_entries(argc, team, TRUE);
1870           team->t.t_argc = argc;
1871           argv = (void **)team->t.t_argv;
1872           if (ap) {
1873             for (i = argc - 1; i >= 0; --i)
1874 // TODO: revert workaround for Intel(R) 64 tracker #96
1875 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1876               *argv++ = va_arg(*ap, void *);
1877 #else
1878               *argv++ = va_arg(ap, void *);
1879 #endif
1880           } else {
1881             for (i = 0; i < argc; ++i)
1882               // Get args from parent team for teams construct
1883               argv[i] = parent_team->t.t_argv[i];
1884           }
1885           // AC: revert change made in __kmpc_serialized_parallel()
1886           //     because initial code in teams should have level=0
1887           team->t.t_level--;
1888           // AC: call special invoker for outer "parallel" of teams construct
1889           invoker(gtid);
1890         } else {
1891 #endif /* OMP_40_ENABLED */
1892           argv = args;
1893           for (i = argc - 1; i >= 0; --i)
1894 // TODO: revert workaround for Intel(R) 64 tracker #96
1895 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1896             *argv++ = va_arg(*ap, void *);
1897 #else
1898           *argv++ = va_arg(ap, void *);
1899 #endif
1900           KMP_MB();
1901 
1902 #if OMPT_SUPPORT
1903           void *dummy;
1904           void **exit_runtime_p;
1905           ompt_task_info_t *task_info;
1906 
1907           ompt_lw_taskteam_t lw_taskteam;
1908 
1909           if (ompt_enabled.enabled) {
1910             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1911                                     &ompt_parallel_data, return_address);
1912             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1913             // don't use lw_taskteam after linking; its content was swapped
1914             task_info = OMPT_CUR_TASK_INFO(master_th);
1915             exit_runtime_p = &(task_info->frame.exit_frame.ptr);
1916 
1917             /* OMPT implicit task begin */
1918             implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1919             if (ompt_enabled.ompt_callback_implicit_task) {
1920               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1921                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1922                   implicit_task_data, 1, __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1923               OMPT_CUR_TASK_INFO(master_th)
1924                   ->thread_num = __kmp_tid_from_gtid(gtid);
1925             }
1926 
1927             /* OMPT state */
1928             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1929           } else {
1930             exit_runtime_p = &dummy;
1931           }
1932 #endif
1933 
1934           {
1935             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1936             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1937             __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1938 #if OMPT_SUPPORT
1939                                    ,
1940                                    exit_runtime_p
1941 #endif
1942                                    );
1943           }
1944 
1945 #if OMPT_SUPPORT
1946           if (ompt_enabled.enabled) {
1947             *exit_runtime_p = NULL;
1948             if (ompt_enabled.ompt_callback_implicit_task) {
1949               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1950                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1951                   OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1952             }
1953 
1954             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1955             __ompt_lw_taskteam_unlink(master_th);
1956             if (ompt_enabled.ompt_callback_parallel_end) {
1957               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1958                   &ompt_parallel_data, parent_task_data,
1959                   OMPT_INVOKER(call_context), return_address);
1960             }
1961             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1962           }
1963 #endif
1964 #if OMP_40_ENABLED
1965         }
1966 #endif /* OMP_40_ENABLED */
1967       } else if (call_context == fork_context_gnu) {
1968 #if OMPT_SUPPORT
1969         ompt_lw_taskteam_t lwt;
1970         __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1971                                 return_address);
1972 
1973         lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1974         __ompt_lw_taskteam_link(&lwt, master_th, 1);
1975 // don't use lw_taskteam after linking; its content was swapped
1976 #endif
1977 
1978         // we were called from GNU native code
1979         KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1980         return FALSE;
1981       } else {
1982         KMP_ASSERT2(call_context < fork_context_last,
1983                     "__kmp_fork_call: unknown fork_context parameter");
1984       }
1985 
1986       KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1987       KMP_MB();
1988       return FALSE;
1989     } // if (nthreads == 1)
1990 
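         // From here on we really go parallel: __kmp_reserve_threads succeeded
         // and the forkjoin lock stays held until the new team is set up below.
     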
1991     // GEH: only modify the executing flag in the case when not serialized;
1992     //      the serialized case is handled in __kmpc_serialized_parallel
1993     KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1994                   "curtask=%p, curtask_max_aclevel=%d\n",
1995                   parent_team->t.t_active_level, master_th,
1996                   master_th->th.th_current_task,
1997                   master_th->th.th_current_task->td_icvs.max_active_levels));
1998     // TODO: GEH - cannot do this assertion because root thread not set up as
1999     // executing
2000     // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
2001     master_th->th.th_current_task->td_flags.executing = 0;
2002 
2003 #if OMP_40_ENABLED
2004     if (!master_th->th.th_teams_microtask || level > teams_level)
2005 #endif /* OMP_40_ENABLED */
2006     {
2007       /* Increment our nested depth level */
2008       KMP_ATOMIC_INC(&root->r.r_in_parallel);
2009     }
2010 
2011     // See if we need to make a copy of the ICVs.
2012     int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
2013     if ((level + 1 < __kmp_nested_nth.used) &&
2014         (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
2015       nthreads_icv = __kmp_nested_nth.nth[level + 1];
2016     } else {
2017       nthreads_icv = 0; // don't update
2018     }
2019 
2020 #if OMP_40_ENABLED
2021     // Figure out the proc_bind_policy for the new team.
2022     kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
2023     kmp_proc_bind_t proc_bind_icv =
2024         proc_bind_default; // proc_bind_default means don't update
2025     if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
2026       proc_bind = proc_bind_false;
2027     } else {
2028       if (proc_bind == proc_bind_default) {
2029         // No proc_bind clause specified; use current proc-bind-var for this
2030         // parallel region
2031         proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
2032       }
2033       /* else: The proc_bind policy was specified explicitly on parallel clause.
2034          This overrides proc-bind-var for this parallel region, but does not
2035          change proc-bind-var. */
2036       // Figure the value of proc-bind-var for the child threads.
2037       if ((level + 1 < __kmp_nested_proc_bind.used) &&
2038           (__kmp_nested_proc_bind.bind_types[level + 1] !=
2039            master_th->th.th_current_task->td_icvs.proc_bind)) {
2040         proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2041       }
2042     }
2043 
2044     // Reset for next parallel region
2045     master_th->th.th_set_proc_bind = proc_bind_default;
2046 #endif /* OMP_40_ENABLED */
2047 
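         // Allocate the team, handing it a private ICV copy only when the nproc
         // or proc-bind override differs from the master's current ICVs;
         // otherwise the master's ICV block is passed through unchanged.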
2048     if ((nthreads_icv > 0)
2049 #if OMP_40_ENABLED
2050         || (proc_bind_icv != proc_bind_default)
2051 #endif /* OMP_40_ENABLED */
2052             ) {
2053       kmp_internal_control_t new_icvs;
2054       copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2055       new_icvs.next = NULL;
2056       if (nthreads_icv > 0) {
2057         new_icvs.nproc = nthreads_icv;
2058       }
2059 
2060 #if OMP_40_ENABLED
2061       if (proc_bind_icv != proc_bind_default) {
2062         new_icvs.proc_bind = proc_bind_icv;
2063       }
2064 #endif /* OMP_40_ENABLED */
2065 
2066       /* allocate a new parallel team */
2067       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2068       team = __kmp_allocate_team(root, nthreads, nthreads,
2069 #if OMPT_SUPPORT
2070                                  ompt_parallel_data,
2071 #endif
2072 #if OMP_40_ENABLED
2073                                  proc_bind,
2074 #endif
2075                                  &new_icvs, argc USE_NESTED_HOT_ARG(master_th));
2076     } else {
2077       /* allocate a new parallel team */
2078       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2079       team = __kmp_allocate_team(root, nthreads, nthreads,
2080 #if OMPT_SUPPORT
2081                                  ompt_parallel_data,
2082 #endif
2083 #if OMP_40_ENABLED
2084                                  proc_bind,
2085 #endif
2086                                  &master_th->th.th_current_task->td_icvs,
2087                                  argc USE_NESTED_HOT_ARG(master_th));
2088     }
2089     KF_TRACE(
2090         10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2091 
2092     /* setup the new team */
2093     KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2094     KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2095     KMP_CHECK_UPDATE(team->t.t_ident, loc);
2096     KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2097     KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2098 #if OMPT_SUPPORT
2099     KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2100                           return_address);
2101 #endif
2102     KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2103 // TODO: parent_team->t.t_level == INT_MAX ???
2104 #if OMP_40_ENABLED
2105     if (!master_th->th.th_teams_microtask || level > teams_level) {
2106 #endif /* OMP_40_ENABLED */
2107       int new_level = parent_team->t.t_level + 1;
2108       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2109       new_level = parent_team->t.t_active_level + 1;
2110       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2111 #if OMP_40_ENABLED
2112     } else {
2113       // AC: Do not increase parallel level at start of the teams construct
2114       int new_level = parent_team->t.t_level;
2115       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2116       new_level = parent_team->t.t_active_level;
2117       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2118     }
2119 #endif /* OMP_40_ENABLED */
2120     kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2121     // set master's schedule as new run-time schedule
2122     KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2123 
2124 #if OMP_40_ENABLED
2125     KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2126 #endif
2127 #if OMP_50_ENABLED
2128     KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2129 #endif
2130 
2131     // Update the floating point rounding in the team if required.
2132     propagateFPControl(team);
2133 
2134     if (__kmp_tasking_mode != tskm_immediate_exec) {
2135       // Set master's task team to the team's task team. Unless this is a hot
2136       // team, it should be NULL.
2137       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2138                        parent_team->t.t_task_team[master_th->th.th_task_state]);
2139       KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
2140                     "%p, new task_team %p / team %p\n",
2141                     __kmp_gtid_from_thread(master_th),
2142                     master_th->th.th_task_team, parent_team,
2143                     team->t.t_task_team[master_th->th.th_task_state], team));
2144 
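           // When forking a nested region (or the master already has a task
           // team), push the master's current task_state onto its memo stack so
           // it can be restored at the matching join; grow the stack if full.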
2145       if (active_level || master_th->th.th_task_team) {
2146         // Save the master's task_state on the memo stack
2147         KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2148         if (master_th->th.th_task_state_top >=
2149             master_th->th.th_task_state_stack_sz) { // increase size
2150           kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2151           kmp_uint8 *old_stack, *new_stack;
2152           kmp_uint32 i;
2153           new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2154           for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2155             new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2156           }
2157           for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2158                ++i) { // zero-init rest of stack
2159             new_stack[i] = 0;
2160           }
2161           old_stack = master_th->th.th_task_state_memo_stack;
2162           master_th->th.th_task_state_memo_stack = new_stack;
2163           master_th->th.th_task_state_stack_sz = new_size;
2164           __kmp_free(old_stack);
2165         }
2166         // Store master's task_state on stack
2167         master_th->th
2168             .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2169             master_th->th.th_task_state;
2170         master_th->th.th_task_state_top++;
2171 #if KMP_NESTED_HOT_TEAMS
2172         if (master_th->th.th_hot_teams &&
2173             active_level < __kmp_hot_teams_max_level &&
2174             team == master_th->th.th_hot_teams[active_level].hot_team) {
2175           // Restore master's nested state if nested hot team
2176           master_th->th.th_task_state =
2177               master_th->th
2178                   .th_task_state_memo_stack[master_th->th.th_task_state_top];
2179         } else {
2180 #endif
2181           master_th->th.th_task_state = 0;
2182 #if KMP_NESTED_HOT_TEAMS
2183         }
2184 #endif
2185       }
2186 #if !KMP_NESTED_HOT_TEAMS
2187       KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2188                        (team == root->r.r_hot_team));
2189 #endif
2190     }
2191 
2192     KA_TRACE(
2193         20,
2194         ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2195          gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2196          team->t.t_nproc));
2197     KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2198                      (team->t.t_master_tid == 0 &&
2199                       (team->t.t_parent == root->r.r_root_team ||
2200                        team->t.t_parent->t.t_serialized)));
2201     KMP_MB();
2202 
2203     /* now, setup the arguments */
2204     argv = (void **)team->t.t_argv;
2205 #if OMP_40_ENABLED
2206     if (ap) {
2207 #endif /* OMP_40_ENABLED */
2208       for (i = argc - 1; i >= 0; --i) {
2209 // TODO: revert workaround for Intel(R) 64 tracker #96
2210 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
2211         void *new_argv = va_arg(*ap, void *);
2212 #else
2213       void *new_argv = va_arg(ap, void *);
2214 #endif
2215         KMP_CHECK_UPDATE(*argv, new_argv);
2216         argv++;
2217       }
2218 #if OMP_40_ENABLED
2219     } else {
2220       for (i = 0; i < argc; ++i) {
2221         // Get args from parent team for teams construct
2222         KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2223       }
2224     }
2225 #endif /* OMP_40_ENABLED */
2226 
2227     /* now actually fork the threads */
2228     KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2229     if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2230       root->r.r_active = TRUE;
2231 
2232     __kmp_fork_team_threads(root, team, master_th, gtid);
2233     __kmp_setup_icv_copy(team, nthreads,
2234                          &master_th->th.th_current_task->td_icvs, loc);
2235 
2236 #if OMPT_SUPPORT
2237     master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2238 #endif
2239 
2240     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2241 
2242 #if USE_ITT_BUILD
2243     if (team->t.t_active_level == 1 // only report frames at level 1
2244 #if OMP_40_ENABLED
2245         && !master_th->th.th_teams_microtask // not in teams construct
2246 #endif /* OMP_40_ENABLED */
2247         ) {
2248 #if USE_ITT_NOTIFY
2249       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2250           (__kmp_forkjoin_frames_mode == 3 ||
2251            __kmp_forkjoin_frames_mode == 1)) {
2252         kmp_uint64 tmp_time = 0;
2253         if (__itt_get_timestamp_ptr)
2254           tmp_time = __itt_get_timestamp();
2255         // Internal fork - report frame begin
2256         master_th->th.th_frame_time = tmp_time;
2257         if (__kmp_forkjoin_frames_mode == 3)
2258           team->t.t_region_time = tmp_time;
2259       } else
2260 // only one notification scheme (either "submit" or "forking/joined", not both)
2261 #endif /* USE_ITT_NOTIFY */
2262           if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2263               __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2264         // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2265         __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2266       }
2267     }
2268 #endif /* USE_ITT_BUILD */
2269 
2270     /* now go on and do the work */
2271     KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2272     KMP_MB();
2273     KF_TRACE(10,
2274              ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2275               root, team, master_th, gtid));
2276 
2277 #if USE_ITT_BUILD
2278     if (__itt_stack_caller_create_ptr) {
2279       team->t.t_stack_id =
2280           __kmp_itt_stack_caller_create(); // create new stack stitching id
2281       // before entering fork barrier
2282     }
2283 #endif /* USE_ITT_BUILD */
2284 
2285 #if OMP_40_ENABLED
2286     // AC: skip __kmp_internal_fork at teams construct, let only master
2287     // threads execute
2288     if (ap)
2289 #endif /* OMP_40_ENABLED */
2290     {
2291       __kmp_internal_fork(loc, gtid, team);
2292       KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2293                     "master_th=%p, gtid=%d\n",
2294                     root, team, master_th, gtid));
2295     }
2296 
2297     if (call_context == fork_context_gnu) {
2298       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2299       return TRUE;
2300     }
2301 
2302     /* Invoke microtask for MASTER thread */
2303     KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2304                   team->t.t_id, team->t.t_pkfn));
2305   } // END of timer KMP_fork_call block
2306 
2307   if (!team->t.t_invoke(gtid)) {
2308     KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
2309   }
2310   KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2311                 team->t.t_id, team->t.t_pkfn));
2312   KMP_MB(); /* Flush all pending memory write invalidates.  */
2313 
2314   KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2315 
2316 #if OMPT_SUPPORT
2317   if (ompt_enabled.enabled) {
2318     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2319   }
2320 #endif
2321 
2322   return TRUE;
2323 }
2324 
2325 #if OMPT_SUPPORT
2326 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2327                                             kmp_team_t *team) {
2328   // restore state outside the region
2329   thread->th.ompt_thread_info.state =
2330       ((team->t.t_serialized) ? ompt_state_work_serial
2331                               : ompt_state_work_parallel);
2332 }
2333 
2334 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2335                                    kmp_team_t *team, ompt_data_t *parallel_data,
2336                                    fork_context_e fork_context, void *codeptr) {
2337   ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2338   if (ompt_enabled.ompt_callback_parallel_end) {
2339     ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2340         parallel_data, &(task_info->task_data), OMPT_INVOKER(fork_context),
2341         codeptr);
2342   }
2343 
2344   task_info->frame.enter_frame = ompt_data_none;
2345   __kmp_join_restore_state(thread, team);
2346 }
2347 #endif
2348 
2349 void __kmp_join_call(ident_t *loc, int gtid
2350 #if OMPT_SUPPORT
2351                      ,
2352                      enum fork_context_e fork_context
2353 #endif
2354 #if OMP_40_ENABLED
2355                      ,
2356                      int exit_teams
2357 #endif /* OMP_40_ENABLED */
2358                      ) {
2359   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2360   kmp_team_t *team;
2361   kmp_team_t *parent_team;
2362   kmp_info_t *master_th;
2363   kmp_root_t *root;
2364   int master_active;
2365   int i;
2366 
2367   KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2368 
2369   /* setup current data */
2370   master_th = __kmp_threads[gtid];
2371   root = master_th->th.th_root;
2372   team = master_th->th.th_team;
2373   parent_team = team->t.t_parent;
2374 
2375   master_th->th.th_ident = loc;
2376 
2377 #if OMPT_SUPPORT
2378   if (ompt_enabled.enabled) {
2379     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2380   }
2381 #endif
2382 
2383 #if KMP_DEBUG
2384   if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2385     KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2386                   "th_task_team = %p\n",
2387                   __kmp_gtid_from_thread(master_th), team,
2388                   team->t.t_task_team[master_th->th.th_task_state],
2389                   master_th->th.th_task_team));
2390     KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2391                      team->t.t_task_team[master_th->th.th_task_state]);
2392   }
2393 #endif
2394 
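       // Serialized (possibly nested) region: unwind it via
       // __kmpc_end_serialized_parallel and return without a join barrier.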
2395   if (team->t.t_serialized) {
2396 #if OMP_40_ENABLED
2397     if (master_th->th.th_teams_microtask) {
2398       // We are in teams construct
2399       int level = team->t.t_level;
2400       int tlevel = master_th->th.th_teams_level;
2401       if (level == tlevel) {
2402         // AC: we haven't incremented it earlier at start of teams construct,
2403         //     so do it here - at the end of teams construct
2404         team->t.t_level++;
2405       } else if (level == tlevel + 1) {
2406         // AC: we are exiting parallel inside teams, need to increment
2407         // serialization in order to restore it in the next call to
2408         // __kmpc_end_serialized_parallel
2409         team->t.t_serialized++;
2410       }
2411     }
2412 #endif /* OMP_40_ENABLED */
2413     __kmpc_end_serialized_parallel(loc, gtid);
2414 
2415 #if OMPT_SUPPORT
2416     if (ompt_enabled.enabled) {
2417       __kmp_join_restore_state(master_th, parent_team);
2418     }
2419 #endif
2420 
2421     return;
2422   }
2423 
2424   master_active = team->t.t_master_active;
2425 
2426 #if OMP_40_ENABLED
2427   if (!exit_teams)
2428 #endif /* OMP_40_ENABLED */
2429   {
2430     // AC: No barrier for internal teams at exit from teams construct.
2431     //     But there is a barrier for the external team (league).
2432     __kmp_internal_join(loc, gtid, team);
2433   }
2434 #if OMP_40_ENABLED
2435   else {
2436     master_th->th.th_task_state =
2437         0; // AC: no tasking in teams (out of any parallel)
2438   }
2439 #endif /* OMP_40_ENABLED */
2440 
2441   KMP_MB();
2442 
2443 #if OMPT_SUPPORT
2444   ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2445   void *codeptr = team->t.ompt_team_info.master_return_address;
2446 #endif
2447 
2448 #if USE_ITT_BUILD
2449   if (__itt_stack_caller_create_ptr) {
2450     __kmp_itt_stack_caller_destroy(
2451         (__itt_caller)team->t
2452             .t_stack_id); // destroy the stack stitching id after join barrier
2453   }
2454 
2455   // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2456   if (team->t.t_active_level == 1
2457 #if OMP_40_ENABLED
2458       && !master_th->th.th_teams_microtask /* not in teams construct */
2459 #endif /* OMP_40_ENABLED */
2460       ) {
2461     master_th->th.th_ident = loc;
2462     // only one notification scheme (either "submit" or "forking/joined", not
2463     // both)
2464     if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2465         __kmp_forkjoin_frames_mode == 3)
2466       __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2467                              master_th->th.th_frame_time, 0, loc,
2468                              master_th->th.th_team_nproc, 1);
2469     else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2470              !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2471       __kmp_itt_region_joined(gtid);
2472   } // active_level == 1
2473 #endif /* USE_ITT_BUILD */
2474 
2475 #if OMP_40_ENABLED
2476   if (master_th->th.th_teams_microtask && !exit_teams &&
2477       team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2478       team->t.t_level == master_th->th.th_teams_level + 1) {
2479     // AC: We need to leave the team structure intact at the end of a parallel
2480     // inside the teams construct, so that the same (hot) team works at the
2481     // next parallel region; only adjust the nesting levels
2482 
2483     /* Decrement our nested depth level */
2484     team->t.t_level--;
2485     team->t.t_active_level--;
2486     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2487 
2488     /* Restore number of threads in the team if needed */
2489     if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2490       int old_num = master_th->th.th_team_nproc;
2491       int new_num = master_th->th.th_teams_size.nth;
2492       kmp_info_t **other_threads = team->t.t_threads;
2493       team->t.t_nproc = new_num;
2494       for (i = 0; i < old_num; ++i) {
2495         other_threads[i]->th.th_team_nproc = new_num;
2496       }
2497       // Adjust states of non-used threads of the team
2498       for (i = old_num; i < new_num; ++i) {
2499         // Re-initialize thread's barrier data.
2500         int b;
2501         kmp_balign_t *balign = other_threads[i]->th.th_bar;
2502         for (b = 0; b < bs_last_barrier; ++b) {
2503           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2504           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2505 #if USE_DEBUGGER
2506           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2507 #endif
2508         }
2509         if (__kmp_tasking_mode != tskm_immediate_exec) {
2510           // Synchronize thread's task state
2511           other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2512         }
2513       }
2514     }
2515 
2516 #if OMPT_SUPPORT
2517     if (ompt_enabled.enabled) {
2518       __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
2519                       codeptr);
2520     }
2521 #endif
2522 
2523     return;
2524   }
2525 #endif /* OMP_40_ENABLED */
2526 
2527   /* do cleanup and restore the parent team */
2528   master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2529   master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2530 
2531   master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2532 
2533   /* jc: The following lock has instructions with REL and ACQ semantics,
2534      separating the parallel user code called in this parallel region
2535      from the serial user code called after this function returns. */
2536   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2537 
2538 #if OMP_40_ENABLED
2539   if (!master_th->th.th_teams_microtask ||
2540       team->t.t_level > master_th->th.th_teams_level)
2541 #endif /* OMP_40_ENABLED */
2542   {
2543     /* Decrement our nested depth level */
2544     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2545   }
2546   KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2547 
2548 #if OMPT_SUPPORT
2549   if (ompt_enabled.enabled) {
2550     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2551     if (ompt_enabled.ompt_callback_implicit_task) {
2552       int ompt_team_size = team->t.t_nproc;
2553       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2554           ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2555           OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
2556     }
2557 
2558     task_info->frame.exit_frame = ompt_data_none;
2559     task_info->task_data = ompt_data_none;
2560   }
2561 #endif
2562 
2563   KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2564                 master_th, team));
2565   __kmp_pop_current_task_from_thread(master_th);
2566 
2567 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
2568   // Restore master thread's partition.
2569   master_th->th.th_first_place = team->t.t_first_place;
2570   master_th->th.th_last_place = team->t.t_last_place;
2571 #endif /* OMP_40_ENABLED */
2572 #if OMP_50_ENABLED
2573   master_th->th.th_def_allocator = team->t.t_def_allocator;
2574 #endif
2575 
2576   updateHWFPControl(team);
2577 
2578   if (root->r.r_active != master_active)
2579     root->r.r_active = master_active;
2580 
2581   __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2582                             master_th)); // this will free worker threads
2583 
2584   /* This race was fun to find. Make sure the following is inside the critical
2585      region; otherwise assertions may fail occasionally since the old team may be
2586      reallocated and the hierarchy appears inconsistent. It is actually safe to
2587      run and won't cause any bugs, but it will cause those assertion failures.
2588      It's only one deref & assign, so we might as well keep it in the critical region. */
2589   master_th->th.th_team = parent_team;
2590   master_th->th.th_team_nproc = parent_team->t.t_nproc;
2591   master_th->th.th_team_master = parent_team->t.t_threads[0];
2592   master_th->th.th_team_serialized = parent_team->t.t_serialized;
2593 
2594   /* restore serialized team, if need be */
2595   if (parent_team->t.t_serialized &&
2596       parent_team != master_th->th.th_serial_team &&
2597       parent_team != root->r.r_root_team) {
2598     __kmp_free_team(root,
2599                     master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2600     master_th->th.th_serial_team = parent_team;
2601   }
2602 
2603   if (__kmp_tasking_mode != tskm_immediate_exec) {
2604     if (master_th->th.th_task_state_top >
2605         0) { // Restore task state from memo stack
2606       KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2607       // Remember master's state if we re-use this nested hot team
2608       master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2609           master_th->th.th_task_state;
2610       --master_th->th.th_task_state_top; // pop
2611       // Now restore state at this level
2612       master_th->th.th_task_state =
2613           master_th->th
2614               .th_task_state_memo_stack[master_th->th.th_task_state_top];
2615     }
2616     // Copy the task team from the parent team to the master thread
2617     master_th->th.th_task_team =
2618         parent_team->t.t_task_team[master_th->th.th_task_state];
2619     KA_TRACE(20,
2620              ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2621               __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2622               parent_team));
2623   }
2624 
2625   // TODO: GEH - cannot do this assertion because root thread not set up as
2626   // executing
2627   // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2628   master_th->th.th_current_task->td_flags.executing = 1;
2629 
2630   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2631 
2632 #if OMPT_SUPPORT
2633   if (ompt_enabled.enabled) {
2634     __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
2635                     codeptr);
2636   }
2637 #endif
2638 
2639   KMP_MB();
2640   KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2641 }
2642 
2643 /* Check whether we should push an internal control record onto the
2644    serial team stack.  If so, do it.  */
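     /* A record is pushed only when the thread is currently running on its
        serial team, the serialized nesting depth is greater than one, and no
        record has been pushed yet for that depth. */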
2645 void __kmp_save_internal_controls(kmp_info_t *thread) {
2646 
2647   if (thread->th.th_team != thread->th.th_serial_team) {
2648     return;
2649   }
2650   if (thread->th.th_team->t.t_serialized > 1) {
2651     int push = 0;
2652 
2653     if (thread->th.th_team->t.t_control_stack_top == NULL) {
2654       push = 1;
2655     } else {
2656       if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2657           thread->th.th_team->t.t_serialized) {
2658         push = 1;
2659       }
2660     }
2661     if (push) { /* push a record on the serial team's stack */
2662       kmp_internal_control_t *control =
2663           (kmp_internal_control_t *)__kmp_allocate(
2664               sizeof(kmp_internal_control_t));
2665 
2666       copy_icvs(control, &thread->th.th_current_task->td_icvs);
2667 
2668       control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2669 
2670       control->next = thread->th.th_team->t.t_control_stack_top;
2671       thread->th.th_team->t.t_control_stack_top = control;
2672     }
2673   }
2674 }
2675 
2676 /* Changes set_nproc */
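     /* Backs the omp_set_num_threads() API (typically reached through the
        library's API wrappers): the request is clamped to [1, __kmp_max_nth] and
        stored in the calling thread's nproc ICV; when the root is not in an
        active parallel region and the hot team is larger than the new value, the
        hot team is shrunk right away rather than at the next fork.
     
        Illustrative user-level effect (a sketch, not part of this file):
     
          omp_set_num_threads(8);  // updates the caller's nproc ICV
          #pragma omp parallel     // subsequent regions default to 8 threads
          { ... }
     */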
2677 void __kmp_set_num_threads(int new_nth, int gtid) {
2678   kmp_info_t *thread;
2679   kmp_root_t *root;
2680 
2681   KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2682   KMP_DEBUG_ASSERT(__kmp_init_serial);
2683 
2684   if (new_nth < 1)
2685     new_nth = 1;
2686   else if (new_nth > __kmp_max_nth)
2687     new_nth = __kmp_max_nth;
2688 
2689   KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2690   thread = __kmp_threads[gtid];
2691   if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2692     return; // nothing to do
2693 
2694   __kmp_save_internal_controls(thread);
2695 
2696   set__nproc(thread, new_nth);
2697 
2698   // If this omp_set_num_threads() call will cause the hot team size to be
2699   // reduced (in the absence of a num_threads clause), then reduce it now,
2700   // rather than waiting for the next parallel region.
2701   root = thread->th.th_root;
2702   if (__kmp_init_parallel && (!root->r.r_active) &&
2703       (root->r.r_hot_team->t.t_nproc > new_nth)
2704 #if KMP_NESTED_HOT_TEAMS
2705       && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2706 #endif
2707       ) {
2708     kmp_team_t *hot_team = root->r.r_hot_team;
2709     int f;
2710 
2711     __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2712 
2713     // Release the extra threads we don't need any more.
2714     for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2715       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2716       if (__kmp_tasking_mode != tskm_immediate_exec) {
2717         // When decreasing the team size, threads no longer in the team should
2718         // unreference the task team.
2719         hot_team->t.t_threads[f]->th.th_task_team = NULL;
2720       }
2721       __kmp_free_thread(hot_team->t.t_threads[f]);
2722       hot_team->t.t_threads[f] = NULL;
2723     }
2724     hot_team->t.t_nproc = new_nth;
2725 #if KMP_NESTED_HOT_TEAMS
2726     if (thread->th.th_hot_teams) {
2727       KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2728       thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2729     }
2730 #endif
2731 
2732     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2733 
2734     // Update the t_nproc field in the threads that are still active.
2735     for (f = 0; f < new_nth; f++) {
2736       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2737       hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2738     }
2739     // Special flag marking that the size change came from omp_set_num_threads()
2740     hot_team->t.t_size_changed = -1;
2741   }
2742 }
2743 
2744 /* Changes max_active_levels */
2745 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2746   kmp_info_t *thread;
2747 
2748   KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2749                 "%d = (%d)\n",
2750                 gtid, max_active_levels));
2751   KMP_DEBUG_ASSERT(__kmp_init_serial);
2752 
2753   // validate max_active_levels
2754   if (max_active_levels < 0) {
2755     KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2756     // We ignore this call if the user has specified a negative value.
2757     // The current setting won't be changed. The last valid setting will be
2758     // used. A warning will be issued (if warnings are allowed as controlled by
2759     // the KMP_WARNINGS env var).
2760     KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2761                   "max_active_levels for thread %d = (%d)\n",
2762                   gtid, max_active_levels));
2763     return;
2764   }
2765   if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
    // OK: max_active_levels is within the valid range
    // [0, KMP_MAX_ACTIVE_LEVELS_LIMIT].
    // A zero value is allowed. (implementation defined behavior)
2769   } else {
2770     KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2771                 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2772     max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2773     // Current upper limit is MAX_INT. (implementation defined behavior)
2774     // If the input exceeds the upper limit, we correct the input to be the
2775     // upper limit. (implementation defined behavior)
    // Actually, control should never reach this point as long as the upper
    // limit is MAX_INT.
2777   }
2778   KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2779                 "max_active_levels for thread %d = (%d)\n",
2780                 gtid, max_active_levels));
2781 
2782   thread = __kmp_threads[gtid];
2783 
2784   __kmp_save_internal_controls(thread);
2785 
2786   set__max_active_levels(thread, max_active_levels);
2787 }
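
// Illustrative note for __kmp_set_max_active_levels() above: assuming the
// usual mapping from omp_set_max_active_levels(), a negative argument is
// ignored with a warning (the previous setting remains in effect), zero is
// accepted, and anything above KMP_MAX_ACTIVE_LEVELS_LIMIT is clamped to that
// limit, matching the validation performed above.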
2788 
2789 /* Gets max_active_levels */
2790 int __kmp_get_max_active_levels(int gtid) {
2791   kmp_info_t *thread;
2792 
2793   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2794   KMP_DEBUG_ASSERT(__kmp_init_serial);
2795 
2796   thread = __kmp_threads[gtid];
2797   KMP_DEBUG_ASSERT(thread->th.th_current_task);
2798   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2799                 "curtask_maxaclevel=%d\n",
2800                 gtid, thread->th.th_current_task,
2801                 thread->th.th_current_task->td_icvs.max_active_levels));
2802   return thread->th.th_current_task->td_icvs.max_active_levels;
2803 }
2804 
2805 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2806 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2807   kmp_info_t *thread;
2808   //    kmp_team_t *team;
2809 
2810   KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2811                 gtid, (int)kind, chunk));
2812   KMP_DEBUG_ASSERT(__kmp_init_serial);
2813 
2814   // Check if the kind parameter is valid, correct if needed.
2815   // Valid parameters should fit in one of two intervals - standard or extended:
2816   //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2817   // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
2818   if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2819       (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2820     // TODO: Hint needs attention in case we change the default schedule.
2821     __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2822               KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2823               __kmp_msg_null);
2824     kind = kmp_sched_default;
2825     chunk = 0; // ignore chunk value in case of bad kind
2826   }
2827 
2828   thread = __kmp_threads[gtid];
2829 
2830   __kmp_save_internal_controls(thread);
2831 
2832   if (kind < kmp_sched_upper_std) {
2833     if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
      // distinguish static chunked vs. unchunked: an invalid chunk value
      // indicates the unchunked schedule (which is the default)
2836       thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2837     } else {
2838       thread->th.th_current_task->td_icvs.sched.r_sched_type =
2839           __kmp_sch_map[kind - kmp_sched_lower - 1];
2840     }
2841   } else {
2842     //    __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2843     //    kmp_sched_lower - 2 ];
2844     thread->th.th_current_task->td_icvs.sched.r_sched_type =
2845         __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2846                       kmp_sched_lower - 2];
2847   }
2848   if (kind == kmp_sched_auto || chunk < 1) {
2849     // ignore parameter chunk for schedule auto
2850     thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2851   } else {
2852     thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2853   }
2854 }
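
// Illustrative note for __kmp_set_schedule() above, assuming the usual
// contents of __kmp_sch_map: a call with (kmp_sched_dynamic, 4) stores
// kmp_sch_dynamic_chunked with chunk 4 in the caller's sched ICV, while a
// kmp_sched_static request with a chunk below KMP_DEFAULT_CHUNK stores plain
// (unchunked) kmp_sch_static and the stored chunk falls back to
// KMP_DEFAULT_CHUNK.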
2855 
2856 /* Gets def_sched_var ICV values */
2857 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2858   kmp_info_t *thread;
2859   enum sched_type th_type;
2860 
2861   KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2862   KMP_DEBUG_ASSERT(__kmp_init_serial);
2863 
2864   thread = __kmp_threads[gtid];
2865 
2866   th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2867 
2868   switch (th_type) {
2869   case kmp_sch_static:
2870   case kmp_sch_static_greedy:
2871   case kmp_sch_static_balanced:
2872     *kind = kmp_sched_static;
2873     *chunk = 0; // chunk was not set, try to show this fact via zero value
2874     return;
2875   case kmp_sch_static_chunked:
2876     *kind = kmp_sched_static;
2877     break;
2878   case kmp_sch_dynamic_chunked:
2879     *kind = kmp_sched_dynamic;
2880     break;
2881   case kmp_sch_guided_chunked:
2882   case kmp_sch_guided_iterative_chunked:
2883   case kmp_sch_guided_analytical_chunked:
2884     *kind = kmp_sched_guided;
2885     break;
2886   case kmp_sch_auto:
2887     *kind = kmp_sched_auto;
2888     break;
2889   case kmp_sch_trapezoidal:
2890     *kind = kmp_sched_trapezoidal;
2891     break;
2892 #if KMP_STATIC_STEAL_ENABLED
2893   case kmp_sch_static_steal:
2894     *kind = kmp_sched_static_steal;
2895     break;
2896 #endif
2897   default:
2898     KMP_FATAL(UnknownSchedulingType, th_type);
2899   }
2900 
2901   *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2902 }
2903 
2904 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2905 
2906   int ii, dd;
2907   kmp_team_t *team;
2908   kmp_info_t *thr;
2909 
2910   KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2911   KMP_DEBUG_ASSERT(__kmp_init_serial);
2912 
2913   // validate level
2914   if (level == 0)
2915     return 0;
2916   if (level < 0)
2917     return -1;
2918   thr = __kmp_threads[gtid];
2919   team = thr->th.th_team;
2920   ii = team->t.t_level;
2921   if (level > ii)
2922     return -1;
2923 
2924 #if OMP_40_ENABLED
2925   if (thr->th.th_teams_microtask) {
2926     // AC: we are in teams region where multiple nested teams have same level
2927     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2928     if (level <=
2929         tlevel) { // otherwise usual algorithm works (will not touch the teams)
2930       KMP_DEBUG_ASSERT(ii >= tlevel);
2931       // AC: As we need to pass by the teams league, we need to artificially
2932       // increase ii
2933       if (ii == tlevel) {
2934         ii += 2; // three teams have same level
2935       } else {
2936         ii++; // two teams have same level
2937       }
2938     }
2939   }
2940 #endif
2941 
2942   if (ii == level)
2943     return __kmp_tid_from_gtid(gtid);
2944 
2945   dd = team->t.t_serialized;
2946   level++;
2947   while (ii > level) {
2948     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2949     }
2950     if ((team->t.t_serialized) && (!dd)) {
2951       team = team->t.t_parent;
2952       continue;
2953     }
2954     if (ii > level) {
2955       team = team->t.t_parent;
2956       dd = team->t.t_serialized;
2957       ii--;
2958     }
2959   }
2960 
2961   return (dd > 1) ? (0) : (team->t.t_master_tid);
2962 }
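
// Illustrative note for __kmp_get_ancestor_thread_num() above: level 0 always
// yields 0, a level equal to the current nesting level yields the caller's own
// thread number, and a negative level or one deeper than the current nesting
// yields -1, matching the validation at the top of the routine.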
2963 
2964 int __kmp_get_team_size(int gtid, int level) {
2965 
2966   int ii, dd;
2967   kmp_team_t *team;
2968   kmp_info_t *thr;
2969 
2970   KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2971   KMP_DEBUG_ASSERT(__kmp_init_serial);
2972 
2973   // validate level
2974   if (level == 0)
2975     return 1;
2976   if (level < 0)
2977     return -1;
2978   thr = __kmp_threads[gtid];
2979   team = thr->th.th_team;
2980   ii = team->t.t_level;
2981   if (level > ii)
2982     return -1;
2983 
2984 #if OMP_40_ENABLED
2985   if (thr->th.th_teams_microtask) {
2986     // AC: we are in teams region where multiple nested teams have same level
2987     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2988     if (level <=
2989         tlevel) { // otherwise usual algorithm works (will not touch the teams)
2990       KMP_DEBUG_ASSERT(ii >= tlevel);
2991       // AC: As we need to pass by the teams league, we need to artificially
2992       // increase ii
2993       if (ii == tlevel) {
2994         ii += 2; // three teams have same level
2995       } else {
2996         ii++; // two teams have same level
2997       }
2998     }
2999   }
3000 #endif
3001 
3002   while (ii > level) {
3003     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3004     }
3005     if (team->t.t_serialized && (!dd)) {
3006       team = team->t.t_parent;
3007       continue;
3008     }
3009     if (ii > level) {
3010       team = team->t.t_parent;
3011       ii--;
3012     }
3013   }
3014 
3015   return team->t.t_nproc;
3016 }
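
// Illustrative note for __kmp_get_team_size() above: level 0 always yields 1,
// a negative level or one deeper than the current nesting yields -1, and
// otherwise the t_nproc of the team found at the requested level is returned.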
3017 
3018 kmp_r_sched_t __kmp_get_schedule_global() {
  // This routine exists because the pairs (__kmp_sched, __kmp_chunk) and
  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
  // independently, so the up-to-date schedule can be obtained here.
3022 
3023   kmp_r_sched_t r_sched;
3024 
3025   // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
3026   // __kmp_guided. __kmp_sched should keep original value, so that user can set
3027   // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
3028   // different roots (even in OMP 2.5)
3029   if (__kmp_sched == kmp_sch_static) {
3030     // replace STATIC with more detailed schedule (balanced or greedy)
3031     r_sched.r_sched_type = __kmp_static;
3032   } else if (__kmp_sched == kmp_sch_guided_chunked) {
3033     // replace GUIDED with more detailed schedule (iterative or analytical)
3034     r_sched.r_sched_type = __kmp_guided;
3035   } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
3036     r_sched.r_sched_type = __kmp_sched;
3037   }
3038 
3039   if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
    // __kmp_chunk may be wrong here (if it was never set)
3041     r_sched.chunk = KMP_DEFAULT_CHUNK;
3042   } else {
3043     r_sched.chunk = __kmp_chunk;
3044   }
3045 
3046   return r_sched;
3047 }
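
// Illustrative note for __kmp_get_schedule_global() above: if __kmp_sched is
// plain kmp_sch_static, the returned r_sched_type is the more detailed
// __kmp_static kind (balanced or greedy); likewise plain guided is replaced by
// __kmp_guided; any other kind (e.g. STATIC_CHUNKED or DYNAMIC_CHUNKED) is
// passed through unchanged, and a chunk below KMP_DEFAULT_CHUNK (e.g. never
// set) falls back to KMP_DEFAULT_CHUNK.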
3048 
/* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
   at least argc *t_argv entries for the requested team. */
3051 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3052 
3053   KMP_DEBUG_ASSERT(team);
3054   if (!realloc || argc > team->t.t_max_argc) {
3055 
3056     KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3057                    "current entries=%d\n",
3058                    team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3059     /* if previously allocated heap space for args, free them */
3060     if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3061       __kmp_free((void *)team->t.t_argv);
3062 
3063     if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3064       /* use unused space in the cache line for arguments */
3065       team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3066       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3067                      "argv entries\n",
3068                      team->t.t_id, team->t.t_max_argc));
3069       team->t.t_argv = &team->t.t_inline_argv[0];
3070       if (__kmp_storage_map) {
3071         __kmp_print_storage_map_gtid(
3072             -1, &team->t.t_inline_argv[0],
3073             &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3074             (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3075             team->t.t_id);
3076       }
3077     } else {
3078       /* allocate space for arguments in the heap */
3079       team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3080                                ? KMP_MIN_MALLOC_ARGV_ENTRIES
3081                                : 2 * argc;
3082       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3083                      "argv entries\n",
3084                      team->t.t_id, team->t.t_max_argc));
3085       team->t.t_argv =
3086           (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3087       if (__kmp_storage_map) {
3088         __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3089                                      &team->t.t_argv[team->t.t_max_argc],
3090                                      sizeof(void *) * team->t.t_max_argc,
3091                                      "team_%d.t_argv", team->t.t_id);
3092       }
3093     }
3094   }
3095 }
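
// Illustrative note for __kmp_alloc_argv_entries() above: a small argc (up to
// KMP_INLINE_ARGV_ENTRIES) reuses the inline storage in the team structure's
// cache line, while a larger argc allocates a heap array of effectively
// max(KMP_MIN_MALLOC_ARGV_ENTRIES, 2 * argc) entries, presumably so that
// modest future growth does not force an immediate reallocation.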
3096 
3097 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3098   int i;
3099   int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3100   team->t.t_threads =
3101       (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3102   team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3103       sizeof(dispatch_shared_info_t) * num_disp_buff);
3104   team->t.t_dispatch =
3105       (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3106   team->t.t_implicit_task_taskdata =
3107       (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3108   team->t.t_max_nproc = max_nth;
3109 
3110   /* setup dispatch buffers */
3111   for (i = 0; i < num_disp_buff; ++i) {
3112     team->t.t_disp_buffer[i].buffer_index = i;
3113 #if OMP_45_ENABLED
3114     team->t.t_disp_buffer[i].doacross_buf_idx = i;
3115 #endif
3116   }
3117 }
3118 
3119 static void __kmp_free_team_arrays(kmp_team_t *team) {
3120   /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3121   int i;
3122   for (i = 0; i < team->t.t_max_nproc; ++i) {
3123     if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3124       __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3125       team->t.t_dispatch[i].th_disp_buffer = NULL;
3126     }
3127   }
3128 #if KMP_USE_HIER_SCHED
3129   __kmp_dispatch_free_hierarchies(team);
3130 #endif
3131   __kmp_free(team->t.t_threads);
3132   __kmp_free(team->t.t_disp_buffer);
3133   __kmp_free(team->t.t_dispatch);
3134   __kmp_free(team->t.t_implicit_task_taskdata);
3135   team->t.t_threads = NULL;
3136   team->t.t_disp_buffer = NULL;
3137   team->t.t_dispatch = NULL;
3138   team->t.t_implicit_task_taskdata = 0;
3139 }
3140 
3141 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3142   kmp_info_t **oldThreads = team->t.t_threads;
3143 
3144   __kmp_free(team->t.t_disp_buffer);
3145   __kmp_free(team->t.t_dispatch);
3146   __kmp_free(team->t.t_implicit_task_taskdata);
3147   __kmp_allocate_team_arrays(team, max_nth);
3148 
3149   KMP_MEMCPY(team->t.t_threads, oldThreads,
3150              team->t.t_nproc * sizeof(kmp_info_t *));
3151 
3152   __kmp_free(oldThreads);
3153 }
3154 
3155 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3156 
3157   kmp_r_sched_t r_sched =
3158       __kmp_get_schedule_global(); // get current state of scheduling globals
3159 
3160 #if OMP_40_ENABLED
3161   KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3162 #endif /* OMP_40_ENABLED */
3163 
3164   kmp_internal_control_t g_icvs = {
3165     0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3166     (kmp_int8)__kmp_dflt_nested, // int nested; //internal control
3167     // for nested parallelism (per thread)
3168     (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3169     // adjustment of threads (per thread)
3170     (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3171     // whether blocktime is explicitly set
3172     __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3173 #if KMP_USE_MONITOR
3174     __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3175 // intervals
3176 #endif
3177     __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3178     // next parallel region (per thread)
3179     // (use a max ub on value if __kmp_parallel_initialize not called yet)
3180     __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3181     // for max_active_levels
3182     r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3183 // {sched,chunk} pair
3184 #if OMP_40_ENABLED
3185     __kmp_nested_proc_bind.bind_types[0],
3186     __kmp_default_device,
3187 #endif /* OMP_40_ENABLED */
3188     NULL // struct kmp_internal_control *next;
3189   };
3190 
3191   return g_icvs;
3192 }
3193 
3194 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3195 
3196   kmp_internal_control_t gx_icvs;
3197   gx_icvs.serial_nesting_level =
3198       0; // probably =team->t.t_serial like in save_inter_controls
3199   copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3200   gx_icvs.next = NULL;
3201 
3202   return gx_icvs;
3203 }
3204 
3205 static void __kmp_initialize_root(kmp_root_t *root) {
3206   int f;
3207   kmp_team_t *root_team;
3208   kmp_team_t *hot_team;
3209   int hot_team_max_nth;
3210   kmp_r_sched_t r_sched =
3211       __kmp_get_schedule_global(); // get current state of scheduling globals
3212   kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3213   KMP_DEBUG_ASSERT(root);
3214   KMP_ASSERT(!root->r.r_begin);
3215 
3216   /* setup the root state structure */
3217   __kmp_init_lock(&root->r.r_begin_lock);
3218   root->r.r_begin = FALSE;
3219   root->r.r_active = FALSE;
3220   root->r.r_in_parallel = 0;
3221   root->r.r_blocktime = __kmp_dflt_blocktime;
3222   root->r.r_nested = __kmp_dflt_nested;
3223   root->r.r_cg_nthreads = 1;
3224 
3225   /* setup the root team for this task */
3226   /* allocate the root team structure */
3227   KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3228 
3229   root_team =
3230       __kmp_allocate_team(root,
3231                           1, // new_nproc
3232                           1, // max_nproc
3233 #if OMPT_SUPPORT
3234                           ompt_data_none, // root parallel id
3235 #endif
3236 #if OMP_40_ENABLED
3237                           __kmp_nested_proc_bind.bind_types[0],
3238 #endif
3239                           &r_icvs,
3240                           0 // argc
3241                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3242                           );
3243 #if USE_DEBUGGER
3244   // Non-NULL value should be assigned to make the debugger display the root
3245   // team.
3246   TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3247 #endif
3248 
3249   KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3250 
3251   root->r.r_root_team = root_team;
3252   root_team->t.t_control_stack_top = NULL;
3253 
3254   /* initialize root team */
3255   root_team->t.t_threads[0] = NULL;
3256   root_team->t.t_nproc = 1;
3257   root_team->t.t_serialized = 1;
3258   // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3259   root_team->t.t_sched.sched = r_sched.sched;
3260   KA_TRACE(
3261       20,
3262       ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3263        root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3264 
3265   /* setup the  hot team for this task */
3266   /* allocate the hot team structure */
3267   KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3268 
3269   hot_team =
3270       __kmp_allocate_team(root,
3271                           1, // new_nproc
3272                           __kmp_dflt_team_nth_ub * 2, // max_nproc
3273 #if OMPT_SUPPORT
3274                           ompt_data_none, // root parallel id
3275 #endif
3276 #if OMP_40_ENABLED
3277                           __kmp_nested_proc_bind.bind_types[0],
3278 #endif
3279                           &r_icvs,
3280                           0 // argc
3281                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3282                           );
3283   KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3284 
3285   root->r.r_hot_team = hot_team;
3286   root_team->t.t_control_stack_top = NULL;
3287 
3288   /* first-time initialization */
3289   hot_team->t.t_parent = root_team;
3290 
3291   /* initialize hot team */
3292   hot_team_max_nth = hot_team->t.t_max_nproc;
3293   for (f = 0; f < hot_team_max_nth; ++f) {
3294     hot_team->t.t_threads[f] = NULL;
3295   }
3296   hot_team->t.t_nproc = 1;
3297   // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3298   hot_team->t.t_sched.sched = r_sched.sched;
3299   hot_team->t.t_size_changed = 0;
3300 }
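
// Illustrative note for __kmp_initialize_root() above: the root team is
// allocated with max_nproc == 1 (it only ever holds the root thread itself),
// while the hot team is allocated with max_nproc == __kmp_dflt_team_nth_ub * 2,
// presumably to give it headroom to grow across parallel regions without an
// immediate reallocation of its arrays.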
3301 
3302 #ifdef KMP_DEBUG
3303 
3304 typedef struct kmp_team_list_item {
3305   kmp_team_p const *entry;
3306   struct kmp_team_list_item *next;
3307 } kmp_team_list_item_t;
3308 typedef kmp_team_list_item_t *kmp_team_list_t;
3309 
3310 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3311     kmp_team_list_t list, // List of teams.
3312     kmp_team_p const *team // Team to add.
3313     ) {
3314 
3315   // List must terminate with item where both entry and next are NULL.
3316   // Team is added to the list only once.
3317   // List is sorted in ascending order by team id.
3318   // Team id is *not* a key.
3319 
3320   kmp_team_list_t l;
3321 
3322   KMP_DEBUG_ASSERT(list != NULL);
3323   if (team == NULL) {
3324     return;
3325   }
3326 
3327   __kmp_print_structure_team_accum(list, team->t.t_parent);
3328   __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3329 
3330   // Search list for the team.
3331   l = list;
3332   while (l->next != NULL && l->entry != team) {
3333     l = l->next;
3334   }
3335   if (l->next != NULL) {
3336     return; // Team has been added before, exit.
3337   }
3338 
3339   // Team is not found. Search list again for insertion point.
3340   l = list;
3341   while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3342     l = l->next;
3343   }
3344 
3345   // Insert team.
3346   {
3347     kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3348         sizeof(kmp_team_list_item_t));
3349     *item = *l;
3350     l->entry = team;
3351     l->next = item;
3352   }
3353 }
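
// Illustrative note for __kmp_print_structure_team_accum() above: the list is
// terminated by a sentinel item whose entry and next are both NULL; insertion
// copies the node at the insertion point into a freshly malloc'ed item and
// then overwrites the original node with the new team, which keeps the list
// sorted by t_id without needing a separate head pointer or back links.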
3354 
3355 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team
3356 
3357                                        ) {
3358   __kmp_printf("%s", title);
3359   if (team != NULL) {
3360     __kmp_printf("%2x %p\n", team->t.t_id, team);
3361   } else {
3362     __kmp_printf(" - (nil)\n");
3363   }
3364 }
3365 
3366 static void __kmp_print_structure_thread(char const *title,
3367                                          kmp_info_p const *thread) {
3368   __kmp_printf("%s", title);
3369   if (thread != NULL) {
3370     __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3371   } else {
3372     __kmp_printf(" - (nil)\n");
3373   }
3374 }
3375 
3376 void __kmp_print_structure(void) {
3377 
3378   kmp_team_list_t list;
3379 
3380   // Initialize list of teams.
3381   list =
3382       (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3383   list->entry = NULL;
3384   list->next = NULL;
3385 
3386   __kmp_printf("\n------------------------------\nGlobal Thread "
3387                "Table\n------------------------------\n");
3388   {
3389     int gtid;
3390     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3391       __kmp_printf("%2d", gtid);
3392       if (__kmp_threads != NULL) {
3393         __kmp_printf(" %p", __kmp_threads[gtid]);
3394       }
3395       if (__kmp_root != NULL) {
3396         __kmp_printf(" %p", __kmp_root[gtid]);
3397       }
3398       __kmp_printf("\n");
3399     }
3400   }
3401 
3402   // Print out __kmp_threads array.
3403   __kmp_printf("\n------------------------------\nThreads\n--------------------"
3404                "----------\n");
3405   if (__kmp_threads != NULL) {
3406     int gtid;
3407     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3408       kmp_info_t const *thread = __kmp_threads[gtid];
3409       if (thread != NULL) {
3410         __kmp_printf("GTID %2d %p:\n", gtid, thread);
3411         __kmp_printf("    Our Root:        %p\n", thread->th.th_root);
3412         __kmp_print_structure_team("    Our Team:     ", thread->th.th_team);
3413         __kmp_print_structure_team("    Serial Team:  ",
3414                                    thread->th.th_serial_team);
3415         __kmp_printf("    Threads:      %2d\n", thread->th.th_team_nproc);
3416         __kmp_print_structure_thread("    Master:       ",
3417                                      thread->th.th_team_master);
3418         __kmp_printf("    Serialized?:  %2d\n", thread->th.th_team_serialized);
3419         __kmp_printf("    Set NProc:    %2d\n", thread->th.th_set_nproc);
3420 #if OMP_40_ENABLED
3421         __kmp_printf("    Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3422 #endif
3423         __kmp_print_structure_thread("    Next in pool: ",
3424                                      thread->th.th_next_pool);
3425         __kmp_printf("\n");
3426         __kmp_print_structure_team_accum(list, thread->th.th_team);
3427         __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3428       }
3429     }
3430   } else {
3431     __kmp_printf("Threads array is not allocated.\n");
3432   }
3433 
3434   // Print out __kmp_root array.
3435   __kmp_printf("\n------------------------------\nUbers\n----------------------"
3436                "--------\n");
3437   if (__kmp_root != NULL) {
3438     int gtid;
3439     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3440       kmp_root_t const *root = __kmp_root[gtid];
3441       if (root != NULL) {
3442         __kmp_printf("GTID %2d %p:\n", gtid, root);
3443         __kmp_print_structure_team("    Root Team:    ", root->r.r_root_team);
3444         __kmp_print_structure_team("    Hot Team:     ", root->r.r_hot_team);
3445         __kmp_print_structure_thread("    Uber Thread:  ",
3446                                      root->r.r_uber_thread);
3447         __kmp_printf("    Active?:      %2d\n", root->r.r_active);
3448         __kmp_printf("    Nested?:      %2d\n", root->r.r_nested);
3449         __kmp_printf("    In Parallel:  %2d\n",
3450                      KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3451         __kmp_printf("\n");
3452         __kmp_print_structure_team_accum(list, root->r.r_root_team);
3453         __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3454       }
3455     }
3456   } else {
3457     __kmp_printf("Ubers array is not allocated.\n");
3458   }
3459 
3460   __kmp_printf("\n------------------------------\nTeams\n----------------------"
3461                "--------\n");
3462   while (list->next != NULL) {
3463     kmp_team_p const *team = list->entry;
3464     int i;
3465     __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3466     __kmp_print_structure_team("    Parent Team:      ", team->t.t_parent);
3467     __kmp_printf("    Master TID:       %2d\n", team->t.t_master_tid);
3468     __kmp_printf("    Max threads:      %2d\n", team->t.t_max_nproc);
3469     __kmp_printf("    Levels of serial: %2d\n", team->t.t_serialized);
3470     __kmp_printf("    Number threads:   %2d\n", team->t.t_nproc);
3471     for (i = 0; i < team->t.t_nproc; ++i) {
3472       __kmp_printf("    Thread %2d:      ", i);
3473       __kmp_print_structure_thread("", team->t.t_threads[i]);
3474     }
3475     __kmp_print_structure_team("    Next in pool:     ", team->t.t_next_pool);
3476     __kmp_printf("\n");
3477     list = list->next;
3478   }
3479 
3480   // Print out __kmp_thread_pool and __kmp_team_pool.
3481   __kmp_printf("\n------------------------------\nPools\n----------------------"
3482                "--------\n");
3483   __kmp_print_structure_thread("Thread pool:          ",
3484                                CCAST(kmp_info_t *, __kmp_thread_pool));
3485   __kmp_print_structure_team("Team pool:            ",
3486                              CCAST(kmp_team_t *, __kmp_team_pool));
3487   __kmp_printf("\n");
3488 
3489   // Free team list.
3490   while (list != NULL) {
3491     kmp_team_list_item_t *item = list;
3492     list = list->next;
3493     KMP_INTERNAL_FREE(item);
3494   }
3495 }
3496 
3497 #endif
3498 
3499 //---------------------------------------------------------------------------
3500 //  Stuff for per-thread fast random number generator
3501 //  Table of primes
3502 static const unsigned __kmp_primes[] = {
3503     0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3504     0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3505     0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3506     0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3507     0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3508     0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3509     0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3510     0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3511     0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3512     0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3513     0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3514 
3515 //---------------------------------------------------------------------------
3516 //  __kmp_get_random: Get a random number using a linear congruential method.
3517 unsigned short __kmp_get_random(kmp_info_t *thread) {
3518   unsigned x = thread->th.th_x;
3519   unsigned short r = x >> 16;
3520 
3521   thread->th.th_x = x * thread->th.th_a + 1;
3522 
3523   KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3524                 thread->th.th_info.ds.ds_tid, r));
3525 
3526   return r;
3527 }
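
// Illustrative note for __kmp_get_random() above: the per-thread state follows
// the linear congruential recurrence
//   x_{n+1} = a * x_n + 1   (mod 2^32, via unsigned overflow)
// where a is the thread's multiplier picked from __kmp_primes in
// __kmp_init_random(), and the value returned is the upper 16 bits of the
// state *before* the update.
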
3528 //--------------------------------------------------------
3529 // __kmp_init_random: Initialize a random number generator
3530 void __kmp_init_random(kmp_info_t *thread) {
3531   unsigned seed = thread->th.th_info.ds.ds_tid;
3532 
3533   thread->th.th_a =
3534       __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3535   thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3536   KA_TRACE(30,
3537            ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3538 }
3539 
3540 #if KMP_OS_WINDOWS
3541 /* reclaim array entries for root threads that are already dead, returns number
3542  * reclaimed */
3543 static int __kmp_reclaim_dead_roots(void) {
3544   int i, r = 0;
3545 
3546   for (i = 0; i < __kmp_threads_capacity; ++i) {
3547     if (KMP_UBER_GTID(i) &&
3548         !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3549         !__kmp_root[i]
3550              ->r.r_active) { // AC: reclaim only roots died in non-active state
3551       r += __kmp_unregister_root_other_thread(i);
3552     }
3553   }
3554   return r;
3555 }
3556 #endif
3557 
3558 /* This function attempts to create free entries in __kmp_threads and
3559    __kmp_root, and returns the number of free entries generated.
3560 
3561    For Windows* OS static library, the first mechanism used is to reclaim array
3562    entries for root threads that are already dead.
3563 
   On all platforms, expansion is attempted on the arrays __kmp_threads and
   __kmp_root, with an appropriate update to __kmp_threads_capacity. Array
3566    capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3567    threadprivate cache array has been created. Synchronization with
3568    __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3569 
3570    After any dead root reclamation, if the clipping value allows array expansion
3571    to result in the generation of a total of nNeed free slots, the function does
3572    that expansion. If not, nothing is done beyond the possible initial root
3573    thread reclamation.
3574 
3575    If any argument is negative, the behavior is undefined. */
3576 static int __kmp_expand_threads(int nNeed) {
3577   int added = 0;
3578   int minimumRequiredCapacity;
3579   int newCapacity;
3580   kmp_info_t **newThreads;
3581   kmp_root_t **newRoot;
3582 
3583 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3584 // resizing __kmp_threads does not need additional protection if foreign
3585 // threads are present
3586 
3587 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3588   /* only for Windows static library */
3589   /* reclaim array entries for root threads that are already dead */
3590   added = __kmp_reclaim_dead_roots();
3591 
3592   if (nNeed) {
3593     nNeed -= added;
3594     if (nNeed < 0)
3595       nNeed = 0;
3596   }
3597 #endif
3598   if (nNeed <= 0)
3599     return added;
3600 
3601   // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3602   // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3603   // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3604   // > __kmp_max_nth in one of two ways:
3605   //
3606   // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
  //    may not be reused by another thread, so we may need to increase
3608   //    __kmp_threads_capacity to __kmp_max_nth + 1.
3609   //
3610   // 2) New foreign root(s) are encountered.  We always register new foreign
3611   //    roots. This may cause a smaller # of threads to be allocated at
3612   //    subsequent parallel regions, but the worker threads hang around (and
3613   //    eventually go to sleep) and need slots in the __kmp_threads[] array.
3614   //
3615   // Anyway, that is the reason for moving the check to see if
3616   // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3617   // instead of having it performed here. -BB
3618 
3619   KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3620 
3621   /* compute expansion headroom to check if we can expand */
3622   if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3623     /* possible expansion too small -- give up */
3624     return added;
3625   }
3626   minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3627 
3628   newCapacity = __kmp_threads_capacity;
3629   do {
3630     newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3631                                                           : __kmp_sys_max_nth;
3632   } while (newCapacity < minimumRequiredCapacity);
3633   newThreads = (kmp_info_t **)__kmp_allocate(
3634       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3635   newRoot =
3636       (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3637   KMP_MEMCPY(newThreads, __kmp_threads,
3638              __kmp_threads_capacity * sizeof(kmp_info_t *));
3639   KMP_MEMCPY(newRoot, __kmp_root,
3640              __kmp_threads_capacity * sizeof(kmp_root_t *));
3641 
3642   kmp_info_t **temp_threads = __kmp_threads;
3643   *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3644   *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3645   __kmp_free(temp_threads);
3646   added += newCapacity - __kmp_threads_capacity;
3647   *(volatile int *)&__kmp_threads_capacity = newCapacity;
3648 
3649   if (newCapacity > __kmp_tp_capacity) {
3650     __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3651     if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3652       __kmp_threadprivate_resize_cache(newCapacity);
3653     } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3654       *(volatile int *)&__kmp_tp_capacity = newCapacity;
3655     }
3656     __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3657   }
3658 
3659   return added;
3660 }
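
// Illustrative note for __kmp_expand_threads() above (numbers are an assumed
// example, not fixed constants): with a current capacity of 64 and nNeed == 70
// the loop doubles 64 -> 128 -> 256 and stops at 256, the first doubled value
// at or above 64 + 70 = 134; __kmp_threads and __kmp_root are then kept in a
// single reallocated block, with the root pointer array laid out right after
// the thread pointer array.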
3661 
3662 /* Register the current thread as a root thread and obtain our gtid. We must
3663    have the __kmp_initz_lock held at this point. Argument TRUE only if are the
3664    thread that calls from __kmp_do_serial_initialize() */
3665 int __kmp_register_root(int initial_thread) {
3666   kmp_info_t *root_thread;
3667   kmp_root_t *root;
3668   int gtid;
3669   int capacity;
3670   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3671   KA_TRACE(20, ("__kmp_register_root: entered\n"));
3672   KMP_MB();
3673 
3674   /* 2007-03-02:
     If the initial thread did not invoke the OpenMP RTL yet, and this thread
     is not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity"
     condition does not work as expected -- it may return false (meaning there
     is at least one empty slot in the __kmp_threads array), but it is
     possible that the only free slot is #0, which is reserved for the initial
     thread and therefore cannot be used for this one. The following code
     works around this bug.

     However, the right solution seems to be not to reserve slot #0 for the
     initial thread, because:
     (1) there is no magic in slot #0,
     (2) we cannot detect the initial thread reliably (the first thread that
        performs serial initialization may not be a real initial thread).
  */
3688   capacity = __kmp_threads_capacity;
3689   if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3690     --capacity;
3691   }
3692 
3693   /* see if there are too many threads */
3694   if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3695     if (__kmp_tp_cached) {
3696       __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3697                   KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3698                   KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3699     } else {
3700       __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3701                   __kmp_msg_null);
3702     }
3703   }
3704 
3705   /* find an available thread slot */
3706   /* Don't reassign the zero slot since we need that to only be used by initial
3707      thread */
3708   for (gtid = (initial_thread ? 0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL;
3709        gtid++)
3710     ;
3711   KA_TRACE(1,
3712            ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3713   KMP_ASSERT(gtid < __kmp_threads_capacity);
3714 
3715   /* update global accounting */
3716   __kmp_all_nth++;
3717   TCW_4(__kmp_nth, __kmp_nth + 1);
3718 
3719   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3720   // numbers of procs, and method #2 (keyed API call) for higher numbers.
3721   if (__kmp_adjust_gtid_mode) {
3722     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3723       if (TCR_4(__kmp_gtid_mode) != 2) {
3724         TCW_4(__kmp_gtid_mode, 2);
3725       }
3726     } else {
3727       if (TCR_4(__kmp_gtid_mode) != 1) {
3728         TCW_4(__kmp_gtid_mode, 1);
3729       }
3730     }
3731   }
3732 
3733 #ifdef KMP_ADJUST_BLOCKTIME
3734   /* Adjust blocktime to zero if necessary            */
3735   /* Middle initialization might not have occurred yet */
3736   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3737     if (__kmp_nth > __kmp_avail_proc) {
3738       __kmp_zero_bt = TRUE;
3739     }
3740   }
3741 #endif /* KMP_ADJUST_BLOCKTIME */
3742 
3743   /* setup this new hierarchy */
3744   if (!(root = __kmp_root[gtid])) {
3745     root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3746     KMP_DEBUG_ASSERT(!root->r.r_root_team);
3747   }
3748 
3749 #if KMP_STATS_ENABLED
3750   // Initialize stats as soon as possible (right after gtid assignment).
3751   __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3752   __kmp_stats_thread_ptr->startLife();
3753   KMP_SET_THREAD_STATE(SERIAL_REGION);
3754   KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3755 #endif
3756   __kmp_initialize_root(root);
3757 
3758   /* setup new root thread structure */
3759   if (root->r.r_uber_thread) {
3760     root_thread = root->r.r_uber_thread;
3761   } else {
3762     root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3763     if (__kmp_storage_map) {
3764       __kmp_print_thread_storage_map(root_thread, gtid);
3765     }
3766     root_thread->th.th_info.ds.ds_gtid = gtid;
3767 #if OMPT_SUPPORT
3768     root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3769 #endif
3770     root_thread->th.th_root = root;
3771     if (__kmp_env_consistency_check) {
3772       root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3773     }
3774 #if USE_FAST_MEMORY
3775     __kmp_initialize_fast_memory(root_thread);
3776 #endif /* USE_FAST_MEMORY */
3777 
3778 #if KMP_USE_BGET
3779     KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3780     __kmp_initialize_bget(root_thread);
3781 #endif
3782     __kmp_init_random(root_thread); // Initialize random number generator
3783   }
3784 
3785   /* setup the serial team held in reserve by the root thread */
3786   if (!root_thread->th.th_serial_team) {
3787     kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3788     KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3789     root_thread->th.th_serial_team =
3790         __kmp_allocate_team(root, 1, 1,
3791 #if OMPT_SUPPORT
3792                             ompt_data_none, // root parallel id
3793 #endif
3794 #if OMP_40_ENABLED
3795                             proc_bind_default,
3796 #endif
3797                             &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3798   }
3799   KMP_ASSERT(root_thread->th.th_serial_team);
3800   KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3801                 root_thread->th.th_serial_team));
3802 
3803   /* drop root_thread into place */
3804   TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3805 
3806   root->r.r_root_team->t.t_threads[0] = root_thread;
3807   root->r.r_hot_team->t.t_threads[0] = root_thread;
3808   root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
  // AC: the team is created in reserve, not for execution (it is unused for
  // now).
3810   root_thread->th.th_serial_team->t.t_serialized = 0;
3811   root->r.r_uber_thread = root_thread;
3812 
3813   /* initialize the thread, get it ready to go */
3814   __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3815   TCW_4(__kmp_init_gtid, TRUE);
3816 
3817   /* prepare the master thread for get_gtid() */
3818   __kmp_gtid_set_specific(gtid);
3819 
3820 #if USE_ITT_BUILD
3821   __kmp_itt_thread_name(gtid);
3822 #endif /* USE_ITT_BUILD */
3823 
3824 #ifdef KMP_TDATA_GTID
3825   __kmp_gtid = gtid;
3826 #endif
3827   __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3828   KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3829 
3830   KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3831                 "plain=%u\n",
3832                 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3833                 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3834                 KMP_INIT_BARRIER_STATE));
3835   { // Initialize barrier data.
3836     int b;
3837     for (b = 0; b < bs_last_barrier; ++b) {
3838       root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3839 #if USE_DEBUGGER
3840       root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3841 #endif
3842     }
3843   }
3844   KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3845                    KMP_INIT_BARRIER_STATE);
3846 
3847 #if KMP_AFFINITY_SUPPORTED
3848 #if OMP_40_ENABLED
3849   root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3850   root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3851   root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3852   root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3853 #endif
3854   if (TCR_4(__kmp_init_middle)) {
3855     __kmp_affinity_set_init_mask(gtid, TRUE);
3856   }
3857 #endif /* KMP_AFFINITY_SUPPORTED */
3858 #if OMP_50_ENABLED
3859   root_thread->th.th_def_allocator = __kmp_def_allocator;
3860   root_thread->th.th_prev_level = 0;
3861   root_thread->th.th_prev_num_threads = 1;
3862 #endif
3863 
3864   __kmp_root_counter++;
3865 
3866 #if OMPT_SUPPORT
3867   if (!initial_thread && ompt_enabled.enabled) {
3868 
3869     kmp_info_t *root_thread = ompt_get_thread();
3870 
3871     ompt_set_thread_state(root_thread, ompt_state_overhead);
3872 
3873     if (ompt_enabled.ompt_callback_thread_begin) {
3874       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3875           ompt_thread_initial, __ompt_get_thread_data_internal());
3876     }
3877     ompt_data_t *task_data;
3878     __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
3879     if (ompt_enabled.ompt_callback_task_create) {
3880       ompt_callbacks.ompt_callback(ompt_callback_task_create)(
3881           NULL, NULL, task_data, ompt_task_initial, 0, NULL);
3882       // initial task has nothing to return to
3883     }
3884 
3885     ompt_set_thread_state(root_thread, ompt_state_work_serial);
3886   }
3887 #endif
3888 
3889   KMP_MB();
3890   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3891 
3892   return gtid;
3893 }
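
// Illustrative note for __kmp_register_root() above: slot 0 of __kmp_threads
// is reserved for the initial thread, so a foreign thread that registers as a
// root is given the first free gtid >= 1; only the caller coming from
// __kmp_do_serial_initialize() (initial_thread == TRUE) may claim slot 0.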
3894 
3895 #if KMP_NESTED_HOT_TEAMS
3896 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3897                                 const int max_level) {
3898   int i, n, nth;
3899   kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3900   if (!hot_teams || !hot_teams[level].hot_team) {
3901     return 0;
3902   }
3903   KMP_DEBUG_ASSERT(level < max_level);
3904   kmp_team_t *team = hot_teams[level].hot_team;
3905   nth = hot_teams[level].hot_team_nth;
3906   n = nth - 1; // master is not freed
3907   if (level < max_level - 1) {
3908     for (i = 0; i < nth; ++i) {
3909       kmp_info_t *th = team->t.t_threads[i];
3910       n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3911       if (i > 0 && th->th.th_hot_teams) {
3912         __kmp_free(th->th.th_hot_teams);
3913         th->th.th_hot_teams = NULL;
3914       }
3915     }
3916   }
3917   __kmp_free_team(root, team, NULL);
3918   return n;
3919 }
3920 #endif
3921 
// Resets a root thread and clears its root and hot teams.
3923 // Returns the number of __kmp_threads entries directly and indirectly freed.
3924 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3925   kmp_team_t *root_team = root->r.r_root_team;
3926   kmp_team_t *hot_team = root->r.r_hot_team;
3927   int n = hot_team->t.t_nproc;
3928   int i;
3929 
3930   KMP_DEBUG_ASSERT(!root->r.r_active);
3931 
3932   root->r.r_root_team = NULL;
3933   root->r.r_hot_team = NULL;
3934   // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3935   // before call to __kmp_free_team().
3936   __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3937 #if KMP_NESTED_HOT_TEAMS
3938   if (__kmp_hot_teams_max_level >
3939       0) { // need to free nested hot teams and their threads if any
3940     for (i = 0; i < hot_team->t.t_nproc; ++i) {
3941       kmp_info_t *th = hot_team->t.t_threads[i];
3942       if (__kmp_hot_teams_max_level > 1) {
3943         n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3944       }
3945       if (th->th.th_hot_teams) {
3946         __kmp_free(th->th.th_hot_teams);
3947         th->th.th_hot_teams = NULL;
3948       }
3949     }
3950   }
3951 #endif
3952   __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3953 
3954   // Before we can reap the thread, we need to make certain that all other
3955   // threads in the teams that had this root as ancestor have stopped trying to
3956   // steal tasks.
3957   if (__kmp_tasking_mode != tskm_immediate_exec) {
3958     __kmp_wait_to_unref_task_teams();
3959   }
3960 
3961 #if KMP_OS_WINDOWS
3962   /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3963   KA_TRACE(
3964       10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3965            "\n",
3966            (LPVOID) & (root->r.r_uber_thread->th),
3967            root->r.r_uber_thread->th.th_info.ds.ds_thread));
3968   __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3969 #endif /* KMP_OS_WINDOWS */
3970 
3971 #if OMPT_SUPPORT
3972   if (ompt_enabled.ompt_callback_thread_end) {
3973     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3974         &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3975   }
3976 #endif
3977 
3978   TCW_4(__kmp_nth,
3979         __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3980   root->r.r_cg_nthreads--;
3981 
3982   __kmp_reap_thread(root->r.r_uber_thread, 1);
3983 
  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
  // it instead of freeing it.
3986   root->r.r_uber_thread = NULL;
3987   /* mark root as no longer in use */
3988   root->r.r_begin = FALSE;
3989 
3990   return n;
3991 }
3992 
3993 void __kmp_unregister_root_current_thread(int gtid) {
3994   KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
3995   /* this lock should be ok, since unregister_root_current_thread is never
3996      called during an abort, only during a normal close. furthermore, if you
3997      have the forkjoin lock, you should never try to get the initz lock */
3998   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3999   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
4000     KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
4001                   "exiting T#%d\n",
4002                   gtid));
4003     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4004     return;
4005   }
4006   kmp_root_t *root = __kmp_root[gtid];
4007 
4008   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4009   KMP_ASSERT(KMP_UBER_GTID(gtid));
4010   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4011   KMP_ASSERT(root->r.r_active == FALSE);
4012 
4013   KMP_MB();
4014 
4015 #if OMP_45_ENABLED
4016   kmp_info_t *thread = __kmp_threads[gtid];
4017   kmp_team_t *team = thread->th.th_team;
4018   kmp_task_team_t *task_team = thread->th.th_task_team;
4019 
4020   // we need to wait for the proxy tasks before finishing the thread
4021   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
4022 #if OMPT_SUPPORT
4023     // the runtime is shutting down so we won't report any events
4024     thread->th.ompt_thread_info.state = ompt_state_undefined;
4025 #endif
4026     __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4027   }
4028 #endif
4029 
4030   __kmp_reset_root(gtid, root);
4031 
4032   /* free up this thread slot */
4033   __kmp_gtid_set_specific(KMP_GTID_DNE);
4034 #ifdef KMP_TDATA_GTID
4035   __kmp_gtid = KMP_GTID_DNE;
4036 #endif
4037 
4038   KMP_MB();
4039   KC_TRACE(10,
4040            ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4041 
4042   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4043 }
4044 
4045 #if KMP_OS_WINDOWS
4046 /* __kmp_forkjoin_lock must be already held
4047    Unregisters a root thread that is not the current thread.  Returns the number
4048    of __kmp_threads entries freed as a result. */
4049 static int __kmp_unregister_root_other_thread(int gtid) {
4050   kmp_root_t *root = __kmp_root[gtid];
4051   int r;
4052 
4053   KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4054   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4055   KMP_ASSERT(KMP_UBER_GTID(gtid));
4056   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4057   KMP_ASSERT(root->r.r_active == FALSE);
4058 
4059   r = __kmp_reset_root(gtid, root);
4060   KC_TRACE(10,
4061            ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4062   return r;
4063 }
4064 #endif
4065 
4066 #if KMP_DEBUG
4067 void __kmp_task_info() {
4068 
4069   kmp_int32 gtid = __kmp_entry_gtid();
4070   kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4071   kmp_info_t *this_thr = __kmp_threads[gtid];
4072   kmp_team_t *steam = this_thr->th.th_serial_team;
4073   kmp_team_t *team = this_thr->th.th_team;
4074 
4075   __kmp_printf(
4076       "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4077       "ptask=%p\n",
4078       gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4079       team->t.t_implicit_task_taskdata[tid].td_parent);
4080 }
4081 #endif // KMP_DEBUG
4082 
4083 /* TODO optimize with one big memclr, take out what isn't needed, split
4084    responsibility to workers as much as possible, and delay initialization of
4085    features as much as possible  */
4086 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4087                                   int tid, int gtid) {
4088   /* this_thr->th.th_info.ds.ds_gtid is setup in
4089      kmp_allocate_thread/create_worker.
4090      this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4091   kmp_info_t *master = team->t.t_threads[0];
4092   KMP_DEBUG_ASSERT(this_thr != NULL);
4093   KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4094   KMP_DEBUG_ASSERT(team);
4095   KMP_DEBUG_ASSERT(team->t.t_threads);
4096   KMP_DEBUG_ASSERT(team->t.t_dispatch);
4097   KMP_DEBUG_ASSERT(master);
4098   KMP_DEBUG_ASSERT(master->th.th_root);
4099 
4100   KMP_MB();
4101 
4102   TCW_SYNC_PTR(this_thr->th.th_team, team);
4103 
4104   this_thr->th.th_info.ds.ds_tid = tid;
4105   this_thr->th.th_set_nproc = 0;
4106   if (__kmp_tasking_mode != tskm_immediate_exec)
4107     // When tasking is possible, threads are not safe to reap until they are
4108     // done tasking; this will be set when tasking code is exited in wait
4109     this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4110   else // no tasking --> always safe to reap
4111     this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4112 #if OMP_40_ENABLED
4113   this_thr->th.th_set_proc_bind = proc_bind_default;
4114 #if KMP_AFFINITY_SUPPORTED
4115   this_thr->th.th_new_place = this_thr->th.th_current_place;
4116 #endif
4117 #endif
4118   this_thr->th.th_root = master->th.th_root;
4119 
4120   /* setup the thread's cache of the team structure */
4121   this_thr->th.th_team_nproc = team->t.t_nproc;
4122   this_thr->th.th_team_master = master;
4123   this_thr->th.th_team_serialized = team->t.t_serialized;
4124   TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4125 
4126   KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4127 
4128   KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4129                 tid, gtid, this_thr, this_thr->th.th_current_task));
4130 
4131   __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4132                            team, tid, TRUE);
4133 
4134   KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4135                 tid, gtid, this_thr, this_thr->th.th_current_task));
4136   // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4137   // __kmp_initialize_team()?
4138 
4139   /* TODO no worksharing in speculative threads */
4140   this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4141 
4142   this_thr->th.th_local.this_construct = 0;
4143 
4144   if (!this_thr->th.th_pri_common) {
4145     this_thr->th.th_pri_common =
4146         (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4147     if (__kmp_storage_map) {
4148       __kmp_print_storage_map_gtid(
4149           gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4150           sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4151     }
4152     this_thr->th.th_pri_head = NULL;
4153   }
4154 
4155   /* Initialize dynamic dispatch */
4156   {
4157     volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4158     // Use team max_nproc since this will never change for the team.
4159     size_t disp_size =
4160         sizeof(dispatch_private_info_t) *
4161         (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4162     KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4163                   team->t.t_max_nproc));
4164     KMP_ASSERT(dispatch);
4165     KMP_DEBUG_ASSERT(team->t.t_dispatch);
4166     KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4167 
4168     dispatch->th_disp_index = 0;
4169 #if OMP_45_ENABLED
4170     dispatch->th_doacross_buf_idx = 0;
4171 #endif
4172     if (!dispatch->th_disp_buffer) {
4173       dispatch->th_disp_buffer =
4174           (dispatch_private_info_t *)__kmp_allocate(disp_size);
4175 
4176       if (__kmp_storage_map) {
4177         __kmp_print_storage_map_gtid(
4178             gtid, &dispatch->th_disp_buffer[0],
4179             &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4180                                           ? 1
4181                                           : __kmp_dispatch_num_buffers],
4182             disp_size, "th_%d.th_dispatch.th_disp_buffer "
4183                        "(team_%d.t_dispatch[%d].th_disp_buffer)",
4184             gtid, team->t.t_id, gtid);
4185       }
4186     } else {
4187       memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4188     }
4189 
4190     dispatch->th_dispatch_pr_current = 0;
4191     dispatch->th_dispatch_sh_current = 0;
4192 
4193     dispatch->th_deo_fcn = 0; /* ORDERED     */
4194     dispatch->th_dxo_fcn = 0; /* END ORDERED */
4195   }
4196 
4197   this_thr->th.th_next_pool = NULL;
4198 
4199   if (!this_thr->th.th_task_state_memo_stack) {
4200     size_t i;
4201     this_thr->th.th_task_state_memo_stack =
4202         (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4203     this_thr->th.th_task_state_top = 0;
4204     this_thr->th.th_task_state_stack_sz = 4;
4205     for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4206          ++i) // zero init the stack
4207       this_thr->th.th_task_state_memo_stack[i] = 0;
4208   }
4209 
4210   KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4211   KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4212 
4213   KMP_MB();
4214 }
4215 
4216 /* Allocate a new thread for the requesting team. This is only called from
4217    within a fork/join critical section. We first try to get an available
4218    thread from the thread pool; if none is available, we fork a new one,
4219    assuming we are able to create one. This should be assured, as the caller
4220    is expected to check on this first. */
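/* Illustrative call pattern (a sketch for orientation only, not code that the
   runtime executes from here): the hot-team growth path in __kmp_allocate_team()
   below allocates one worker per missing slot, roughly

     for (f = team->t.t_nproc; f < new_nproc; f++)
       team->t.t_threads[f] = __kmp_allocate_thread(root, team, f);

   with the capacity check (__kmp_all_nth < __kmp_threads_capacity) expected to
   have been done by the caller, as the assertions below require. */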
4221 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4222                                   int new_tid) {
4223   kmp_team_t *serial_team;
4224   kmp_info_t *new_thr;
4225   int new_gtid;
4226 
4227   KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4228   KMP_DEBUG_ASSERT(root && team);
4229 #if !KMP_NESTED_HOT_TEAMS
4230   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4231 #endif
4232   KMP_MB();
4233 
4234   /* first, try to get one from the thread pool */
4235   if (__kmp_thread_pool) {
4236 
4237     new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4238     __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4239     if (new_thr == __kmp_thread_pool_insert_pt) {
4240       __kmp_thread_pool_insert_pt = NULL;
4241     }
4242     TCW_4(new_thr->th.th_in_pool, FALSE);
4243     // Don't touch th_active_in_pool or th_active.
4244     // The worker thread adjusts those flags as it sleeps/awakens.
4245     __kmp_thread_pool_nth--;
4246 
4247     KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4248                   __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4249     KMP_ASSERT(!new_thr->th.th_team);
4250     KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4251     KMP_DEBUG_ASSERT(__kmp_thread_pool_nth >= 0);
4252 
4253     /* setup the thread structure */
4254     __kmp_initialize_info(new_thr, team, new_tid,
4255                           new_thr->th.th_info.ds.ds_gtid);
4256     KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4257 
4258     TCW_4(__kmp_nth, __kmp_nth + 1);
4259     root->r.r_cg_nthreads++;
4260 
4261     new_thr->th.th_task_state = 0;
4262     new_thr->th.th_task_state_top = 0;
4263     new_thr->th.th_task_state_stack_sz = 4;
4264 
4265 #ifdef KMP_ADJUST_BLOCKTIME
4266     /* Adjust blocktime back to zero if necessary */
4267     /* Middle initialization might not have occurred yet */
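    /* (Descriptive note, assuming the usual KMP_BLOCKTIME semantics: once the
       machine is oversubscribed -- __kmp_nth > __kmp_avail_proc -- the check
       below forces the effective blocktime to zero so idle threads stop
       spin-waiting.) */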
4268     if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4269       if (__kmp_nth > __kmp_avail_proc) {
4270         __kmp_zero_bt = TRUE;
4271       }
4272     }
4273 #endif /* KMP_ADJUST_BLOCKTIME */
4274 
4275 #if KMP_DEBUG
4276     // If the thread entered the pool via __kmp_free_thread, its wait_flag
4277     // should not be KMP_BARRIER_PARENT_FLAG.
4278     int b;
4279     kmp_balign_t *balign = new_thr->th.th_bar;
4280     for (b = 0; b < bs_last_barrier; ++b)
4281       KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4282 #endif
4283 
4284     KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4285                   __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4286 
4287     KMP_MB();
4288     return new_thr;
4289   }
4290 
4291   /* no, we'll fork a new one */
4292   KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4293   KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4294 
4295 #if KMP_USE_MONITOR
4296   // If this is the first worker thread the RTL is creating, then also
4297   // launch the monitor thread.  We try to do this as early as possible.
4298   if (!TCR_4(__kmp_init_monitor)) {
4299     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4300     if (!TCR_4(__kmp_init_monitor)) {
4301       KF_TRACE(10, ("before __kmp_create_monitor\n"));
4302       TCW_4(__kmp_init_monitor, 1);
4303       __kmp_create_monitor(&__kmp_monitor);
4304       KF_TRACE(10, ("after __kmp_create_monitor\n"));
4305 #if KMP_OS_WINDOWS
4306       // AC: wait until the monitor has started. This is a fix for CQ232808.
4307       // The reason is that if the library is loaded/unloaded in a loop with
4308       // small (parallel) work in between, then there is a high probability
4309       // that the monitor thread starts only after the library shutdown. At
4310       // shutdown it is too late to cope with the problem, because when the
4311       // master is in DllMain (process detach) the monitor has no chance to
4312       // start (it is blocked), and the master has no means to inform the
4313       // monitor that the library has gone, because all the memory the monitor
4314       // can access is going to be released/reset.
4315       while (TCR_4(__kmp_init_monitor) < 2) {
4316         KMP_YIELD(TRUE);
4317       }
4318       KF_TRACE(10, ("after monitor thread has started\n"));
4319 #endif
4320     }
4321     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4322   }
4323 #endif
4324 
4325   KMP_MB();
4326   for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) {
4327     KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4328   }
4329 
4330   /* allocate space for it. */
4331   new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4332 
4333   TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4334 
4335   if (__kmp_storage_map) {
4336     __kmp_print_thread_storage_map(new_thr, new_gtid);
4337   }
4338 
4339   // add the reserve serialized team, initialized from the team's master thread
4340   {
4341     kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4342     KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4343     new_thr->th.th_serial_team = serial_team =
4344         (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4345 #if OMPT_SUPPORT
4346                                           ompt_data_none, // root parallel id
4347 #endif
4348 #if OMP_40_ENABLED
4349                                           proc_bind_default,
4350 #endif
4351                                           &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
4352   }
4353   KMP_ASSERT(serial_team);
4354   serial_team->t.t_serialized = 0; // AC: the team is created in reserve,
4355   // not for execution (it is unused for now).
4356   serial_team->t.t_threads[0] = new_thr;
4357   KF_TRACE(10,
4358            ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4359             new_thr));
4360 
4361   /* setup the thread structures */
4362   __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4363 
4364 #if USE_FAST_MEMORY
4365   __kmp_initialize_fast_memory(new_thr);
4366 #endif /* USE_FAST_MEMORY */
4367 
4368 #if KMP_USE_BGET
4369   KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4370   __kmp_initialize_bget(new_thr);
4371 #endif
4372 
4373   __kmp_init_random(new_thr); // Initialize random number generator
4374 
4375   /* Initialize these only once when thread is grabbed for a team allocation */
4376   KA_TRACE(20,
4377            ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4378             __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4379 
4380   int b;
4381   kmp_balign_t *balign = new_thr->th.th_bar;
4382   for (b = 0; b < bs_last_barrier; ++b) {
4383     balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4384     balign[b].bb.team = NULL;
4385     balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4386     balign[b].bb.use_oncore_barrier = 0;
4387   }
4388 
4389   new_thr->th.th_spin_here = FALSE;
4390   new_thr->th.th_next_waiting = 0;
4391 #if KMP_OS_UNIX
4392   new_thr->th.th_blocking = false;
4393 #endif
4394 
4395 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4396   new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4397   new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4398   new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4399   new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4400 #endif
4401 #if OMP_50_ENABLED
4402   new_thr->th.th_def_allocator = __kmp_def_allocator;
4403   new_thr->th.th_prev_level = 0;
4404   new_thr->th.th_prev_num_threads = 1;
4405 #endif
4406 
4407   TCW_4(new_thr->th.th_in_pool, FALSE);
4408   new_thr->th.th_active_in_pool = FALSE;
4409   TCW_4(new_thr->th.th_active, TRUE);
4410 
4411   /* adjust the global counters */
4412   __kmp_all_nth++;
4413   __kmp_nth++;
4414 
4415   root->r.r_cg_nthreads++;
4416 
4417   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4418   // numbers of procs, and method #2 (keyed API call) for higher numbers.
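  // For example (illustrative threshold value only): if __kmp_tls_gtid_min were
  // 20, creating the 20th thread would switch __kmp_gtid_mode to 2; since this
  // check runs only on the thread-allocation path, the mode is re-evaluated the
  // next time a thread is forked.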
4419   if (__kmp_adjust_gtid_mode) {
4420     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4421       if (TCR_4(__kmp_gtid_mode) != 2) {
4422         TCW_4(__kmp_gtid_mode, 2);
4423       }
4424     } else {
4425       if (TCR_4(__kmp_gtid_mode) != 1) {
4426         TCW_4(__kmp_gtid_mode, 1);
4427       }
4428     }
4429   }
4430 
4431 #ifdef KMP_ADJUST_BLOCKTIME
4432   /* Adjust blocktime back to zero if necessary       */
4433   /* Middle initialization might not have occurred yet */
4434   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4435     if (__kmp_nth > __kmp_avail_proc) {
4436       __kmp_zero_bt = TRUE;
4437     }
4438   }
4439 #endif /* KMP_ADJUST_BLOCKTIME */
4440 
4441   /* actually fork it and create the new worker thread */
4442   KF_TRACE(
4443       10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4444   __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4445   KF_TRACE(10,
4446            ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4447 
4448   KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4449                 new_gtid));
4450   KMP_MB();
4451   return new_thr;
4452 }
4453 
4454 /* Reinitialize team for reuse.
4455    The hot team code calls this routine at every fork barrier, so the EPCC
4456    barrier tests are extremely sensitive to changes in it, especially writes
4457    to the team struct, which cause a cache invalidation in all threads.
4458    IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4459 static void __kmp_reinitialize_team(kmp_team_t *team,
4460                                     kmp_internal_control_t *new_icvs,
4461                                     ident_t *loc) {
4462   KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4463                 team->t.t_threads[0], team));
4464   KMP_DEBUG_ASSERT(team && new_icvs);
4465   KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4466   KMP_CHECK_UPDATE(team->t.t_ident, loc);
4467 
4468   KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4469   // Copy ICVs to the master thread's implicit taskdata
4470   __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4471   copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4472 
4473   KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4474                 team->t.t_threads[0], team));
4475 }
4476 
4477 /* Initialize the team data structure.
4478    This assumes the t_threads and t_max_nproc are already set.
4479    Also, we don't touch the arguments */
4480 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4481                                   kmp_internal_control_t *new_icvs,
4482                                   ident_t *loc) {
4483   KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4484 
4485   /* verify */
4486   KMP_DEBUG_ASSERT(team);
4487   KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4488   KMP_DEBUG_ASSERT(team->t.t_threads);
4489   KMP_MB();
4490 
4491   team->t.t_master_tid = 0; /* not needed */
4492   /* team->t.t_master_bar;        not needed */
4493   team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4494   team->t.t_nproc = new_nproc;
4495 
4496   /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
4497   team->t.t_next_pool = NULL;
4498   /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4499    * up hot team */
4500 
4501   TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4502   team->t.t_invoke = NULL; /* not needed */
4503 
4504   // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
4505   team->t.t_sched.sched = new_icvs->sched.sched;
4506 
4507 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4508   team->t.t_fp_control_saved = FALSE; /* not needed */
4509   team->t.t_x87_fpu_control_word = 0; /* not needed */
4510   team->t.t_mxcsr = 0; /* not needed */
4511 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4512 
4513   team->t.t_construct = 0;
4514 
4515   team->t.t_ordered.dt.t_value = 0;
4516   team->t.t_master_active = FALSE;
4517 
4518   memset(&team->t.t_taskq, '\0', sizeof(kmp_taskq_t));
4519 
4520 #ifdef KMP_DEBUG
4521   team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4522 #endif
4523 #if KMP_OS_WINDOWS
4524   team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4525 #endif
4526 
4527   team->t.t_control_stack_top = NULL;
4528 
4529   __kmp_reinitialize_team(team, new_icvs, loc);
4530 
4531   KMP_MB();
4532   KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4533 }
4534 
4535 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4536 /* Sets full mask for thread and returns old mask, no changes to structures. */
4537 static void
4538 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4539   if (KMP_AFFINITY_CAPABLE()) {
4540     int status;
4541     if (old_mask != NULL) {
4542       status = __kmp_get_system_affinity(old_mask, TRUE);
4543       int error = errno;
4544       if (status != 0) {
4545         __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4546                     __kmp_msg_null);
4547       }
4548     }
4549     __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4550   }
4551 }
4552 #endif
4553 
4554 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4555 
4556 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4557 // It calculates the worker and master threads' partitions based upon the parent
4558 // thread's partition, and binds each worker to a place in its partition.
4559 // The master thread's partition should already include its current binding.
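// Illustrative example (hypothetical numbers): with 8 places, a master bound to
// place 3 and a partition of [2,7], proc_bind_master leaves every worker's new
// place at 3, while proc_bind_close with 4 threads keeps the master on place 3
// and assigns workers 1..3 to places 4, 5 and 6, wrapping back to first_place
// once last_place is passed.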
4560 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4561   // Copy the master thread's place partition to the team struct
4562   kmp_info_t *master_th = team->t.t_threads[0];
4563   KMP_DEBUG_ASSERT(master_th != NULL);
4564   kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4565   int first_place = master_th->th.th_first_place;
4566   int last_place = master_th->th.th_last_place;
4567   int masters_place = master_th->th.th_current_place;
4568   team->t.t_first_place = first_place;
4569   team->t.t_last_place = last_place;
4570 
4571   KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4572                 "bound to place %d partition = [%d,%d]\n",
4573                 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4574                 team->t.t_id, masters_place, first_place, last_place));
4575 
4576   switch (proc_bind) {
4577 
4578   case proc_bind_default:
4579     // Serial teams might have the proc_bind policy set to proc_bind_default.
4580     // It doesn't matter, as we don't rebind the master thread for any policy.
4581     KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4582     break;
4583 
4584   case proc_bind_master: {
4585     int f;
4586     int n_th = team->t.t_nproc;
4587     for (f = 1; f < n_th; f++) {
4588       kmp_info_t *th = team->t.t_threads[f];
4589       KMP_DEBUG_ASSERT(th != NULL);
4590       th->th.th_first_place = first_place;
4591       th->th.th_last_place = last_place;
4592       th->th.th_new_place = masters_place;
4593 #if OMP_50_ENABLED
4594       if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4595           team->t.t_display_affinity != 1) {
4596         team->t.t_display_affinity = 1;
4597       }
4598 #endif
4599 
4600       KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
4601                      "partition = [%d,%d]\n",
4602                      __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4603                      f, masters_place, first_place, last_place));
4604     }
4605   } break;
4606 
4607   case proc_bind_close: {
4608     int f;
4609     int n_th = team->t.t_nproc;
4610     int n_places;
4611     if (first_place <= last_place) {
4612       n_places = last_place - first_place + 1;
4613     } else {
4614       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4615     }
4616     if (n_th <= n_places) {
4617       int place = masters_place;
4618       for (f = 1; f < n_th; f++) {
4619         kmp_info_t *th = team->t.t_threads[f];
4620         KMP_DEBUG_ASSERT(th != NULL);
4621 
4622         if (place == last_place) {
4623           place = first_place;
4624         } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4625           place = 0;
4626         } else {
4627           place++;
4628         }
4629         th->th.th_first_place = first_place;
4630         th->th.th_last_place = last_place;
4631         th->th.th_new_place = place;
4632 #if OMP_50_ENABLED
4633         if (__kmp_display_affinity && place != th->th.th_current_place &&
4634             team->t.t_display_affinity != 1) {
4635           team->t.t_display_affinity = 1;
4636         }
4637 #endif
4638 
4639         KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4640                        "partition = [%d,%d]\n",
4641                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4642                        team->t.t_id, f, place, first_place, last_place));
4643       }
4644     } else {
4645       int S, rem, gap, s_count;
4646       S = n_th / n_places;
4647       s_count = 0;
4648       rem = n_th - (S * n_places);
4649       gap = rem > 0 ? n_places / rem : n_places;
4650       int place = masters_place;
4651       int gap_ct = gap;
4652       for (f = 0; f < n_th; f++) {
4653         kmp_info_t *th = team->t.t_threads[f];
4654         KMP_DEBUG_ASSERT(th != NULL);
4655 
4656         th->th.th_first_place = first_place;
4657         th->th.th_last_place = last_place;
4658         th->th.th_new_place = place;
4659 #if OMP_50_ENABLED
4660         if (__kmp_display_affinity && place != th->th.th_current_place &&
4661             team->t.t_display_affinity != 1) {
4662           team->t.t_display_affinity = 1;
4663         }
4664 #endif
4665         s_count++;
4666 
4667         if ((s_count == S) && rem && (gap_ct == gap)) {
4668           // do nothing, add an extra thread to place on next iteration
4669         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4670           // we added an extra thread to this place; move to next place
4671           if (place == last_place) {
4672             place = first_place;
4673           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4674             place = 0;
4675           } else {
4676             place++;
4677           }
4678           s_count = 0;
4679           gap_ct = 1;
4680           rem--;
4681         } else if (s_count == S) { // place full; don't add extra
4682           if (place == last_place) {
4683             place = first_place;
4684           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4685             place = 0;
4686           } else {
4687             place++;
4688           }
4689           gap_ct++;
4690           s_count = 0;
4691         }
4692 
4693         KA_TRACE(100,
4694                  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4695                   "partition = [%d,%d]\n",
4696                   __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4697                   th->th.th_new_place, first_place, last_place));
4698       }
4699       KMP_DEBUG_ASSERT(place == masters_place);
4700     }
4701   } break;
4702 
4703   case proc_bind_spread: {
4704     int f;
4705     int n_th = team->t.t_nproc;
4706     int n_places;
4707     int thidx;
4708     if (first_place <= last_place) {
4709       n_places = last_place - first_place + 1;
4710     } else {
4711       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4712     }
4713     if (n_th <= n_places) {
4714       int place = -1;
4715 
4716       if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4717         int S = n_places / n_th;
4718         int s_count, rem, gap, gap_ct;
4719 
4720         place = masters_place;
4721         rem = n_places - n_th * S;
4722         gap = rem ? n_th / rem : 1;
4723         gap_ct = gap;
4724         thidx = n_th;
4725         if (update_master_only == 1)
4726           thidx = 1;
4727         for (f = 0; f < thidx; f++) {
4728           kmp_info_t *th = team->t.t_threads[f];
4729           KMP_DEBUG_ASSERT(th != NULL);
4730 
4731           th->th.th_first_place = place;
4732           th->th.th_new_place = place;
4733 #if OMP_50_ENABLED
4734           if (__kmp_display_affinity && place != th->th.th_current_place &&
4735               team->t.t_display_affinity != 1) {
4736             team->t.t_display_affinity = 1;
4737           }
4738 #endif
4739           s_count = 1;
4740           while (s_count < S) {
4741             if (place == last_place) {
4742               place = first_place;
4743             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4744               place = 0;
4745             } else {
4746               place++;
4747             }
4748             s_count++;
4749           }
4750           if (rem && (gap_ct == gap)) {
4751             if (place == last_place) {
4752               place = first_place;
4753             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4754               place = 0;
4755             } else {
4756               place++;
4757             }
4758             rem--;
4759             gap_ct = 0;
4760           }
4761           th->th.th_last_place = place;
4762           gap_ct++;
4763 
4764           if (place == last_place) {
4765             place = first_place;
4766           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4767             place = 0;
4768           } else {
4769             place++;
4770           }
4771 
4772           KA_TRACE(100,
4773                    ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4774                     "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4775                     __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4776                     f, th->th.th_new_place, th->th.th_first_place,
4777                     th->th.th_last_place, __kmp_affinity_num_masks));
4778         }
4779       } else {
4780         /* With a uniform space of available computation places we can create
4781            T partitions of roughly P/T places each and put a thread into the
4782            first place of each partition. */
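        /* Worked example (illustrative numbers only): with n_places == 8,
           n_th == 3 and the master on place 0, spacing is 9.0 / 3 == 3.0, so
           the loop below produces partitions [0,2], [3,5] and [6,7] and binds
           each thread to the first place of its partition (places 0, 3, 6). */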
4783         double current = static_cast<double>(masters_place);
4784         double spacing =
4785             (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
4786         int first, last;
4787         kmp_info_t *th;
4788 
4789         thidx = n_th + 1;
4790         if (update_master_only == 1)
4791           thidx = 1;
4792         for (f = 0; f < thidx; f++) {
4793           first = static_cast<int>(current);
4794           last = static_cast<int>(current + spacing) - 1;
4795           KMP_DEBUG_ASSERT(last >= first);
4796           if (first >= n_places) {
4797             if (masters_place) {
4798               first -= n_places;
4799               last -= n_places;
4800               if (first == (masters_place + 1)) {
4801                 KMP_DEBUG_ASSERT(f == n_th);
4802                 first--;
4803               }
4804               if (last == masters_place) {
4805                 KMP_DEBUG_ASSERT(f == (n_th - 1));
4806                 last--;
4807               }
4808             } else {
4809               KMP_DEBUG_ASSERT(f == n_th);
4810               first = 0;
4811               last = 0;
4812             }
4813           }
4814           if (last >= n_places) {
4815             last = (n_places - 1);
4816           }
4817           place = first;
4818           current += spacing;
4819           if (f < n_th) {
4820             KMP_DEBUG_ASSERT(0 <= first);
4821             KMP_DEBUG_ASSERT(n_places > first);
4822             KMP_DEBUG_ASSERT(0 <= last);
4823             KMP_DEBUG_ASSERT(n_places > last);
4824             KMP_DEBUG_ASSERT(last_place >= first_place);
4825             th = team->t.t_threads[f];
4826             KMP_DEBUG_ASSERT(th);
4827             th->th.th_first_place = first;
4828             th->th.th_new_place = place;
4829             th->th.th_last_place = last;
4830 #if OMP_50_ENABLED
4831             if (__kmp_display_affinity && place != th->th.th_current_place &&
4832                 team->t.t_display_affinity != 1) {
4833               team->t.t_display_affinity = 1;
4834             }
4835 #endif
4836             KA_TRACE(100,
4837                      ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4838                       "partition = [%d,%d], spacing = %.4f\n",
4839                       __kmp_gtid_from_thread(team->t.t_threads[f]),
4840                       team->t.t_id, f, th->th.th_new_place,
4841                       th->th.th_first_place, th->th.th_last_place, spacing));
4842           }
4843         }
4844       }
4845       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4846     } else {
4847       int S, rem, gap, s_count;
4848       S = n_th / n_places;
4849       s_count = 0;
4850       rem = n_th - (S * n_places);
4851       gap = rem > 0 ? n_places / rem : n_places;
4852       int place = masters_place;
4853       int gap_ct = gap;
4854       thidx = n_th;
4855       if (update_master_only == 1)
4856         thidx = 1;
4857       for (f = 0; f < thidx; f++) {
4858         kmp_info_t *th = team->t.t_threads[f];
4859         KMP_DEBUG_ASSERT(th != NULL);
4860 
4861         th->th.th_first_place = place;
4862         th->th.th_last_place = place;
4863         th->th.th_new_place = place;
4864 #if OMP_50_ENABLED
4865         if (__kmp_display_affinity && place != th->th.th_current_place &&
4866             team->t.t_display_affinity != 1) {
4867           team->t.t_display_affinity = 1;
4868         }
4869 #endif
4870         s_count++;
4871 
4872         if ((s_count == S) && rem && (gap_ct == gap)) {
4873           // do nothing, add an extra thread to place on next iteration
4874         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4875           // we added an extra thread to this place; move on to next place
4876           if (place == last_place) {
4877             place = first_place;
4878           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4879             place = 0;
4880           } else {
4881             place++;
4882           }
4883           s_count = 0;
4884           gap_ct = 1;
4885           rem--;
4886         } else if (s_count == S) { // place is full; don't add extra thread
4887           if (place == last_place) {
4888             place = first_place;
4889           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4890             place = 0;
4891           } else {
4892             place++;
4893           }
4894           gap_ct++;
4895           s_count = 0;
4896         }
4897 
4898         KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4899                        "partition = [%d,%d]\n",
4900                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4901                        team->t.t_id, f, th->th.th_new_place,
4902                        th->th.th_first_place, th->th.th_last_place));
4903       }
4904       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4905     }
4906   } break;
4907 
4908   default:
4909     break;
4910   }
4911 
4912   KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4913 }
4914 
4915 #endif /* OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED */
4916 
4917 /* allocate a new team data structure to use.  take one off of the free pool if
4918    available */
4919 kmp_team_t *
4920 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4921 #if OMPT_SUPPORT
4922                     ompt_data_t ompt_parallel_data,
4923 #endif
4924 #if OMP_40_ENABLED
4925                     kmp_proc_bind_t new_proc_bind,
4926 #endif
4927                     kmp_internal_control_t *new_icvs,
4928                     int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4929   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4930   int f;
4931   kmp_team_t *team;
4932   int use_hot_team = !root->r.r_active;
4933   int level = 0;
4934 
4935   KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4936   KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4937   KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4938   KMP_MB();
4939 
4940 #if KMP_NESTED_HOT_TEAMS
4941   kmp_hot_team_ptr_t *hot_teams;
4942   if (master) {
4943     team = master->th.th_team;
4944     level = team->t.t_active_level;
4945     if (master->th.th_teams_microtask) { // in teams construct?
4946       if (master->th.th_teams_size.nteams > 1 &&
4947           ( // #teams > 1
4948               team->t.t_pkfn ==
4949                   (microtask_t)__kmp_teams_master || // inner fork of the teams
4950               master->th.th_teams_level <
4951                   team->t.t_level)) { // or nested parallel inside the teams
4952         ++level; // don't increment if #teams==1 or for the outer fork of the
4953         // teams; increment otherwise
4954       }
4955     }
4956     hot_teams = master->th.th_hot_teams;
4957     if (level < __kmp_hot_teams_max_level && hot_teams &&
4958         hot_teams[level]
4959             .hot_team) { // hot team has already been allocated for given level
4960       use_hot_team = 1;
4961     } else {
4962       use_hot_team = 0;
4963     }
4964   }
4965 #endif
4966   // Optimization to use a "hot" team
4967   if (use_hot_team && new_nproc > 1) {
4968     KMP_DEBUG_ASSERT(new_nproc == max_nproc);
4969 #if KMP_NESTED_HOT_TEAMS
4970     team = hot_teams[level].hot_team;
4971 #else
4972     team = root->r.r_hot_team;
4973 #endif
4974 #if KMP_DEBUG
4975     if (__kmp_tasking_mode != tskm_immediate_exec) {
4976       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
4977                     "task_team[1] = %p before reinit\n",
4978                     team->t.t_task_team[0], team->t.t_task_team[1]));
4979     }
4980 #endif
4981 
4982     // Has the number of threads changed?
4983     /* Let's assume the most common case is that the number of threads is
4984        unchanged, and put that case first. */
4985     if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
4986       KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
4987       // This case can mean that omp_set_num_threads() was called and the hot
4988       // team size was already reduced, so we check the special flag
4989       if (team->t.t_size_changed == -1) {
4990         team->t.t_size_changed = 1;
4991       } else {
4992         KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
4993       }
4994 
4995       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4996       kmp_r_sched_t new_sched = new_icvs->sched;
4997       // set master's schedule as new run-time schedule
4998       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
4999 
5000       __kmp_reinitialize_team(team, new_icvs,
5001                               root->r.r_uber_thread->th.th_ident);
5002 
5003       KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5004                     team->t.t_threads[0], team));
5005       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5006 
5007 #if OMP_40_ENABLED
5008 #if KMP_AFFINITY_SUPPORTED
5009       if ((team->t.t_size_changed == 0) &&
5010           (team->t.t_proc_bind == new_proc_bind)) {
5011         if (new_proc_bind == proc_bind_spread) {
5012           __kmp_partition_places(
5013               team, 1); // add flag to update only master for spread
5014         }
5015         KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5016                        "proc_bind = %d, partition = [%d,%d]\n",
5017                        team->t.t_id, new_proc_bind, team->t.t_first_place,
5018                        team->t.t_last_place));
5019       } else {
5020         KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5021         __kmp_partition_places(team);
5022       }
5023 #else
5024       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5025 #endif /* KMP_AFFINITY_SUPPORTED */
5026 #endif /* OMP_40_ENABLED */
5027     } else if (team->t.t_nproc > new_nproc) {
5028       KA_TRACE(20,
5029                ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5030                 new_nproc));
5031 
5032       team->t.t_size_changed = 1;
5033 #if KMP_NESTED_HOT_TEAMS
5034       if (__kmp_hot_teams_mode == 0) {
5035         // AC: in this mode the saved number of threads should match the team's
5036         // value; it can be bigger in mode 1, when the hot team holds reserves
5037         KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5038         hot_teams[level].hot_team_nth = new_nproc;
5039 #endif // KMP_NESTED_HOT_TEAMS
5040         /* release the extra threads we don't need any more */
5041         for (f = new_nproc; f < team->t.t_nproc; f++) {
5042           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5043           if (__kmp_tasking_mode != tskm_immediate_exec) {
5044             // When decreasing team size, threads no longer in the team should
5045             // unref task team.
5046             team->t.t_threads[f]->th.th_task_team = NULL;
5047           }
5048           __kmp_free_thread(team->t.t_threads[f]);
5049           team->t.t_threads[f] = NULL;
5050         }
5051 #if KMP_NESTED_HOT_TEAMS
5052       } // (__kmp_hot_teams_mode == 0)
5053       else {
5054         // When keeping extra threads in team, switch threads to wait on own
5055         // b_go flag
5056         for (f = new_nproc; f < team->t.t_nproc; ++f) {
5057           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5058           kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5059           for (int b = 0; b < bs_last_barrier; ++b) {
5060             if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5061               balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5062             }
5063             KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5064           }
5065         }
5066       }
5067 #endif // KMP_NESTED_HOT_TEAMS
5068       team->t.t_nproc = new_nproc;
5069       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5070       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5071       __kmp_reinitialize_team(team, new_icvs,
5072                               root->r.r_uber_thread->th.th_ident);
5073 
5074       /* update the remaining threads */
5075       for (f = 0; f < new_nproc; ++f) {
5076         team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5077       }
5078       // restore the current task state of the master thread: should be the
5079       // implicit task
5080       KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5081                     team->t.t_threads[0], team));
5082 
5083       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5084 
5085 #ifdef KMP_DEBUG
5086       for (f = 0; f < team->t.t_nproc; f++) {
5087         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5088                          team->t.t_threads[f]->th.th_team_nproc ==
5089                              team->t.t_nproc);
5090       }
5091 #endif
5092 
5093 #if OMP_40_ENABLED
5094       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5095 #if KMP_AFFINITY_SUPPORTED
5096       __kmp_partition_places(team);
5097 #endif
5098 #endif
5099     } else { // team->t.t_nproc < new_nproc
5100 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5101       kmp_affin_mask_t *old_mask;
5102       if (KMP_AFFINITY_CAPABLE()) {
5103         KMP_CPU_ALLOC(old_mask);
5104       }
5105 #endif
5106 
5107       KA_TRACE(20,
5108                ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5109                 new_nproc));
5110 
5111       team->t.t_size_changed = 1;
5112 
5113 #if KMP_NESTED_HOT_TEAMS
5114       int avail_threads = hot_teams[level].hot_team_nth;
5115       if (new_nproc < avail_threads)
5116         avail_threads = new_nproc;
5117       kmp_info_t **other_threads = team->t.t_threads;
5118       for (f = team->t.t_nproc; f < avail_threads; ++f) {
5119         // Adjust barrier data of reserved threads (if any) of the team
5120         // Other data will be set in __kmp_initialize_info() below.
5121         int b;
5122         kmp_balign_t *balign = other_threads[f]->th.th_bar;
5123         for (b = 0; b < bs_last_barrier; ++b) {
5124           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5125           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5126 #if USE_DEBUGGER
5127           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5128 #endif
5129         }
5130       }
5131       if (hot_teams[level].hot_team_nth >= new_nproc) {
5132         // we have all the needed threads in reserve, no need to allocate any;
5133         // this is only possible in mode 1; mode 0 cannot have reserved threads
5134         KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5135         team->t.t_nproc = new_nproc; // just get reserved threads involved
5136       } else {
5137         // we may have some threads in reserve, but not enough
5138         team->t.t_nproc =
5139             hot_teams[level]
5140                 .hot_team_nth; // get reserved threads involved if any
5141         hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5142 #endif // KMP_NESTED_HOT_TEAMS
5143         if (team->t.t_max_nproc < new_nproc) {
5144           /* reallocate larger arrays */
5145           __kmp_reallocate_team_arrays(team, new_nproc);
5146           __kmp_reinitialize_team(team, new_icvs, NULL);
5147         }
5148 
5149 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5150         /* Temporarily set the full mask for the master thread before creating
5151            workers. The reason is that workers inherit their affinity from the
5152            master, so if a lot of workers are created quickly on a single core,
5153            they don't get a chance to set their own affinity for a long time. */
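        /* (Descriptive note: the mask saved in old_mask is restored and freed
           right after the new workers are forked; see the
           KMP_AFFINITY_CAPABLE() block below.) */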
5154         __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5155 #endif
5156 
5157         /* allocate new threads for the hot team */
5158         for (f = team->t.t_nproc; f < new_nproc; f++) {
5159           kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5160           KMP_DEBUG_ASSERT(new_worker);
5161           team->t.t_threads[f] = new_worker;
5162 
5163           KA_TRACE(20,
5164                    ("__kmp_allocate_team: team %d init T#%d arrived: "
5165                     "join=%llu, plain=%llu\n",
5166                     team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5167                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5168                     team->t.t_bar[bs_plain_barrier].b_arrived));
5169 
5170           { // Initialize barrier data for new threads.
5171             int b;
5172             kmp_balign_t *balign = new_worker->th.th_bar;
5173             for (b = 0; b < bs_last_barrier; ++b) {
5174               balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5175               KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5176                                KMP_BARRIER_PARENT_FLAG);
5177 #if USE_DEBUGGER
5178               balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5179 #endif
5180             }
5181           }
5182         }
5183 
5184 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5185         if (KMP_AFFINITY_CAPABLE()) {
5186           /* Restore initial master thread's affinity mask */
5187           __kmp_set_system_affinity(old_mask, TRUE);
5188           KMP_CPU_FREE(old_mask);
5189         }
5190 #endif
5191 #if KMP_NESTED_HOT_TEAMS
5192       } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5193 #endif // KMP_NESTED_HOT_TEAMS
5194       /* make sure everyone is synchronized */
5195       int old_nproc = team->t.t_nproc; // save old value and use to update only
5196       // new threads below
5197       __kmp_initialize_team(team, new_nproc, new_icvs,
5198                             root->r.r_uber_thread->th.th_ident);
5199 
5200       /* reinitialize the threads */
5201       KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5202       for (f = 0; f < team->t.t_nproc; ++f)
5203         __kmp_initialize_info(team->t.t_threads[f], team, f,
5204                               __kmp_gtid_from_tid(f, team));
5205       if (level) { // set th_task_state for new threads in nested hot team
5206         // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5207         // only need to set the th_task_state for the new threads. th_task_state
5208         // for master thread will not be accurate until after this in
5209         // __kmp_fork_call(), so we look to the master's memo_stack to get the
5210         // correct value.
5211         for (f = old_nproc; f < team->t.t_nproc; ++f)
5212           team->t.t_threads[f]->th.th_task_state =
5213               team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5214       } else { // set th_task_state for new threads in non-nested hot team
5215         int old_state =
5216             team->t.t_threads[0]->th.th_task_state; // copy master's state
5217         for (f = old_nproc; f < team->t.t_nproc; ++f)
5218           team->t.t_threads[f]->th.th_task_state = old_state;
5219       }
5220 
5221 #ifdef KMP_DEBUG
5222       for (f = 0; f < team->t.t_nproc; ++f) {
5223         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5224                          team->t.t_threads[f]->th.th_team_nproc ==
5225                              team->t.t_nproc);
5226       }
5227 #endif
5228 
5229 #if OMP_40_ENABLED
5230       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5231 #if KMP_AFFINITY_SUPPORTED
5232       __kmp_partition_places(team);
5233 #endif
5234 #endif
5235     } // Check changes in number of threads
5236 
5237 #if OMP_40_ENABLED
5238     kmp_info_t *master = team->t.t_threads[0];
5239     if (master->th.th_teams_microtask) {
5240       for (f = 1; f < new_nproc; ++f) {
5241         // propagate teams construct specific info to workers
5242         kmp_info_t *thr = team->t.t_threads[f];
5243         thr->th.th_teams_microtask = master->th.th_teams_microtask;
5244         thr->th.th_teams_level = master->th.th_teams_level;
5245         thr->th.th_teams_size = master->th.th_teams_size;
5246       }
5247     }
5248 #endif /* OMP_40_ENABLED */
5249 #if KMP_NESTED_HOT_TEAMS
5250     if (level) {
5251       // Sync barrier state for nested hot teams, not needed for outermost hot
5252       // team.
5253       for (f = 1; f < new_nproc; ++f) {
5254         kmp_info_t *thr = team->t.t_threads[f];
5255         int b;
5256         kmp_balign_t *balign = thr->th.th_bar;
5257         for (b = 0; b < bs_last_barrier; ++b) {
5258           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5259           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5260 #if USE_DEBUGGER
5261           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5262 #endif
5263         }
5264       }
5265     }
5266 #endif // KMP_NESTED_HOT_TEAMS
5267 
5268     /* reallocate space for arguments if necessary */
5269     __kmp_alloc_argv_entries(argc, team, TRUE);
5270     KMP_CHECK_UPDATE(team->t.t_argc, argc);
5271     // The hot team re-uses the previous task team,
5272     // if untouched during the previous release->gather phase.
5273 
5274     KF_TRACE(10, (" hot_team = %p\n", team));
5275 
5276 #if KMP_DEBUG
5277     if (__kmp_tasking_mode != tskm_immediate_exec) {
5278       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5279                     "task_team[1] = %p after reinit\n",
5280                     team->t.t_task_team[0], team->t.t_task_team[1]));
5281     }
5282 #endif
5283 
5284 #if OMPT_SUPPORT
5285     __ompt_team_assign_id(team, ompt_parallel_data);
5286 #endif
5287 
5288     KMP_MB();
5289 
5290     return team;
5291   }
5292 
5293   /* next, let's try to take one from the team pool */
5294   KMP_MB();
5295   for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5296     /* TODO: consider resizing undersized teams instead of reaping them, now
5297        that we have a resizing mechanism */
5298     if (team->t.t_max_nproc >= max_nproc) {
5299       /* take this team from the team pool */
5300       __kmp_team_pool = team->t.t_next_pool;
5301 
5302       /* setup the team for fresh use */
5303       __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5304 
5305       KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5306                     "task_team[1] %p to NULL\n",
5307                     &team->t.t_task_team[0], &team->t.t_task_team[1]));
5308       team->t.t_task_team[0] = NULL;
5309       team->t.t_task_team[1] = NULL;
5310 
5311       /* reallocate space for arguments if necessary */
5312       __kmp_alloc_argv_entries(argc, team, TRUE);
5313       KMP_CHECK_UPDATE(team->t.t_argc, argc);
5314 
5315       KA_TRACE(
5316           20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5317                team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5318       { // Initialize barrier data.
5319         int b;
5320         for (b = 0; b < bs_last_barrier; ++b) {
5321           team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5322 #if USE_DEBUGGER
5323           team->t.t_bar[b].b_master_arrived = 0;
5324           team->t.t_bar[b].b_team_arrived = 0;
5325 #endif
5326         }
5327       }
5328 
5329 #if OMP_40_ENABLED
5330       team->t.t_proc_bind = new_proc_bind;
5331 #endif
5332 
5333       KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5334                     team->t.t_id));
5335 
5336 #if OMPT_SUPPORT
5337       __ompt_team_assign_id(team, ompt_parallel_data);
5338 #endif
5339 
5340       KMP_MB();
5341 
5342       return team;
5343     }
5344 
5345     /* reap team if it is too small, then loop back and check the next one */
5346     // not sure if this is wise, but it will be redone during the hot-teams
5347     // rewrite.
5348     /* TODO: Use technique to find the right size hot-team, don't reap them */
5349     team = __kmp_reap_team(team);
5350     __kmp_team_pool = team;
5351   }
5352 
5353   /* nothing available in the pool, no matter, make a new team! */
5354   KMP_MB();
5355   team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5356 
5357   /* and set it up */
5358   team->t.t_max_nproc = max_nproc;
5359   /* NOTE well, for some reason allocating one big buffer and dividing it up
5360      seems to really hurt performance a lot on the P4, so let's not use this */
5361   __kmp_allocate_team_arrays(team, max_nproc);
5362 
5363   KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5364   __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5365 
5366   KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5367                 "%p to NULL\n",
5368                 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5369   team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5370   // memory, no need to duplicate
5371   team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5372   // memory, no need to duplicate
5373 
5374   if (__kmp_storage_map) {
5375     __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5376   }
5377 
5378   /* allocate space for arguments */
5379   __kmp_alloc_argv_entries(argc, team, FALSE);
5380   team->t.t_argc = argc;
5381 
5382   KA_TRACE(20,
5383            ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5384             team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5385   { // Initialize barrier data.
5386     int b;
5387     for (b = 0; b < bs_last_barrier; ++b) {
5388       team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5389 #if USE_DEBUGGER
5390       team->t.t_bar[b].b_master_arrived = 0;
5391       team->t.t_bar[b].b_team_arrived = 0;
5392 #endif
5393     }
5394   }
5395 
5396 #if OMP_40_ENABLED
5397   team->t.t_proc_bind = new_proc_bind;
5398 #endif
5399 
5400 #if OMPT_SUPPORT
5401   __ompt_team_assign_id(team, ompt_parallel_data);
5402   team->t.ompt_serialized_team_info = NULL;
5403 #endif
5404 
5405   KMP_MB();
5406 
5407   KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5408                 team->t.t_id));
5409 
5410   return team;
5411 }
5412 
5413 /* TODO implement hot-teams at all levels */
5414 /* TODO implement lazy thread release on demand (disband request) */
5415 
5416 /* Free the team: return it to the team pool and release all the threads
5417  * associated with it. */
5418 void __kmp_free_team(kmp_root_t *root,
5419                      kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5420   int f;
5421   KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5422                 team->t.t_id));
5423 
5424   /* verify state */
5425   KMP_DEBUG_ASSERT(root);
5426   KMP_DEBUG_ASSERT(team);
5427   KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5428   KMP_DEBUG_ASSERT(team->t.t_threads);
5429 
5430   int use_hot_team = team == root->r.r_hot_team;
5431 #if KMP_NESTED_HOT_TEAMS
5432   int level;
5433   kmp_hot_team_ptr_t *hot_teams;
5434   if (master) {
5435     level = team->t.t_active_level - 1;
5436     if (master->th.th_teams_microtask) { // in teams construct?
5437       if (master->th.th_teams_size.nteams > 1) {
5438         ++level; // level was not increased in teams construct for
5439         // team_of_masters
5440       }
5441       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5442           master->th.th_teams_level == team->t.t_level) {
5443         ++level; // level was not increased in teams construct for
5444         // team_of_workers before the parallel
5445       } // team->t.t_level will be increased inside parallel
5446     }
5447     hot_teams = master->th.th_hot_teams;
5448     if (level < __kmp_hot_teams_max_level) {
5449       KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5450       use_hot_team = 1;
5451     }
5452   }
5453 #endif // KMP_NESTED_HOT_TEAMS
5454 
5455   /* team is done working */
5456   TCW_SYNC_PTR(team->t.t_pkfn,
5457                NULL); // Important for Debugging Support Library.
5458 #if KMP_OS_WINDOWS
5459   team->t.t_copyin_counter = 0; // init counter for possible reuse
5460 #endif
5461   // Do not reset pointer to parent team to NULL for hot teams.
5462 
5463   /* if this is not a hot team, release its threads */
5464   if (!use_hot_team) {
5465     if (__kmp_tasking_mode != tskm_immediate_exec) {
5466       // Wait for threads to reach reapable state
5467       for (f = 1; f < team->t.t_nproc; ++f) {
5468         KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5469         kmp_info_t *th = team->t.t_threads[f];
5470         volatile kmp_uint32 *state = &th->th.th_reap_state;
5471         while (*state != KMP_SAFE_TO_REAP) {
5472 #if KMP_OS_WINDOWS
5473           // On Windows a thread can be killed at any time, check this
5474           DWORD ecode;
5475           if (!__kmp_is_thread_alive(th, &ecode)) {
5476             *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5477             break;
5478           }
5479 #endif
5480           // first check if thread is sleeping
5481           kmp_flag_64 fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5482           if (fl.is_sleeping())
5483             fl.resume(__kmp_gtid_from_thread(th));
5484           KMP_CPU_PAUSE();
5485         }
5486       }
5487 
5488       // Delete task teams
5489       int tt_idx;
5490       for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5491         kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5492         if (task_team != NULL) {
5493           for (f = 0; f < team->t.t_nproc;
5494                ++f) { // Have all threads unref task teams
5495             team->t.t_threads[f]->th.th_task_team = NULL;
5496           }
5497           KA_TRACE(
5498               20,
5499               ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5500                __kmp_get_gtid(), task_team, team->t.t_id));
5501 #if KMP_NESTED_HOT_TEAMS
5502           __kmp_free_task_team(master, task_team);
5503 #endif
5504           team->t.t_task_team[tt_idx] = NULL;
5505         }
5506       }
5507     }
5508 
5509     // Reset pointer to parent team only for non-hot teams.
5510     team->t.t_parent = NULL;
5511     team->t.t_level = 0;
5512     team->t.t_active_level = 0;
5513 
5514     /* free the worker threads */
5515     for (f = 1; f < team->t.t_nproc; ++f) {
5516       KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5517       __kmp_free_thread(team->t.t_threads[f]);
5518       team->t.t_threads[f] = NULL;
5519     }
5520 
5521     /* put the team back in the team pool */
5522     /* TODO limit size of team pool, call reap_team if pool too large */
5523     team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5524     __kmp_team_pool = (volatile kmp_team_t *)team;
5525   }
5526 
5527   KMP_MB();
5528 }
5529 
5530 /* reap the team.  destroy it, reclaim all its resources and free its memory */
5531 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5532   kmp_team_t *next_pool = team->t.t_next_pool;
5533 
5534   KMP_DEBUG_ASSERT(team);
5535   KMP_DEBUG_ASSERT(team->t.t_dispatch);
5536   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5537   KMP_DEBUG_ASSERT(team->t.t_threads);
5538   KMP_DEBUG_ASSERT(team->t.t_argv);
5539 
5540   /* TODO clean the threads that are a part of this? */
5541 
5542   /* free stuff */
5543   __kmp_free_team_arrays(team);
5544   if (team->t.t_argv != &team->t.t_inline_argv[0])
5545     __kmp_free((void *)team->t.t_argv);
5546   __kmp_free(team);
5547 
5548   KMP_MB();
5549   return next_pool;
5550 }
5551 
5552 // Free the thread.  Don't reap it, just place it on the pool of available
5553 // threads.
5554 //
5555 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5556 // binding for the affinity mechanism to be useful.
5557 //
5558 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5559 // However, we want to avoid a potential performance problem by always
5560 // scanning through the list to find the correct point at which to insert
5561 // the thread (potential N**2 behavior).  To do this we keep track of the
5562 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5563 // With single-level parallelism, threads will always be added to the tail
5564 // of the list, kept track of by __kmp_thread_pool_insert_pt.  With nested
5565 // parallelism, all bets are off and we may need to scan through the entire
5566 // free list.
5567 //
5568 // This change also has a potentially large performance benefit, for some
5569 // applications.  Previously, as threads were freed from the hot team, they
// would be placed back on the free list in inverse order.  If the hot team
// grew back to its original size, then the freed threads would be placed
// back on the hot team in reverse order.  This could cause bad cache
// locality problems in programs where the size of the hot team regularly
// grew and shrank.
5575 //
// Now, for single-level parallelism, the OMP tid is always == gtid.
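//
// Illustrative sketch only (not compiled into the runtime): the same
// "sorted list + cached insertion point" idea in isolation, using
// hypothetical names; it mirrors the insertion logic in the function below.
//
//   struct node { int gtid; node *next; };
//   static node *pool = NULL; // list head, kept sorted by ascending gtid
//   static node *hint = NULL; // last node inserted (insertion-point cache)
//
//   static void pool_insert(node *n) {
//     if (hint && hint->gtid > n->gtid)
//       hint = NULL;                      // hint is past the new spot; rescan
//     node **scan = hint ? &hint->next : &pool;
//     while (*scan && (*scan)->gtid < n->gtid)
//       scan = &(*scan)->next;            // 0 iterations w/o nested parallelism
//     n->next = *scan;
//     *scan = n;
//     hint = n;                           // remember where we inserted
//   }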
5577 void __kmp_free_thread(kmp_info_t *this_th) {
5578   int gtid;
5579   kmp_info_t **scan;
5580   kmp_root_t *root = this_th->th.th_root;
5581 
5582   KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5583                 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5584 
5585   KMP_DEBUG_ASSERT(this_th);
5586 
5587   // When moving thread to pool, switch thread to wait on own b_go flag, and
5588   // uninitialized (NULL team).
5589   int b;
5590   kmp_balign_t *balign = this_th->th.th_bar;
5591   for (b = 0; b < bs_last_barrier; ++b) {
5592     if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5593       balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5594     balign[b].bb.team = NULL;
5595     balign[b].bb.leaf_kids = 0;
5596   }
5597   this_th->th.th_task_state = 0;
5598   this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5599 
5600   /* put thread back on the free pool */
5601   TCW_PTR(this_th->th.th_team, NULL);
5602   TCW_PTR(this_th->th.th_root, NULL);
5603   TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5604 
  /* If the implicit task assigned to this thread can be used by other threads,
   * then multiple threads can share the data and try to free the task in
   * __kmp_reap_thread at exit. This duplicate use of the task data can happen
   * with higher probability when the hot team is disabled, but can occur even
   * when the hot team is enabled. */
5610   __kmp_free_implicit_task(this_th);
5611   this_th->th.th_current_task = NULL;
5612 
5613   // If the __kmp_thread_pool_insert_pt is already past the new insert
5614   // point, then we need to re-scan the entire list.
5615   gtid = this_th->th.th_info.ds.ds_gtid;
5616   if (__kmp_thread_pool_insert_pt != NULL) {
5617     KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5618     if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5619       __kmp_thread_pool_insert_pt = NULL;
5620     }
5621   }
5622 
5623   // Scan down the list to find the place to insert the thread.
5624   // scan is the address of a link in the list, possibly the address of
5625   // __kmp_thread_pool itself.
5626   //
  // In the absence of nested parallelism, the for loop will have 0 iterations.
5628   if (__kmp_thread_pool_insert_pt != NULL) {
5629     scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5630   } else {
5631     scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5632   }
5633   for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5634        scan = &((*scan)->th.th_next_pool))
5635     ;
5636 
5637   // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5638   // to its address.
5639   TCW_PTR(this_th->th.th_next_pool, *scan);
5640   __kmp_thread_pool_insert_pt = *scan = this_th;
5641   KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5642                    (this_th->th.th_info.ds.ds_gtid <
5643                     this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5644   TCW_4(this_th->th.th_in_pool, TRUE);
5645   __kmp_thread_pool_nth++;
5646 
5647   TCW_4(__kmp_nth, __kmp_nth - 1);
5648   root->r.r_cg_nthreads--;
5649 
5650 #ifdef KMP_ADJUST_BLOCKTIME
5651   /* Adjust blocktime back to user setting or default if necessary */
5652   /* Middle initialization might never have occurred                */
5653   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5654     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5655     if (__kmp_nth <= __kmp_avail_proc) {
5656       __kmp_zero_bt = FALSE;
5657     }
5658   }
5659 #endif /* KMP_ADJUST_BLOCKTIME */
5660 
5661   KMP_MB();
5662 }
5663 
5664 /* ------------------------------------------------------------------------ */
5665 
5666 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5667   int gtid = this_thr->th.th_info.ds.ds_gtid;
5668   /*    void                 *stack_data;*/
5669   kmp_team_t *(*volatile pteam);
5670 
5671   KMP_MB();
5672   KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5673 
5674   if (__kmp_env_consistency_check) {
5675     this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5676   }
5677 
5678 #if OMPT_SUPPORT
5679   ompt_data_t *thread_data;
5680   if (ompt_enabled.enabled) {
5681     thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5682     *thread_data = ompt_data_none;
5683 
5684     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5685     this_thr->th.ompt_thread_info.wait_id = 0;
5686     this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5687     if (ompt_enabled.ompt_callback_thread_begin) {
5688       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5689           ompt_thread_worker, thread_data);
5690     }
5691   }
5692 #endif
5693 
5694 #if OMPT_SUPPORT
5695   if (ompt_enabled.enabled) {
5696     this_thr->th.ompt_thread_info.state = ompt_state_idle;
5697   }
5698 #endif
5699   /* This is the place where threads wait for work */
5700   while (!TCR_4(__kmp_global.g.g_done)) {
5701     KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5702     KMP_MB();
5703 
5704     /* wait for work to do */
5705     KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5706 
5707     /* No tid yet since not part of a team */
5708     __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5709 
5710 #if OMPT_SUPPORT
5711     if (ompt_enabled.enabled) {
5712       this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5713     }
5714 #endif
5715 
5716     pteam = (kmp_team_t * (*))(&this_thr->th.th_team);
5717 
5718     /* have we been allocated? */
5719     if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5720       /* we were just woken up, so run our new task */
5721       if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5722         int rc;
5723         KA_TRACE(20,
5724                  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5725                   gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5726                   (*pteam)->t.t_pkfn));
5727 
5728         updateHWFPControl(*pteam);
5729 
5730 #if OMPT_SUPPORT
5731         if (ompt_enabled.enabled) {
5732           this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5733         }
5734 #endif
5735 
5736         rc = (*pteam)->t.t_invoke(gtid);
5737         KMP_ASSERT(rc);
5738 
5739         KMP_MB();
5740         KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5741                       gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5742                       (*pteam)->t.t_pkfn));
5743       }
5744 #if OMPT_SUPPORT
5745       if (ompt_enabled.enabled) {
5746         /* no frame set while outside task */
5747         __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
5748 
5749         this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5750       }
5751 #endif
5752       /* join barrier after parallel region */
5753       __kmp_join_barrier(gtid);
5754     }
5755   }
5756   TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5757 
5758 #if OMPT_SUPPORT
5759   if (ompt_enabled.ompt_callback_thread_end) {
5760     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5761   }
5762 #endif
5763 
5764   this_thr->th.th_task_team = NULL;
5765   /* run the destructors for the threadprivate data for this thread */
5766   __kmp_common_destroy_gtid(gtid);
5767 
5768   KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5769   KMP_MB();
5770   return this_thr;
5771 }
5772 
5773 /* ------------------------------------------------------------------------ */
5774 
5775 void __kmp_internal_end_dest(void *specific_gtid) {
5776 #if KMP_COMPILER_ICC
5777 #pragma warning(push)
5778 #pragma warning(disable : 810) // conversion from "void *" to "int" may lose
5779 // significant bits
5780 #endif
5781   // Make sure no significant bits are lost
5782   int gtid = (kmp_intptr_t)specific_gtid - 1;
5783 #if KMP_COMPILER_ICC
5784 #pragma warning(pop)
5785 #endif
5786 
5787   KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
  /* NOTE: the gtid is stored as gtid+1 in thread-local storage;
   * this is because 0 is reserved for the nothing-stored case */
5790 
5791   /* josh: One reason for setting the gtid specific data even when it is being
5792      destroyed by pthread is to allow gtid lookup through thread specific data
5793      (__kmp_gtid_get_specific).  Some of the code, especially stat code,
5794      that gets executed in the call to __kmp_internal_end_thread, actually
5795      gets the gtid through the thread specific data.  Setting it here seems
5796      rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread
5797      to run smoothly.
5798      todo: get rid of this after we remove the dependence on
5799      __kmp_gtid_get_specific  */
5800   if (gtid >= 0 && KMP_UBER_GTID(gtid))
5801     __kmp_gtid_set_specific(gtid);
5802 #ifdef KMP_TDATA_GTID
5803   __kmp_gtid = gtid;
5804 #endif
5805   __kmp_internal_end_thread(gtid);
5806 }
5807 
5808 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5809 
5810 // 2009-09-08 (lev): It looks the destructor does not work. In simple test cases
5811 // destructors work perfectly, but in real libomp.so I have no evidence it is
5812 // ever called. However, -fini linker option in makefile.mk works fine.
5813 
5814 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5815   __kmp_internal_end_atexit();
5816 }
5817 
5818 void __kmp_internal_end_fini(void) { __kmp_internal_end_atexit(); }
5819 
5820 #endif
5821 
5822 /* [Windows] josh: when the atexit handler is called, there may still be more
5823    than one thread alive */
5824 void __kmp_internal_end_atexit(void) {
5825   KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5826   /* [Windows]
5827      josh: ideally, we want to completely shutdown the library in this atexit
5828      handler, but stat code that depends on thread specific data for gtid fails
5829      because that data becomes unavailable at some point during the shutdown, so
5830      we call __kmp_internal_end_thread instead. We should eventually remove the
5831      dependency on __kmp_get_specific_gtid in the stat code and use
5832      __kmp_internal_end_library to cleanly shutdown the library.
5833 
5834      // TODO: Can some of this comment about GVS be removed?
5835      I suspect that the offending stat code is executed when the calling thread
5836      tries to clean up a dead root thread's data structures, resulting in GVS
5837      code trying to close the GVS structures for that thread, but since the stat
5838      code uses __kmp_get_specific_gtid to get the gtid with the assumption that
     the calling thread is cleaning up itself instead of another thread, it gets
     confused. This happens because allowing a thread to unregister and clean up
5841      another thread is a recent modification for addressing an issue.
5842      Based on the current design (20050722), a thread may end up
5843      trying to unregister another thread only if thread death does not trigger
5844      the calling of __kmp_internal_end_thread.  For Linux* OS, there is the
5845      thread specific data destructor function to detect thread death. For
5846      Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
5847      is nothing.  Thus, the workaround is applicable only for Windows static
5848      stat library. */
5849   __kmp_internal_end_library(-1);
5850 #if KMP_OS_WINDOWS
5851   __kmp_close_console();
5852 #endif
5853 }
5854 
5855 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5856   // It is assumed __kmp_forkjoin_lock is acquired.
5857 
5858   int gtid;
5859 
5860   KMP_DEBUG_ASSERT(thread != NULL);
5861 
5862   gtid = thread->th.th_info.ds.ds_gtid;
5863 
5864   if (!is_root) {
5865     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5866       /* Assume the threads are at the fork barrier here */
5867       KA_TRACE(
5868           20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5869                gtid));
5870       /* Need release fence here to prevent seg faults for tree forkjoin barrier
5871        * (GEH) */
5872       ANNOTATE_HAPPENS_BEFORE(thread);
5873       kmp_flag_64 flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
5874       __kmp_release_64(&flag);
5875     }
5876 
5877     // Terminate OS thread.
5878     __kmp_reap_worker(thread);
5879 
5880     // The thread was killed asynchronously.  If it was actively
5881     // spinning in the thread pool, decrement the global count.
5882     //
5883     // There is a small timing hole here - if the worker thread was just waking
    // up after sleeping in the pool, had reset its th_active_in_pool flag but
5885     // not decremented the global counter __kmp_thread_pool_active_nth yet, then
5886     // the global counter might not get updated.
5887     //
5888     // Currently, this can only happen as the library is unloaded,
5889     // so there are no harmful side effects.
5890     if (thread->th.th_active_in_pool) {
5891       thread->th.th_active_in_pool = FALSE;
5892       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
5893       KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
5894     }
5895 
5896     // Decrement # of [worker] threads in the pool.
5897     KMP_DEBUG_ASSERT(__kmp_thread_pool_nth > 0);
5898     --__kmp_thread_pool_nth;
5899   }
5900 
5901   __kmp_free_implicit_task(thread);
5902 
5903 // Free the fast memory for tasking
5904 #if USE_FAST_MEMORY
5905   __kmp_free_fast_memory(thread);
5906 #endif /* USE_FAST_MEMORY */
5907 
5908   __kmp_suspend_uninitialize_thread(thread);
5909 
5910   KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
5911   TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5912 
5913   --__kmp_all_nth;
5914 // __kmp_nth was decremented when thread is added to the pool.
5915 
5916 #ifdef KMP_ADJUST_BLOCKTIME
5917   /* Adjust blocktime back to user setting or default if necessary */
5918   /* Middle initialization might never have occurred                */
5919   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5920     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5921     if (__kmp_nth <= __kmp_avail_proc) {
5922       __kmp_zero_bt = FALSE;
5923     }
5924   }
5925 #endif /* KMP_ADJUST_BLOCKTIME */
5926 
5927   /* free the memory being used */
5928   if (__kmp_env_consistency_check) {
5929     if (thread->th.th_cons) {
5930       __kmp_free_cons_stack(thread->th.th_cons);
5931       thread->th.th_cons = NULL;
5932     }
5933   }
5934 
5935   if (thread->th.th_pri_common != NULL) {
5936     __kmp_free(thread->th.th_pri_common);
5937     thread->th.th_pri_common = NULL;
5938   }
5939 
5940   if (thread->th.th_task_state_memo_stack != NULL) {
5941     __kmp_free(thread->th.th_task_state_memo_stack);
5942     thread->th.th_task_state_memo_stack = NULL;
5943   }
5944 
5945 #if KMP_USE_BGET
5946   if (thread->th.th_local.bget_data != NULL) {
5947     __kmp_finalize_bget(thread);
5948   }
5949 #endif
5950 
5951 #if KMP_AFFINITY_SUPPORTED
5952   if (thread->th.th_affin_mask != NULL) {
5953     KMP_CPU_FREE(thread->th.th_affin_mask);
5954     thread->th.th_affin_mask = NULL;
5955   }
5956 #endif /* KMP_AFFINITY_SUPPORTED */
5957 
5958 #if KMP_USE_HIER_SCHED
5959   if (thread->th.th_hier_bar_data != NULL) {
5960     __kmp_free(thread->th.th_hier_bar_data);
5961     thread->th.th_hier_bar_data = NULL;
5962   }
5963 #endif
5964 
5965   __kmp_reap_team(thread->th.th_serial_team);
5966   thread->th.th_serial_team = NULL;
5967   __kmp_free(thread);
5968 
5969   KMP_MB();
5970 
5971 } // __kmp_reap_thread
5972 
5973 static void __kmp_internal_end(void) {
5974   int i;
5975 
5976   /* First, unregister the library */
5977   __kmp_unregister_library();
5978 
5979 #if KMP_OS_WINDOWS
5980   /* In Win static library, we can't tell when a root actually dies, so we
5981      reclaim the data structures for any root threads that have died but not
5982      unregistered themselves, in order to shut down cleanly.
5983      In Win dynamic library we also can't tell when a thread dies.  */
5984   __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
5985 // dead roots
5986 #endif
5987 
5988   for (i = 0; i < __kmp_threads_capacity; i++)
5989     if (__kmp_root[i])
5990       if (__kmp_root[i]->r.r_active)
5991         break;
5992   KMP_MB(); /* Flush all pending memory write invalidates.  */
5993   TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
5994 
5995   if (i < __kmp_threads_capacity) {
5996 #if KMP_USE_MONITOR
5997     // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
5998     KMP_MB(); /* Flush all pending memory write invalidates.  */
5999 
6000     // Need to check that monitor was initialized before reaping it. If we are
    // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6002     // __kmp_monitor will appear to contain valid data, but it is only valid in
6003     // the parent process, not the child.
6004     // New behavior (201008): instead of keying off of the flag
6005     // __kmp_init_parallel, the monitor thread creation is keyed off
6006     // of the new flag __kmp_init_monitor.
6007     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6008     if (TCR_4(__kmp_init_monitor)) {
6009       __kmp_reap_monitor(&__kmp_monitor);
6010       TCW_4(__kmp_init_monitor, 0);
6011     }
6012     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6013     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6014 #endif // KMP_USE_MONITOR
6015   } else {
6016 /* TODO move this to cleanup code */
6017 #ifdef KMP_DEBUG
6018     /* make sure that everything has properly ended */
6019     for (i = 0; i < __kmp_threads_capacity; i++) {
6020       if (__kmp_root[i]) {
6021         //                    KMP_ASSERT( ! KMP_UBER_GTID( i ) );         // AC:
6022         //                    there can be uber threads alive here
6023         KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6024       }
6025     }
6026 #endif
6027 
6028     KMP_MB();
6029 
6030     // Reap the worker threads.
6031     // This is valid for now, but be careful if threads are reaped sooner.
    while (__kmp_thread_pool != NULL) { // Loop through all threads in the pool.
6033       // Get the next thread from the pool.
6034       kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6035       __kmp_thread_pool = thread->th.th_next_pool;
6036       // Reap it.
6037       KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6038       thread->th.th_next_pool = NULL;
6039       thread->th.th_in_pool = FALSE;
6040       __kmp_reap_thread(thread, 0);
6041     }
6042     __kmp_thread_pool_insert_pt = NULL;
6043 
6044     // Reap teams.
    while (__kmp_team_pool != NULL) { // Loop through all teams in the pool.
6046       // Get the next team from the pool.
6047       kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6048       __kmp_team_pool = team->t.t_next_pool;
6049       // Reap it.
6050       team->t.t_next_pool = NULL;
6051       __kmp_reap_team(team);
6052     }
6053 
6054     __kmp_reap_task_teams();
6055 
6056 #if KMP_OS_UNIX
6057     // Threads that are not reaped should not access any resources since they
6058     // are going to be deallocated soon, so the shutdown sequence should wait
6059     // until all threads either exit the final spin-waiting loop or begin
6060     // sleeping after the given blocktime.
6061     for (i = 0; i < __kmp_threads_capacity; i++) {
6062       kmp_info_t *thr = __kmp_threads[i];
6063       while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6064         KMP_CPU_PAUSE();
6065     }
6066 #endif
6067 
6068     for (i = 0; i < __kmp_threads_capacity; ++i) {
6069       // TBD: Add some checking...
6070       // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6071     }
6072 
6073     /* Make sure all threadprivate destructors get run by joining with all
6074        worker threads before resetting this flag */
6075     TCW_SYNC_4(__kmp_init_common, FALSE);
6076 
6077     KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6078     KMP_MB();
6079 
6080 #if KMP_USE_MONITOR
6081     // See note above: One of the possible fixes for CQ138434 / CQ140126
6082     //
6083     // FIXME: push both code fragments down and CSE them?
6084     // push them into __kmp_cleanup() ?
6085     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6086     if (TCR_4(__kmp_init_monitor)) {
6087       __kmp_reap_monitor(&__kmp_monitor);
6088       TCW_4(__kmp_init_monitor, 0);
6089     }
6090     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6091     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6092 #endif
6093   } /* else !__kmp_global.t_active */
6094   TCW_4(__kmp_init_gtid, FALSE);
6095   KMP_MB(); /* Flush all pending memory write invalidates.  */
6096 
6097   __kmp_cleanup();
6098 #if OMPT_SUPPORT
6099   ompt_fini();
6100 #endif
6101 }
6102 
6103 void __kmp_internal_end_library(int gtid_req) {
6104   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6105   /* this shouldn't be a race condition because __kmp_internal_end() is the
6106      only place to clear __kmp_serial_init */
6107   /* we'll check this later too, after we get the lock */
6108   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
  // redundant, because the next check will work in any case.
6110   if (__kmp_global.g.g_abort) {
6111     KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6112     /* TODO abort? */
6113     return;
6114   }
6115   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6116     KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6117     return;
6118   }
6119 
6120   KMP_MB(); /* Flush all pending memory write invalidates.  */
6121 
6122   /* find out who we are and what we should do */
6123   {
6124     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6125     KA_TRACE(
6126         10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req));
6127     if (gtid == KMP_GTID_SHUTDOWN) {
6128       KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6129                     "already shutdown\n"));
6130       return;
6131     } else if (gtid == KMP_GTID_MONITOR) {
6132       KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6133                     "registered, or system shutdown\n"));
6134       return;
6135     } else if (gtid == KMP_GTID_DNE) {
6136       KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6137                     "shutdown\n"));
      /* we don't know who we are, but we may still shut down the library */
6139     } else if (KMP_UBER_GTID(gtid)) {
6140       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6141       if (__kmp_root[gtid]->r.r_active) {
6142         __kmp_global.g.g_abort = -1;
6143         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6144         KA_TRACE(10,
6145                  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6146                   gtid));
6147         return;
6148       } else {
6149         KA_TRACE(
6150             10,
6151             ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6152         __kmp_unregister_root_current_thread(gtid);
6153       }
6154     } else {
6155 /* worker threads may call this function through the atexit handler, if they
6156  * call exit() */
6157 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6158    TODO: do a thorough shutdown instead */
6159 #ifdef DUMP_DEBUG_ON_EXIT
6160       if (__kmp_debug_buf)
6161         __kmp_dump_debug_buffer();
6162 #endif
6163       return;
6164     }
6165   }
6166   /* synchronize the termination process */
6167   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6168 
6169   /* have we already finished */
6170   if (__kmp_global.g.g_abort) {
6171     KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6172     /* TODO abort? */
6173     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6174     return;
6175   }
6176   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6177     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6178     return;
6179   }
6180 
6181   /* We need this lock to enforce mutex between this reading of
6182      __kmp_threads_capacity and the writing by __kmp_register_root.
6183      Alternatively, we can use a counter of roots that is atomically updated by
6184      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6185      __kmp_internal_end_*.  */
6186   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6187 
6188   /* now we can safely conduct the actual termination */
6189   __kmp_internal_end();
6190 
6191   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6192   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6193 
6194   KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6195 
6196 #ifdef DUMP_DEBUG_ON_EXIT
6197   if (__kmp_debug_buf)
6198     __kmp_dump_debug_buffer();
6199 #endif
6200 
6201 #if KMP_OS_WINDOWS
6202   __kmp_close_console();
6203 #endif
6204 
6205   __kmp_fini_allocator();
6206 
6207 } // __kmp_internal_end_library
6208 
6209 void __kmp_internal_end_thread(int gtid_req) {
6210   int i;
6211 
6212   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6213   /* this shouldn't be a race condition because __kmp_internal_end() is the
6214    * only place to clear __kmp_serial_init */
6215   /* we'll check this later too, after we get the lock */
6216   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6217   // redundant, because the next check will work in any case.
6218   if (__kmp_global.g.g_abort) {
6219     KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6220     /* TODO abort? */
6221     return;
6222   }
6223   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6224     KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6225     return;
6226   }
6227 
6228   KMP_MB(); /* Flush all pending memory write invalidates.  */
6229 
6230   /* find out who we are and what we should do */
6231   {
6232     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6233     KA_TRACE(10,
6234              ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req));
6235     if (gtid == KMP_GTID_SHUTDOWN) {
6236       KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6237                     "already shutdown\n"));
6238       return;
6239     } else if (gtid == KMP_GTID_MONITOR) {
6240       KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6241                     "registered, or system shutdown\n"));
6242       return;
6243     } else if (gtid == KMP_GTID_DNE) {
6244       KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6245                     "shutdown\n"));
6246       return;
6247       /* we don't know who we are */
6248     } else if (KMP_UBER_GTID(gtid)) {
6249       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6250       if (__kmp_root[gtid]->r.r_active) {
6251         __kmp_global.g.g_abort = -1;
6252         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6253         KA_TRACE(10,
6254                  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6255                   gtid));
6256         return;
6257       } else {
6258         KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6259                       gtid));
6260         __kmp_unregister_root_current_thread(gtid);
6261       }
6262     } else {
6263       /* just a worker thread, let's leave */
6264       KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6265 
6266       if (gtid >= 0) {
6267         __kmp_threads[gtid]->th.th_task_team = NULL;
6268       }
6269 
6270       KA_TRACE(10,
6271                ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6272                 gtid));
6273       return;
6274     }
6275   }
6276 #if KMP_DYNAMIC_LIB
  // AC: let's not shut down the Linux* OS dynamic library at the exit of the
  // uber thread, because it is better to shut down later in the library
  // destructor. The reason for this change is a performance problem observed
  // when a non-OpenMP thread in a loop forks and joins many OpenMP threads. We
  // can save a lot of time by keeping worker threads alive until the program
  // shuts down.
6282   // OM: Removed Linux* OS restriction to fix the crash on OS X* (DPD200239966)
6283   // and Windows(DPD200287443) that occurs when using critical sections from
6284   // foreign threads.
6285   if (__kmp_pause_status != kmp_hard_paused) {
6286     KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6287     return;
6288   }
6289 #endif
6290   /* synchronize the termination process */
6291   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6292 
6293   /* have we already finished */
6294   if (__kmp_global.g.g_abort) {
6295     KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6296     /* TODO abort? */
6297     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6298     return;
6299   }
6300   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6301     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6302     return;
6303   }
6304 
6305   /* We need this lock to enforce mutex between this reading of
6306      __kmp_threads_capacity and the writing by __kmp_register_root.
6307      Alternatively, we can use a counter of roots that is atomically updated by
6308      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6309      __kmp_internal_end_*.  */
6310 
6311   /* should we finish the run-time?  are all siblings done? */
6312   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6313 
6314   for (i = 0; i < __kmp_threads_capacity; ++i) {
6315     if (KMP_UBER_GTID(i)) {
6316       KA_TRACE(
6317           10,
6318           ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6319       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6320       __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6321       return;
6322     }
6323   }
6324 
6325   /* now we can safely conduct the actual termination */
6326 
6327   __kmp_internal_end();
6328 
6329   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6330   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6331 
6332   KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6333 
6334 #ifdef DUMP_DEBUG_ON_EXIT
6335   if (__kmp_debug_buf)
6336     __kmp_dump_debug_buffer();
6337 #endif
6338 } // __kmp_internal_end_thread
6339 
6340 // -----------------------------------------------------------------------------
6341 // Library registration stuff.
6342 
6343 static long __kmp_registration_flag = 0;
6344 // Random value used to indicate library initialization.
6345 static char *__kmp_registration_str = NULL;
6346 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6347 
6348 static inline char *__kmp_reg_status_name() {
6349   /* On RHEL 3u5 if linked statically, getpid() returns different values in
6350      each thread. If registration and unregistration go in different threads
     (omp_misc_other_root_exit.cpp test case), the registered_lib_env env var
     cannot be found, because the name will contain a different pid. */
6353   return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
} // __kmp_reg_status_name
6355 
6356 void __kmp_register_library_startup(void) {
6357 
6358   char *name = __kmp_reg_status_name(); // Name of the environment variable.
6359   int done = 0;
6360   union {
6361     double dtime;
6362     long ltime;
6363   } time;
6364 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6365   __kmp_initialize_system_tick();
6366 #endif
6367   __kmp_read_system_time(&time.dtime);
6368   __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6369   __kmp_registration_str =
6370       __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6371                        __kmp_registration_flag, KMP_LIBRARY_FILE);
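  // The value written below therefore has the form
  //   "<address of __kmp_registration_flag>-<flag in hex>-<KMP_LIBRARY_FILE>"
  // e.g. something like "0x...-cafe1234-libomp.so" (the concrete address, the
  // low 16 time bits of the flag, and the file name vary from run to run and
  // platform to platform).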
6372 
6373   KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6374                 __kmp_registration_str));
6375 
6376   while (!done) {
6377 
6378     char *value = NULL; // Actual value of the environment variable.
6379 
    // Set the environment variable, but do not overwrite it if it already exists.
6381     __kmp_env_set(name, __kmp_registration_str, 0);
    // Check that the variable was written.
6383     value = __kmp_env_get(name);
6384     if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6385 
6386       done = 1; // Ok, environment variable set successfully, exit the loop.
6387 
6388     } else {
6389 
6390       // Oops. Write failed. Another copy of OpenMP RTL is in memory.
      // Check whether it is alive or dead.
6392       int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6393       char *tail = value;
6394       char *flag_addr_str = NULL;
6395       char *flag_val_str = NULL;
6396       char const *file_name = NULL;
6397       __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6398       __kmp_str_split(tail, '-', &flag_val_str, &tail);
6399       file_name = tail;
6400       if (tail != NULL) {
6401         long *flag_addr = 0;
6402         long flag_val = 0;
6403         KMP_SSCANF(flag_addr_str, "%p", RCAST(void**, &flag_addr));
6404         KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6405         if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6406           // First, check whether environment-encoded address is mapped into
6407           // addr space.
6408           // If so, dereference it to see if it still has the right value.
6409           if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6410             neighbor = 1;
6411           } else {
6412             // If not, then we know the other copy of the library is no longer
6413             // running.
6414             neighbor = 2;
6415           }
6416         }
6417       }
6418       switch (neighbor) {
6419       case 0: // Cannot parse environment variable -- neighbor status unknown.
        // Assume it is an incompatible format from a future version of the
        // library, and that the other library is alive.
6422         // WARN( ... ); // TODO: Issue a warning.
6423         file_name = "unknown library";
6424         KMP_FALLTHROUGH();
      // Attention! Falling through to the next case. That's intentional.
6426       case 1: { // Neighbor is alive.
6427         // Check it is allowed.
6428         char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6429         if (!__kmp_str_match_true(duplicate_ok)) {
6430           // That's not allowed. Issue fatal error.
6431           __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6432                       KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6433         }
6434         KMP_INTERNAL_FREE(duplicate_ok);
6435         __kmp_duplicate_library_ok = 1;
6436         done = 1; // Exit the loop.
6437       } break;
6438       case 2: { // Neighbor is dead.
6439         // Clear the variable and try to register library again.
6440         __kmp_env_unset(name);
6441       } break;
6442       default: { KMP_DEBUG_ASSERT(0); } break;
6443       }
6444     }
6445     KMP_INTERNAL_FREE((void *)value);
6446   }
6447   KMP_INTERNAL_FREE((void *)name);
6448 
6449 } // func __kmp_register_library_startup
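
// Usage note: if two copies of the runtime are intentionally loaded into one
// process, the fatal DuplicateLibrary error raised above can be suppressed by
// setting KMP_DUPLICATE_LIB_OK to a true value in the environment, e.g.
//   KMP_DUPLICATE_LIB_OK=TRUE ./a.out
// (this is what the __kmp_str_match_true(duplicate_ok) check in case 1 tests).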
6450 
6451 void __kmp_unregister_library(void) {
6452 
6453   char *name = __kmp_reg_status_name();
6454   char *value = __kmp_env_get(name);
6455 
6456   KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6457   KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6458   if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6459     // Ok, this is our variable. Delete it.
6460     __kmp_env_unset(name);
6461   }
6462 
6463   KMP_INTERNAL_FREE(__kmp_registration_str);
6464   KMP_INTERNAL_FREE(value);
6465   KMP_INTERNAL_FREE(name);
6466 
6467   __kmp_registration_flag = 0;
6468   __kmp_registration_str = NULL;
6469 
6470 } // __kmp_unregister_library
6471 
6472 // End of Library registration stuff.
6473 // -----------------------------------------------------------------------------
6474 
6475 #if KMP_MIC_SUPPORTED
6476 
6477 static void __kmp_check_mic_type() {
6478   kmp_cpuid_t cpuid_state = {0};
6479   kmp_cpuid_t *cs_p = &cpuid_state;
6480   __kmp_x86_cpuid(1, 0, cs_p);
6481   // We don't support mic1 at the moment
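  // The masks below pick family/model bits out of CPUID leaf 1 EAX:
  // 0xff0 keeps family+model, and 0xB10 matches the Knights Corner (KNC)
  // signature; 0xf0ff0 additionally keeps the extended-model bits, and
  // 0x50670 (family 6, model 0x57) matches the Knights Landing (KNL) signature.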
6482   if ((cs_p->eax & 0xff0) == 0xB10) {
6483     __kmp_mic_type = mic2;
6484   } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6485     __kmp_mic_type = mic3;
6486   } else {
6487     __kmp_mic_type = non_mic;
6488   }
6489 }
6490 
6491 #endif /* KMP_MIC_SUPPORTED */
6492 
6493 static void __kmp_do_serial_initialize(void) {
6494   int i, gtid;
6495   int size;
6496 
6497   KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6498 
6499   KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6500   KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6501   KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6502   KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6503   KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6504 
6505 #if OMPT_SUPPORT
6506   ompt_pre_init();
6507 #endif
6508 
6509   __kmp_validate_locks();
6510 
6511   /* Initialize internal memory allocator */
6512   __kmp_init_allocator();
6513 
6514   /* Register the library startup via an environment variable and check to see
6515      whether another copy of the library is already registered. */
6516 
6517   __kmp_register_library_startup();
6518 
6519   /* TODO reinitialization of library */
6520   if (TCR_4(__kmp_global.g.g_done)) {
6521     KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6522   }
6523 
6524   __kmp_global.g.g_abort = 0;
6525   TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6526 
6527 /* initialize the locks */
6528 #if KMP_USE_ADAPTIVE_LOCKS
6529 #if KMP_DEBUG_ADAPTIVE_LOCKS
6530   __kmp_init_speculative_stats();
6531 #endif
6532 #endif
6533 #if KMP_STATS_ENABLED
6534   __kmp_stats_init();
6535 #endif
6536   __kmp_init_lock(&__kmp_global_lock);
6537   __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6538   __kmp_init_lock(&__kmp_debug_lock);
6539   __kmp_init_atomic_lock(&__kmp_atomic_lock);
6540   __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6541   __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6542   __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6543   __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6544   __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6545   __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6546   __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6547   __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6548   __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6549   __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6550   __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6551   __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6552   __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6553   __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6554 #if KMP_USE_MONITOR
6555   __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6556 #endif
6557   __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6558 
6559   /* conduct initialization and initial setup of configuration */
6560 
6561   __kmp_runtime_initialize();
6562 
6563 #if KMP_MIC_SUPPORTED
6564   __kmp_check_mic_type();
6565 #endif
6566 
6567 // Some global variable initialization moved here from kmp_env_initialize()
6568 #ifdef KMP_DEBUG
6569   kmp_diag = 0;
6570 #endif
6571   __kmp_abort_delay = 0;
6572 
6573   // From __kmp_init_dflt_team_nth()
6574   /* assume the entire machine will be used */
6575   __kmp_dflt_team_nth_ub = __kmp_xproc;
6576   if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6577     __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6578   }
6579   if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6580     __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6581   }
6582   __kmp_max_nth = __kmp_sys_max_nth;
6583   __kmp_cg_max_nth = __kmp_sys_max_nth;
6584   __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6585   if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6586     __kmp_teams_max_nth = __kmp_sys_max_nth;
6587   }
6588 
6589   // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6590   // part
6591   __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6592 #if KMP_USE_MONITOR
6593   __kmp_monitor_wakeups =
6594       KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6595   __kmp_bt_intervals =
6596       KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6597 #endif
6598   // From "KMP_LIBRARY" part of __kmp_env_initialize()
6599   __kmp_library = library_throughput;
6600   // From KMP_SCHEDULE initialization
6601   __kmp_static = kmp_sch_static_balanced;
// AC: do not use analytical here, because it is non-monotonic
6603 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6604 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6605 // need to repeat assignment
6606 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6607 // bit control and barrier method control parts
6608 #if KMP_FAST_REDUCTION_BARRIER
6609 #define kmp_reduction_barrier_gather_bb ((int)1)
6610 #define kmp_reduction_barrier_release_bb ((int)1)
6611 #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6612 #define kmp_reduction_barrier_release_pat bp_hyper_bar
6613 #endif // KMP_FAST_REDUCTION_BARRIER
6614   for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6615     __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6616     __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6617     __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6618     __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6619 #if KMP_FAST_REDUCTION_BARRIER
6620     if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
6621       // lin_64 ): hyper,1
6622       __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6623       __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6624       __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6625       __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6626     }
6627 #endif // KMP_FAST_REDUCTION_BARRIER
6628   }
6629 #if KMP_FAST_REDUCTION_BARRIER
6630 #undef kmp_reduction_barrier_release_pat
6631 #undef kmp_reduction_barrier_gather_pat
6632 #undef kmp_reduction_barrier_release_bb
6633 #undef kmp_reduction_barrier_gather_bb
6634 #endif // KMP_FAST_REDUCTION_BARRIER
6635 #if KMP_MIC_SUPPORTED
6636   if (__kmp_mic_type == mic2) { // KNC
6637     // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6638     __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6639     __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6640         1; // forkjoin release
6641     __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6642     __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6643   }
6644 #if KMP_FAST_REDUCTION_BARRIER
6645   if (__kmp_mic_type == mic2) { // KNC
6646     __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6647     __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6648   }
6649 #endif // KMP_FAST_REDUCTION_BARRIER
6650 #endif // KMP_MIC_SUPPORTED
6651 
6652 // From KMP_CHECKS initialization
6653 #ifdef KMP_DEBUG
6654   __kmp_env_checks = TRUE; /* development versions have the extra checks */
6655 #else
6656   __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6657 #endif
6658 
6659   // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6660   __kmp_foreign_tp = TRUE;
6661 
6662   __kmp_global.g.g_dynamic = FALSE;
6663   __kmp_global.g.g_dynamic_mode = dynamic_default;
6664 
6665   __kmp_env_initialize(NULL);
6666 
6667 // Print all messages in message catalog for testing purposes.
6668 #ifdef KMP_DEBUG
6669   char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6670   if (__kmp_str_match_true(val)) {
6671     kmp_str_buf_t buffer;
6672     __kmp_str_buf_init(&buffer);
6673     __kmp_i18n_dump_catalog(&buffer);
6674     __kmp_printf("%s", buffer.str);
6675     __kmp_str_buf_free(&buffer);
6676   }
6677   __kmp_env_free(&val);
6678 #endif
6679 
6680   __kmp_threads_capacity =
6681       __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6682   // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6683   __kmp_tp_capacity = __kmp_default_tp_capacity(
6684       __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6685 
6686   // If the library is shut down properly, both pools must be NULL. Just in
6687   // case, set them to NULL -- some memory may leak, but subsequent code will
6688   // work even if pools are not freed.
6689   KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6690   KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6691   KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6692   __kmp_thread_pool = NULL;
6693   __kmp_thread_pool_insert_pt = NULL;
6694   __kmp_team_pool = NULL;
6695 
6696   /* Allocate all of the variable sized records */
6697   /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6698    * expandable */
6699   /* Since allocation is cache-aligned, just add extra padding at the end */
6700   size =
6701       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6702       CACHE_LINE;
6703   __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6704   __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6705                                sizeof(kmp_info_t *) * __kmp_threads_capacity);
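  // Layout of that single cache-aligned allocation:
  //   [ __kmp_threads[0 .. capacity-1] | __kmp_root[0 .. capacity-1] | pad ]
  // i.e. __kmp_root simply points just past the end of the __kmp_threads
  // array, and the extra CACHE_LINE bytes absorb any alignment padding.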
6706 
6707   /* init thread counts */
6708   KMP_DEBUG_ASSERT(__kmp_all_nth ==
6709                    0); // Asserts fail if the library is reinitializing and
6710   KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
6711   __kmp_all_nth = 0;
6712   __kmp_nth = 0;
6713 
6714   /* setup the uber master thread and hierarchy */
6715   gtid = __kmp_register_root(TRUE);
6716   KA_TRACE(10, ("__kmp_do_serial_initialize  T#%d\n", gtid));
6717   KMP_ASSERT(KMP_UBER_GTID(gtid));
6718   KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6719 
6720   KMP_MB(); /* Flush all pending memory write invalidates.  */
6721 
6722   __kmp_common_initialize();
6723 
6724 #if KMP_OS_UNIX
6725   /* invoke the child fork handler */
6726   __kmp_register_atfork();
6727 #endif
6728 
6729 #if !KMP_DYNAMIC_LIB
6730   {
6731     /* Invoke the exit handler when the program finishes, only for static
6732        library. For dynamic library, we already have _fini and DllMain. */
6733     int rc = atexit(__kmp_internal_end_atexit);
6734     if (rc != 0) {
6735       __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6736                   __kmp_msg_null);
6737     }
6738   }
6739 #endif
6740 
6741 #if KMP_HANDLE_SIGNALS
6742 #if KMP_OS_UNIX
6743   /* NOTE: make sure that this is called before the user installs their own
     signal handlers so that the user handlers are called first. This way they
6745      can return false, not call our handler, avoid terminating the library, and
6746      continue execution where they left off. */
6747   __kmp_install_signals(FALSE);
6748 #endif /* KMP_OS_UNIX */
6749 #if KMP_OS_WINDOWS
6750   __kmp_install_signals(TRUE);
6751 #endif /* KMP_OS_WINDOWS */
6752 #endif
6753 
6754   /* we have finished the serial initialization */
6755   __kmp_init_counter++;
6756 
6757   __kmp_init_serial = TRUE;
6758 
6759   if (__kmp_settings) {
6760     __kmp_env_print();
6761   }
6762 
6763 #if OMP_40_ENABLED
6764   if (__kmp_display_env || __kmp_display_env_verbose) {
6765     __kmp_env_print_2();
6766   }
6767 #endif // OMP_40_ENABLED
6768 
6769 #if OMPT_SUPPORT
6770   ompt_post_init();
6771 #endif
6772 
6773   KMP_MB();
6774 
6775   KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
6776 }
6777 
6778 void __kmp_serial_initialize(void) {
6779   if (__kmp_init_serial) {
6780     return;
6781   }
6782   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6783   if (__kmp_init_serial) {
6784     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6785     return;
6786   }
6787   __kmp_do_serial_initialize();
6788   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6789 }
6790 
6791 static void __kmp_do_middle_initialize(void) {
6792   int i, j;
6793   int prev_dflt_team_nth;
6794 
6795   if (!__kmp_init_serial) {
6796     __kmp_do_serial_initialize();
6797   }
6798 
6799   KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
6800 
6801   // Save the previous value for the __kmp_dflt_team_nth so that
6802   // we can avoid some reinitialization if it hasn't changed.
6803   prev_dflt_team_nth = __kmp_dflt_team_nth;
6804 
6805 #if KMP_AFFINITY_SUPPORTED
6806   // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6807   // number of cores on the machine.
6808   __kmp_affinity_initialize();
6809 
6810   // Run through the __kmp_threads array and set the affinity mask
6811   // for each root thread that is currently registered with the RTL.
6812   for (i = 0; i < __kmp_threads_capacity; i++) {
6813     if (TCR_PTR(__kmp_threads[i]) != NULL) {
6814       __kmp_affinity_set_init_mask(i, TRUE);
6815     }
6816   }
6817 #endif /* KMP_AFFINITY_SUPPORTED */
6818 
6819   KMP_ASSERT(__kmp_xproc > 0);
6820   if (__kmp_avail_proc == 0) {
6821     __kmp_avail_proc = __kmp_xproc;
6822   }
6823 
  // If there were empty places in the num_threads list (OMP_NUM_THREADS=,,2,3),
  // correct them now; see the worked example after the loop.
6826   j = 0;
6827   while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
6828     __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
6829         __kmp_avail_proc;
6830     j++;
6831   }
6832 
6833   if (__kmp_dflt_team_nth == 0) {
6834 #ifdef KMP_DFLT_NTH_CORES
6835     // Default #threads = #cores
6836     __kmp_dflt_team_nth = __kmp_ncores;
6837     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6838                   "__kmp_ncores (%d)\n",
6839                   __kmp_dflt_team_nth));
6840 #else
6841     // Default #threads = #available OS procs
6842     __kmp_dflt_team_nth = __kmp_avail_proc;
6843     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6844                   "__kmp_avail_proc(%d)\n",
6845                   __kmp_dflt_team_nth));
6846 #endif /* KMP_DFLT_NTH_CORES */
6847   }
6848 
6849   if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
6850     __kmp_dflt_team_nth = KMP_MIN_NTH;
6851   }
6852   if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
6853     __kmp_dflt_team_nth = __kmp_sys_max_nth;
6854   }
6855 
6856   // There's no harm in continuing if the following check fails,
6857   // but it indicates an error in the previous logic.
6858   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
6859 
6860   if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
6861     // Run through the __kmp_threads array and set the num threads icv for each
6862     // root thread that is currently registered with the RTL (which has not
6863     // already explicitly set its nthreads-var with a call to
6864     // omp_set_num_threads()).
6865     for (i = 0; i < __kmp_threads_capacity; i++) {
6866       kmp_info_t *thread = __kmp_threads[i];
6867       if (thread == NULL)
6868         continue;
6869       if (thread->th.th_current_task->td_icvs.nproc != 0)
6870         continue;
6871 
6872       set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
6873     }
6874   }
6875   KA_TRACE(
6876       20,
6877       ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
6878        __kmp_dflt_team_nth));
6879 
6880 #ifdef KMP_ADJUST_BLOCKTIME
6881   /* Adjust blocktime to zero if necessary  now that __kmp_avail_proc is set */
6882   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6883     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6884     if (__kmp_nth > __kmp_avail_proc) {
6885       __kmp_zero_bt = TRUE;
6886     }
6887   }
6888 #endif /* KMP_ADJUST_BLOCKTIME */
6889 
6890   /* we have finished middle initialization */
6891   TCW_SYNC_4(__kmp_init_middle, TRUE);
6892 
6893   KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
6894 }
6895 
6896 void __kmp_middle_initialize(void) {
6897   if (__kmp_init_middle) {
6898     return;
6899   }
6900   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6901   if (__kmp_init_middle) {
6902     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6903     return;
6904   }
6905   __kmp_do_middle_initialize();
6906   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6907 }
6908 
6909 void __kmp_parallel_initialize(void) {
6910   int gtid = __kmp_entry_gtid(); // this might be a new root
6911 
6912   /* synchronize parallel initialization (for sibling) */
6913   if (TCR_4(__kmp_init_parallel))
6914     return;
6915   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6916   if (TCR_4(__kmp_init_parallel)) {
6917     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6918     return;
6919   }
6920 
6921   /* TODO reinitialization after we have already shut down */
6922   if (TCR_4(__kmp_global.g.g_done)) {
6923     KA_TRACE(
6924         10,
6925         ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
6926     __kmp_infinite_loop();
6927   }
6928 
6929   /* jc: The lock __kmp_initz_lock is already held, so calling
6930      __kmp_serial_initialize would cause a deadlock.  So we call
6931      __kmp_do_serial_initialize directly. */
6932   if (!__kmp_init_middle) {
6933     __kmp_do_middle_initialize();
6934   }
6935 
6936 #if OMP_50_ENABLED
6937   __kmp_resume_if_hard_paused();
6938 #endif
6939 
6940   /* begin initialization */
6941   KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
6942   KMP_ASSERT(KMP_UBER_GTID(gtid));
6943 
6944 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6945   // Save the FP control regs.
6946   // Worker threads will set theirs to these values at thread startup.
6947   __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
6948   __kmp_store_mxcsr(&__kmp_init_mxcsr);
6949   __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
6950 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
6951 
6952 #if KMP_OS_UNIX
6953 #if KMP_HANDLE_SIGNALS
6954   /*  must be after __kmp_serial_initialize  */
6955   __kmp_install_signals(TRUE);
6956 #endif
6957 #endif
6958 
6959   __kmp_suspend_initialize();
6960 
6961 #if defined(USE_LOAD_BALANCE)
6962   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6963     __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
6964   }
6965 #else
6966   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6967     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
6968   }
6969 #endif
6970 
6971   if (__kmp_version) {
6972     __kmp_print_version_2();
6973   }
6974 
6975   /* we have finished parallel initialization */
6976   TCW_SYNC_4(__kmp_init_parallel, TRUE);
6977 
6978   KMP_MB();
6979   KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
6980 
6981   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6982 }
6983 
6984 /* ------------------------------------------------------------------------ */
6985 
6986 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
6987                                    kmp_team_t *team) {
6988   kmp_disp_t *dispatch;
6989 
6990   KMP_MB();
6991 
6992   /* none of the threads have encountered any constructs, yet. */
6993   this_thr->th.th_local.this_construct = 0;
6994 #if KMP_CACHE_MANAGE
6995   KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
6996 #endif /* KMP_CACHE_MANAGE */
6997   dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
6998   KMP_DEBUG_ASSERT(dispatch);
6999   KMP_DEBUG_ASSERT(team->t.t_dispatch);
7000   // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7001   // this_thr->th.th_info.ds.ds_tid ] );
7002 
7003   dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7004 #if OMP_45_ENABLED
7005   dispatch->th_doacross_buf_idx =
7006       0; /* reset the doacross dispatch buffer counter */
7007 #endif
7008   if (__kmp_env_consistency_check)
7009     __kmp_push_parallel(gtid, team->t.t_ident);
7010 
7011   KMP_MB(); /* Flush all pending memory write invalidates.  */
7012 }
7013 
7014 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7015                                   kmp_team_t *team) {
7016   if (__kmp_env_consistency_check)
7017     __kmp_pop_parallel(gtid, team->t.t_ident);
7018 
7019   __kmp_finish_implicit_task(this_thr);
7020 }
7021 
7022 int __kmp_invoke_task_func(int gtid) {
7023   int rc;
7024   int tid = __kmp_tid_from_gtid(gtid);
7025   kmp_info_t *this_thr = __kmp_threads[gtid];
7026   kmp_team_t *team = this_thr->th.th_team;
7027 
7028   __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7029 #if USE_ITT_BUILD
7030   if (__itt_stack_caller_create_ptr) {
7031     __kmp_itt_stack_callee_enter(
7032         (__itt_caller)
7033             team->t.t_stack_id); // inform ittnotify about entering user's code
7034   }
7035 #endif /* USE_ITT_BUILD */
7036 #if INCLUDE_SSC_MARKS
7037   SSC_MARK_INVOKING();
7038 #endif
7039 
7040 #if OMPT_SUPPORT
7041   void *dummy;
7042   void **exit_runtime_p;
7043   ompt_data_t *my_task_data;
7044   ompt_data_t *my_parallel_data;
7045   int ompt_team_size;
7046 
7047   if (ompt_enabled.enabled) {
7048     exit_runtime_p = &(
7049         team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_frame.ptr);
7050   } else {
7051     exit_runtime_p = &dummy;
7052   }
7053 
7054   my_task_data =
7055       &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7056   my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7057   if (ompt_enabled.ompt_callback_implicit_task) {
7058     ompt_team_size = team->t.t_nproc;
7059     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7060         ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7061         __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7062     OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7063   }
7064 #endif
7065 
7066   {
7067     KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
7068     KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
7069     rc =
7070         __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7071                                tid, (int)team->t.t_argc, (void **)team->t.t_argv
7072 #if OMPT_SUPPORT
7073                                ,
7074                                exit_runtime_p
7075 #endif
7076                                );
7077 #if OMPT_SUPPORT
7078     *exit_runtime_p = NULL;
7079 #endif
7080   }
7081 
7082 #if USE_ITT_BUILD
7083   if (__itt_stack_caller_create_ptr) {
7084     __kmp_itt_stack_callee_leave(
7085         (__itt_caller)
7086             team->t.t_stack_id); // inform ittnotify about leaving user's code
7087   }
7088 #endif /* USE_ITT_BUILD */
7089   __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7090 
7091   return rc;
7092 }
7093 
7094 #if OMP_40_ENABLED
7095 void __kmp_teams_master(int gtid) {
7096   // This routine is called by all master threads in teams construct
7097   kmp_info_t *thr = __kmp_threads[gtid];
7098   kmp_team_t *team = thr->th.th_team;
7099   ident_t *loc = team->t.t_ident;
7100   thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7101   KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7102   KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7103   KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7104                 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
// Launch the league of teams now, but do not let workers execute
// (they hang on the fork barrier until the next parallel region)
7107 #if INCLUDE_SSC_MARKS
7108   SSC_MARK_FORKING();
7109 #endif
7110   __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7111                   (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7112                   VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7113 #if INCLUDE_SSC_MARKS
7114   SSC_MARK_JOINING();
7115 #endif
7116 
7117   // AC: last parameter "1" eliminates join barrier which won't work because
7118   // worker threads are in a fork barrier waiting for more parallel regions
7119   __kmp_join_call(loc, gtid
7120 #if OMPT_SUPPORT
7121                   ,
7122                   fork_context_intel
7123 #endif
7124                   ,
7125                   1);
7126 }
7127 
7128 int __kmp_invoke_teams_master(int gtid) {
7129   kmp_info_t *this_thr = __kmp_threads[gtid];
7130   kmp_team_t *team = this_thr->th.th_team;
7131 #if KMP_DEBUG
7132   if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7133     KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7134                      (void *)__kmp_teams_master);
7135 #endif
7136   __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7137   __kmp_teams_master(gtid);
7138   __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7139   return 1;
7140 }
7141 #endif /* OMP_40_ENABLED */
7142 
/* This sets the requested number of threads for the next parallel region
   encountered by this team. Since this should be enclosed in the fork/join
   critical section, it should avoid race conditions with asymmetrical nested
   parallelism. */
7147 
7148 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7149   kmp_info_t *thr = __kmp_threads[gtid];
7150 
7151   if (num_threads > 0)
7152     thr->th.th_set_nproc = num_threads;
7153 }
7154 
7155 #if OMP_40_ENABLED
7156 
7157 /* this sets the requested number of teams for the teams region and/or
7158    the number of threads for the next parallel region encountered  */
7159 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7160                           int num_threads) {
7161   kmp_info_t *thr = __kmp_threads[gtid];
7162   KMP_DEBUG_ASSERT(num_teams >= 0);
7163   KMP_DEBUG_ASSERT(num_threads >= 0);
7164 
7165   if (num_teams == 0)
7166     num_teams = 1; // default number of teams is 1.
  if (num_teams > __kmp_teams_max_nth) { // too many teams requested
7168     if (!__kmp_reserve_warn) {
7169       __kmp_reserve_warn = 1;
7170       __kmp_msg(kmp_ms_warning,
7171                 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7172                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7173     }
7174     num_teams = __kmp_teams_max_nth;
7175   }
7176   // Set number of teams (number of threads in the outer "parallel" of the
7177   // teams)
7178   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7179 
7180   // Remember the number of threads for inner parallel regions
7181   if (num_threads == 0) {
7182     if (!TCR_4(__kmp_init_middle))
7183       __kmp_middle_initialize(); // get __kmp_avail_proc calculated
7184     num_threads = __kmp_avail_proc / num_teams;
7185     if (num_teams * num_threads > __kmp_teams_max_nth) {
      // adjust num_threads w/o warning as it is not a user setting
7187       num_threads = __kmp_teams_max_nth / num_teams;
7188     }
7189   } else {
7190     if (num_teams * num_threads > __kmp_teams_max_nth) {
7191       int new_threads = __kmp_teams_max_nth / num_teams;
7192       if (!__kmp_reserve_warn) { // user asked for too many threads
7193         __kmp_reserve_warn = 1; // that conflicts with KMP_TEAMS_THREAD_LIMIT
7194         __kmp_msg(kmp_ms_warning,
7195                   KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7196                   KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7197       }
7198       num_threads = new_threads;
7199     }
7200   }
7201   thr->th.th_teams_size.nth = num_threads;
7202 }
7203 
7204 // Set the proc_bind var to use in the following parallel region.
7205 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7206   kmp_info_t *thr = __kmp_threads[gtid];
7207   thr->th.th_set_proc_bind = proc_bind;
7208 }
7209 
7210 #endif /* OMP_40_ENABLED */
7211 
7212 /* Launch the worker threads into the microtask. */
7213 
7214 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7215   kmp_info_t *this_thr = __kmp_threads[gtid];
7216 
7217 #ifdef KMP_DEBUG
7218   int f;
7219 #endif /* KMP_DEBUG */
7220 
7221   KMP_DEBUG_ASSERT(team);
7222   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7223   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7224   KMP_MB(); /* Flush all pending memory write invalidates.  */
7225 
7226   team->t.t_construct = 0; /* no single directives seen yet */
7227   team->t.t_ordered.dt.t_value =
7228       0; /* thread 0 enters the ordered section first */
7229 
7230   /* Reset the identifiers on the dispatch buffer */
7231   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7232   if (team->t.t_max_nproc > 1) {
7233     int i;
7234     for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7235       team->t.t_disp_buffer[i].buffer_index = i;
7236 #if OMP_45_ENABLED
7237       team->t.t_disp_buffer[i].doacross_buf_idx = i;
7238 #endif
7239     }
7240   } else {
7241     team->t.t_disp_buffer[0].buffer_index = 0;
7242 #if OMP_45_ENABLED
7243     team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7244 #endif
7245   }
7246 
7247   KMP_MB(); /* Flush all pending memory write invalidates.  */
7248   KMP_ASSERT(this_thr->th.th_team == team);
7249 
7250 #ifdef KMP_DEBUG
7251   for (f = 0; f < team->t.t_nproc; f++) {
7252     KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7253                      team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7254   }
7255 #endif /* KMP_DEBUG */
7256 
7257   /* release the worker threads so they may begin working */
7258   __kmp_fork_barrier(gtid, 0);
7259 }
7260 
7261 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7262   kmp_info_t *this_thr = __kmp_threads[gtid];
7263 
7264   KMP_DEBUG_ASSERT(team);
7265   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7266   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7267   KMP_MB(); /* Flush all pending memory write invalidates.  */
7268 
7269 /* Join barrier after fork */
7270 
7271 #ifdef KMP_DEBUG
7272   if (__kmp_threads[gtid] &&
7273       __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7274     __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7275                  __kmp_threads[gtid]);
7276     __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7277                  "team->t.t_nproc=%d\n",
7278                  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7279                  team->t.t_nproc);
7280     __kmp_print_structure();
7281   }
7282   KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7283                    __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7284 #endif /* KMP_DEBUG */
7285 
7286   __kmp_join_barrier(gtid); /* wait for everyone */
7287 #if OMPT_SUPPORT
7288   if (ompt_enabled.enabled &&
7289       this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
7290     int ds_tid = this_thr->th.th_info.ds.ds_tid;
7291     ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7292     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
7293 #if OMPT_OPTIONAL
7294     void *codeptr = NULL;
7295     if (KMP_MASTER_TID(ds_tid) &&
7296         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7297          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7298       codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7299 
7300     if (ompt_enabled.ompt_callback_sync_region_wait) {
7301       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7302           ompt_sync_region_barrier, ompt_scope_end, NULL, task_data, codeptr);
7303     }
7304     if (ompt_enabled.ompt_callback_sync_region) {
7305       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7306           ompt_sync_region_barrier, ompt_scope_end, NULL, task_data, codeptr);
7307     }
7308 #endif
7309     if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7310       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7311           ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7312     }
7313   }
7314 #endif
7315 
7316   KMP_MB(); /* Flush all pending memory write invalidates.  */
7317   KMP_ASSERT(this_thr->th.th_team == team);
7318 }
7319 
7320 /* ------------------------------------------------------------------------ */
7321 
7322 #ifdef USE_LOAD_BALANCE
7323 
// Return the number of worker threads actively spinning in the hot team, if
// we are at the outermost level of parallelism.  Otherwise, return 0.
7326 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7327   int i;
7328   int retval;
7329   kmp_team_t *hot_team;
7330 
7331   if (root->r.r_active) {
7332     return 0;
7333   }
7334   hot_team = root->r.r_hot_team;
7335   if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7336     return hot_team->t.t_nproc - 1; // Don't count master thread
7337   }
7338 
7339   // Skip the master thread - it is accounted for elsewhere.
7340   retval = 0;
7341   for (i = 1; i < hot_team->t.t_nproc; i++) {
7342     if (hot_team->t.t_threads[i]->th.th_active) {
7343       retval++;
7344     }
7345   }
7346   return retval;
7347 }
7348 
7349 // Perform an automatic adjustment to the number of
7350 // threads used by the next parallel region.
7351 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7352   int retval;
7353   int pool_active;
7354   int hot_team_active;
7355   int team_curr_active;
7356   int system_active;
7357 
7358   KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7359                 set_nproc));
7360   KMP_DEBUG_ASSERT(root);
7361   KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7362                        ->th.th_current_task->td_icvs.dynamic == TRUE);
7363   KMP_DEBUG_ASSERT(set_nproc > 1);
7364 
7365   if (set_nproc == 1) {
7366     KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7367     return 1;
7368   }
7369 
7370   // Threads that are active in the thread pool, active in the hot team for this
7371   // particular root (if we are at the outer par level), and the currently
7372   // executing thread (to become the master) are available to add to the new
7373   // team, but are currently contributing to the system load, and must be
7374   // accounted for.
7375   pool_active = __kmp_thread_pool_active_nth;
7376   hot_team_active = __kmp_active_hot_team_nproc(root);
7377   team_curr_active = pool_active + hot_team_active + 1;
7378 
7379   // Check the system load.
7380   system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7381   KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7382                 "hot team active = %d\n",
7383                 system_active, pool_active, hot_team_active));
7384 
7385   if (system_active < 0) {
7386     // There was an error reading the necessary info from /proc, so use the
7387     // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7388     // = dynamic_thread_limit, we shouldn't wind up getting back here.
7389     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7390     KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7391 
7392     // Make this call behave like the thread limit algorithm.
7393     retval = __kmp_avail_proc - __kmp_nth +
7394              (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7395     if (retval > set_nproc) {
7396       retval = set_nproc;
7397     }
7398     if (retval < KMP_MIN_NTH) {
7399       retval = KMP_MIN_NTH;
7400     }
7401 
7402     KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7403                   retval));
7404     return retval;
7405   }
7406 
  // There is a slight delay in the load balance algorithm in detecting new
  // running procs. The real system load at this instant should be at least as
  // large as the number of active OpenMP threads available to add to the team.
7410   if (system_active < team_curr_active) {
7411     system_active = team_curr_active;
7412   }
7413   retval = __kmp_avail_proc - system_active + team_curr_active;
7414   if (retval > set_nproc) {
7415     retval = set_nproc;
7416   }
7417   if (retval < KMP_MIN_NTH) {
7418     retval = KMP_MIN_NTH;
7419   }
7420 
7421   KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7422   return retval;
7423 } // __kmp_load_balance_nproc()
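
/* Worked example of the formula above (numbers are illustrative): with
   __kmp_avail_proc == 8, one active pooled thread, an idle hot team, and the
   master thread itself, team_curr_active == 1 + 0 + 1 == 2.  If
   __kmp_get_load_balance() reports 5 running threads system-wide, then
   retval = 8 - 5 + 2 = 5, which is finally clamped to the range
   [KMP_MIN_NTH, set_nproc]. */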
7424 
7425 #endif /* USE_LOAD_BALANCE */
7426 
7427 /* ------------------------------------------------------------------------ */
7428 
7429 /* NOTE: this is called with the __kmp_init_lock held */
7430 void __kmp_cleanup(void) {
7431   int f;
7432 
7433   KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7434 
7435   if (TCR_4(__kmp_init_parallel)) {
7436 #if KMP_HANDLE_SIGNALS
7437     __kmp_remove_signals();
7438 #endif
7439     TCW_4(__kmp_init_parallel, FALSE);
7440   }
7441 
7442   if (TCR_4(__kmp_init_middle)) {
7443 #if KMP_AFFINITY_SUPPORTED
7444     __kmp_affinity_uninitialize();
7445 #endif /* KMP_AFFINITY_SUPPORTED */
7446     __kmp_cleanup_hierarchy();
7447     TCW_4(__kmp_init_middle, FALSE);
7448   }
7449 
7450   KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7451 
7452   if (__kmp_init_serial) {
7453     __kmp_runtime_destroy();
7454     __kmp_init_serial = FALSE;
7455   }
7456 
7457   __kmp_cleanup_threadprivate_caches();
7458 
7459   for (f = 0; f < __kmp_threads_capacity; f++) {
7460     if (__kmp_root[f] != NULL) {
7461       __kmp_free(__kmp_root[f]);
7462       __kmp_root[f] = NULL;
7463     }
7464   }
7465   __kmp_free(__kmp_threads);
  // __kmp_threads and __kmp_root were allocated as a single block, so there
  // is no need to free __kmp_root separately.
7468   __kmp_threads = NULL;
7469   __kmp_root = NULL;
7470   __kmp_threads_capacity = 0;
7471 
7472 #if KMP_USE_DYNAMIC_LOCK
7473   __kmp_cleanup_indirect_user_locks();
7474 #else
7475   __kmp_cleanup_user_locks();
7476 #endif
7477 
7478 #if KMP_AFFINITY_SUPPORTED
7479   KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7480   __kmp_cpuinfo_file = NULL;
7481 #endif /* KMP_AFFINITY_SUPPORTED */
7482 
7483 #if KMP_USE_ADAPTIVE_LOCKS
7484 #if KMP_DEBUG_ADAPTIVE_LOCKS
7485   __kmp_print_speculative_stats();
7486 #endif
7487 #endif
7488   KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7489   __kmp_nested_nth.nth = NULL;
7490   __kmp_nested_nth.size = 0;
7491   __kmp_nested_nth.used = 0;
7492   KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7493   __kmp_nested_proc_bind.bind_types = NULL;
7494   __kmp_nested_proc_bind.size = 0;
7495   __kmp_nested_proc_bind.used = 0;
7496 #if OMP_50_ENABLED
7497   if (__kmp_affinity_format) {
7498     KMP_INTERNAL_FREE(__kmp_affinity_format);
7499     __kmp_affinity_format = NULL;
7500   }
7501 #endif
7502 
7503   __kmp_i18n_catclose();
7504 
7505 #if KMP_USE_HIER_SCHED
7506   __kmp_hier_scheds.deallocate();
7507 #endif
7508 
7509 #if KMP_STATS_ENABLED
7510   __kmp_stats_fini();
7511 #endif
7512 
7513   KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7514 }
7515 
7516 /* ------------------------------------------------------------------------ */
7517 
7518 int __kmp_ignore_mppbeg(void) {
7519   char *env;
7520 
7521   if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7522     if (__kmp_str_match_false(env))
7523       return FALSE;
7524   }
  // By default __kmpc_begin() is a no-op.
7526   return TRUE;
7527 }
7528 
7529 int __kmp_ignore_mppend(void) {
7530   char *env;
7531 
7532   if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7533     if (__kmp_str_match_false(env))
7534       return FALSE;
7535   }
  // By default __kmpc_end() is a no-op.
7537   return TRUE;
7538 }
7539 
7540 void __kmp_internal_begin(void) {
7541   int gtid;
7542   kmp_root_t *root;
7543 
7544   /* this is a very important step as it will register new sibling threads
7545      and assign these new uber threads a new gtid */
7546   gtid = __kmp_entry_gtid();
7547   root = __kmp_threads[gtid]->th.th_root;
7548   KMP_ASSERT(KMP_UBER_GTID(gtid));
7549 
7550   if (root->r.r_begin)
7551     return;
7552   __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7553   if (root->r.r_begin) {
7554     __kmp_release_lock(&root->r.r_begin_lock, gtid);
7555     return;
7556   }
7557 
7558   root->r.r_begin = TRUE;
7559 
7560   __kmp_release_lock(&root->r.r_begin_lock, gtid);
7561 }
7562 
7563 /* ------------------------------------------------------------------------ */
7564 
7565 void __kmp_user_set_library(enum library_type arg) {
7566   int gtid;
7567   kmp_root_t *root;
7568   kmp_info_t *thread;
7569 
7570   /* first, make sure we are initialized so we can get our gtid */
7571 
7572   gtid = __kmp_entry_gtid();
7573   thread = __kmp_threads[gtid];
7574 
7575   root = thread->th.th_root;
7576 
7577   KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7578                 library_serial));
7579   if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7580                                   thread */
7581     KMP_WARNING(SetLibraryIncorrectCall);
7582     return;
7583   }
7584 
7585   switch (arg) {
7586   case library_serial:
7587     thread->th.th_set_nproc = 0;
7588     set__nproc(thread, 1);
7589     break;
7590   case library_turnaround:
7591     thread->th.th_set_nproc = 0;
7592     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7593                                            : __kmp_dflt_team_nth_ub);
7594     break;
7595   case library_throughput:
7596     thread->th.th_set_nproc = 0;
7597     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7598                                            : __kmp_dflt_team_nth_ub);
7599     break;
7600   default:
7601     KMP_FATAL(UnknownLibraryType, arg);
7602   }
7603 
7604   __kmp_aux_set_library(arg);
7605 }
7606 
7607 void __kmp_aux_set_stacksize(size_t arg) {
7608   if (!__kmp_init_serial)
7609     __kmp_serial_initialize();
7610 
7611 #if KMP_OS_DARWIN
7612   if (arg & (0x1000 - 1)) {
7613     arg &= ~(0x1000 - 1);
7614     if (arg + 0x1000) /* check for overflow if we round up */
7615       arg += 0x1000;
7616   }
7617 #endif
7618   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7619 
7620   /* only change the default stacksize before the first parallel region */
7621   if (!TCR_4(__kmp_init_parallel)) {
7622     size_t value = arg; /* argument is in bytes */
7623 
7624     if (value < __kmp_sys_min_stksize)
7625       value = __kmp_sys_min_stksize;
7626     else if (value > KMP_MAX_STKSIZE)
7627       value = KMP_MAX_STKSIZE;
7628 
7629     __kmp_stksize = value;
7630 
7631     __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
7632   }
7633 
7634   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7635 }
7636 
7637 /* set the behaviour of the runtime library */
7638 /* TODO this can cause some odd behaviour with sibling parallelism... */
7639 void __kmp_aux_set_library(enum library_type arg) {
7640   __kmp_library = arg;
7641 
7642   switch (__kmp_library) {
7643   case library_serial: {
7644     KMP_INFORM(LibraryIsSerial);
7645     (void)__kmp_change_library(TRUE);
7646   } break;
7647   case library_turnaround:
7648     (void)__kmp_change_library(TRUE);
7649     break;
7650   case library_throughput:
7651     (void)__kmp_change_library(FALSE);
7652     break;
7653   default:
7654     KMP_FATAL(UnknownLibraryType, arg);
7655   }
7656 }
7657 
/* Get team information common to all teams-related API routines. */
// Returns NULL if not in a teams construct
7660 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
7661   kmp_info_t *thr = __kmp_entry_thread();
7662   teams_serialized = 0;
7663   if (thr->th.th_teams_microtask) {
7664     kmp_team_t *team = thr->th.th_team;
7665     int tlevel = thr->th.th_teams_level; // the level of the teams construct
7666     int ii = team->t.t_level;
7667     teams_serialized = team->t.t_serialized;
7668     int level = tlevel + 1;
7669     KMP_DEBUG_ASSERT(ii >= tlevel);
7670     while (ii > level) {
7671       for (teams_serialized = team->t.t_serialized;
7672            (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
7673       }
7674       if (team->t.t_serialized && (!teams_serialized)) {
7675         team = team->t.t_parent;
7676         continue;
7677       }
7678       if (ii > level) {
7679         team = team->t.t_parent;
7680         ii--;
7681       }
7682     }
7683     return team;
7684   }
7685   return NULL;
7686 }
7687 
7688 int __kmp_aux_get_team_num() {
7689   int serialized;
7690   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7691   if (team) {
7692     if (serialized > 1) {
7693       return 0; // teams region is serialized ( 1 team of 1 thread ).
7694     } else {
7695       return team->t.t_master_tid;
7696     }
7697   }
7698   return 0;
7699 }
7700 
7701 int __kmp_aux_get_num_teams() {
7702   int serialized;
7703   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7704   if (team) {
7705     if (serialized > 1) {
7706       return 1;
7707     } else {
7708       return team->t.t_parent->t.t_nproc;
7709     }
7710   }
7711   return 1;
7712 }
7713 
7714 /* ------------------------------------------------------------------------ */
7715 
7716 #if OMP_50_ENABLED
7717 /*
7718  * Affinity Format Parser
7719  *
7720  * Field is in form of: %[[[0].]size]type
7721  * % and type are required (%% means print a literal '%')
7722  * type is either single char or long name surrounded by {},
7723  * e.g., N or {num_threads}
7724  * 0 => leading zeros
7725  * . => right justified when size is specified
7726  * by default output is left justified
7727  * size is the *minimum* field length
7728  * All other characters are printed as is
7729  *
7730  * Available field types:
 * t {team_num}          - omp_get_team_num()
 * T {num_teams}         - omp_get_num_teams()
 * L {nesting_level}     - omp_get_level()
 * n {thread_num}        - omp_get_thread_num()
 * N {num_threads}       - omp_get_num_threads()
 * a {ancestor_tnum}     - omp_get_ancestor_thread_num(omp_get_level()-1)
 * H {host}              - name of host machine
 * P {process_id}        - process id (integer)
 * i {native_thread_id}  - native thread identifier (integer)
 * A {thread_affinity}   - comma separated list of integers or integer ranges
 *                         (values of affinity mask)
7740  *
7741  * Implementation-specific field types can be added
7742  * If a type is unknown, print "undefined"
7743 */
7744 
7745 // Structure holding the short name, long name, and corresponding data type
7746 // for snprintf.  A table of these will represent the entire valid keyword
7747 // field types.
7748 typedef struct kmp_affinity_format_field_t {
  char short_name; // from spec, e.g., 'L' -> nesting level
  const char *long_name; // from spec, e.g., nesting_level -> nesting level
7751   char field_format; // data type for snprintf (typically 'd' or 's'
7752   // for integer or string)
7753 } kmp_affinity_format_field_t;
7754 
7755 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
7756 #if KMP_AFFINITY_SUPPORTED
7757     {'A', "thread_affinity", 's'},
7758 #endif
7759     {'t', "team_num", 'd'},
7760     {'T', "num_teams", 'd'},
7761     {'L', "nesting_level", 'd'},
7762     {'n', "thread_num", 'd'},
7763     {'N', "num_threads", 'd'},
7764     {'a', "ancestor_tnum", 'd'},
7765     {'H', "host", 's'},
7766     {'P', "process_id", 'd'},
7767     {'i', "native_thread_id", 'd'}};
7768 
// Return the number of characters it takes to hold the field
7770 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
7771                                             const char **ptr,
7772                                             kmp_str_buf_t *field_buffer) {
7773   int rc, format_index, field_value;
7774   const char *width_left, *width_right;
7775   bool pad_zeros, right_justify, parse_long_name, found_valid_name;
7776   static const int FORMAT_SIZE = 20;
7777   char format[FORMAT_SIZE] = {0};
7778   char absolute_short_name = 0;
7779 
7780   KMP_DEBUG_ASSERT(gtid >= 0);
7781   KMP_DEBUG_ASSERT(th);
7782   KMP_DEBUG_ASSERT(**ptr == '%');
7783   KMP_DEBUG_ASSERT(field_buffer);
7784 
7785   __kmp_str_buf_clear(field_buffer);
7786 
7787   // Skip the initial %
7788   (*ptr)++;
7789 
7790   // Check for %% first
7791   if (**ptr == '%') {
7792     __kmp_str_buf_cat(field_buffer, "%", 1);
7793     (*ptr)++; // skip over the second %
7794     return 1;
7795   }
7796 
7797   // Parse field modifiers if they are present
7798   pad_zeros = false;
7799   if (**ptr == '0') {
7800     pad_zeros = true;
7801     (*ptr)++; // skip over 0
7802   }
7803   right_justify = false;
7804   if (**ptr == '.') {
7805     right_justify = true;
7806     (*ptr)++; // skip over .
7807   }
7808   // Parse width of field: [width_left, width_right)
7809   width_left = width_right = NULL;
7810   if (**ptr >= '0' && **ptr <= '9') {
7811     width_left = *ptr;
7812     SKIP_DIGITS(*ptr);
7813     width_right = *ptr;
7814   }
7815 
7816   // Create the format for KMP_SNPRINTF based on flags parsed above
7817   format_index = 0;
7818   format[format_index++] = '%';
7819   if (!right_justify)
7820     format[format_index++] = '-';
7821   if (pad_zeros)
7822     format[format_index++] = '0';
7823   if (width_left && width_right) {
7824     int i = 0;
    // Only allow widths of up to 8 digits.
    // This also prevents overflowing the format buffer.
7827     while (i < 8 && width_left < width_right) {
7828       format[format_index++] = *width_left;
7829       width_left++;
7830       i++;
7831     }
7832   }
7833 
7834   // Parse a name (long or short)
7835   // Canonicalize the name into absolute_short_name
7836   found_valid_name = false;
7837   parse_long_name = (**ptr == '{');
7838   if (parse_long_name)
7839     (*ptr)++; // skip initial left brace
7840   for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
7841                              sizeof(__kmp_affinity_format_table[0]);
7842        ++i) {
7843     char short_name = __kmp_affinity_format_table[i].short_name;
7844     const char *long_name = __kmp_affinity_format_table[i].long_name;
7845     char field_format = __kmp_affinity_format_table[i].field_format;
7846     if (parse_long_name) {
7847       int length = KMP_STRLEN(long_name);
7848       if (strncmp(*ptr, long_name, length) == 0) {
7849         found_valid_name = true;
7850         (*ptr) += length; // skip the long name
7851       }
7852     } else if (**ptr == short_name) {
7853       found_valid_name = true;
7854       (*ptr)++; // skip the short name
7855     }
7856     if (found_valid_name) {
7857       format[format_index++] = field_format;
7858       format[format_index++] = '\0';
7859       absolute_short_name = short_name;
7860       break;
7861     }
7862   }
7863   if (parse_long_name) {
7864     if (**ptr != '}') {
7865       absolute_short_name = 0;
7866     } else {
7867       (*ptr)++; // skip over the right brace
7868     }
7869   }
7870 
7871   // Attempt to fill the buffer with the requested
7872   // value using snprintf within __kmp_str_buf_print()
7873   switch (absolute_short_name) {
7874   case 't':
7875     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
7876     break;
7877   case 'T':
7878     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
7879     break;
7880   case 'L':
7881     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
7882     break;
7883   case 'n':
7884     rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
7885     break;
7886   case 'H': {
7887     static const int BUFFER_SIZE = 256;
7888     char buf[BUFFER_SIZE];
7889     __kmp_expand_host_name(buf, BUFFER_SIZE);
7890     rc = __kmp_str_buf_print(field_buffer, format, buf);
7891   } break;
7892   case 'P':
7893     rc = __kmp_str_buf_print(field_buffer, format, getpid());
7894     break;
7895   case 'i':
7896     rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
7897     break;
7898   case 'N':
7899     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
7900     break;
7901   case 'a':
7902     field_value =
7903         __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
7904     rc = __kmp_str_buf_print(field_buffer, format, field_value);
7905     break;
7906 #if KMP_AFFINITY_SUPPORTED
7907   case 'A': {
7908     kmp_str_buf_t buf;
7909     __kmp_str_buf_init(&buf);
7910     __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
7911     rc = __kmp_str_buf_print(field_buffer, format, buf.str);
7912     __kmp_str_buf_free(&buf);
7913   } break;
7914 #endif
7915   default:
    // According to the spec, if an implementation does not have info for the
    // field type, then "undefined" is printed
7918     rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
7919     // Skip the field
7920     if (parse_long_name) {
7921       SKIP_TOKEN(*ptr);
7922       if (**ptr == '}')
7923         (*ptr)++;
7924     } else {
7925       (*ptr)++;
7926     }
7927   }
7928 
7929   KMP_ASSERT(format_index <= FORMAT_SIZE);
7930   return rc;
7931 }
7932 
7933 /*
 * Return the number of characters needed to hold the affinity string
 * (not including the terminating null byte).
 * The resulting string is printed to buffer, which the caller can then
 * handle afterwards.
 */
7939 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
7940                                   kmp_str_buf_t *buffer) {
7941   const char *parse_ptr;
7942   size_t retval;
7943   const kmp_info_t *th;
7944   kmp_str_buf_t field;
7945 
7946   KMP_DEBUG_ASSERT(buffer);
7947   KMP_DEBUG_ASSERT(gtid >= 0);
7948 
7949   __kmp_str_buf_init(&field);
7950   __kmp_str_buf_clear(buffer);
7951 
7952   th = __kmp_threads[gtid];
7953   retval = 0;
7954 
7955   // If format is NULL or zero-length string, then we use
7956   // affinity-format-var ICV
7957   parse_ptr = format;
7958   if (parse_ptr == NULL || *parse_ptr == '\0') {
7959     parse_ptr = __kmp_affinity_format;
7960   }
7961   KMP_DEBUG_ASSERT(parse_ptr);
7962 
7963   while (*parse_ptr != '\0') {
7964     // Parse a field
7965     if (*parse_ptr == '%') {
7966       // Put field in the buffer
7967       int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
7968       __kmp_str_buf_catbuf(buffer, &field);
7969       retval += rc;
7970     } else {
7971       // Put literal character in buffer
7972       __kmp_str_buf_cat(buffer, parse_ptr, 1);
7973       retval++;
7974       parse_ptr++;
7975     }
7976   }
7977   __kmp_str_buf_free(&field);
7978   return retval;
7979 }
7980 
7981 // Displays the affinity string to stdout
7982 void __kmp_aux_display_affinity(int gtid, const char *format) {
7983   kmp_str_buf_t buf;
7984   __kmp_str_buf_init(&buf);
7985   __kmp_aux_capture_affinity(gtid, format, &buf);
7986   __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
7987   __kmp_str_buf_free(&buf);
7988 }
7989 #endif // OMP_50_ENABLED
7990 
7991 /* ------------------------------------------------------------------------ */
7992 
7993 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
7994   int blocktime = arg; /* argument is in milliseconds */
7995 #if KMP_USE_MONITOR
7996   int bt_intervals;
7997 #endif
7998   int bt_set;
7999 
8000   __kmp_save_internal_controls(thread);
8001 
8002   /* Normalize and set blocktime for the teams */
8003   if (blocktime < KMP_MIN_BLOCKTIME)
8004     blocktime = KMP_MIN_BLOCKTIME;
8005   else if (blocktime > KMP_MAX_BLOCKTIME)
8006     blocktime = KMP_MAX_BLOCKTIME;
8007 
8008   set__blocktime_team(thread->th.th_team, tid, blocktime);
8009   set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8010 
8011 #if KMP_USE_MONITOR
8012   /* Calculate and set blocktime intervals for the teams */
8013   bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8014 
8015   set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8016   set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8017 #endif
8018 
8019   /* Set whether blocktime has been set to "TRUE" */
8020   bt_set = TRUE;
8021 
8022   set__bt_set_team(thread->th.th_team, tid, bt_set);
8023   set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8024 #if KMP_USE_MONITOR
8025   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8026                 "bt_intervals=%d, monitor_updates=%d\n",
8027                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8028                 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8029                 __kmp_monitor_wakeups));
8030 #else
8031   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8032                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8033                 thread->th.th_team->t.t_id, tid, blocktime));
8034 #endif
8035 }
8036 
8037 void __kmp_aux_set_defaults(char const *str, int len) {
8038   if (!__kmp_init_serial) {
8039     __kmp_serial_initialize();
8040   }
8041   __kmp_env_initialize(str);
8042 
8043   if (__kmp_settings
8044 #if OMP_40_ENABLED
8045       || __kmp_display_env || __kmp_display_env_verbose
8046 #endif // OMP_40_ENABLED
8047       ) {
8048     __kmp_env_print();
8049   }
8050 } // __kmp_aux_set_defaults
8051 
8052 /* ------------------------------------------------------------------------ */
8053 /* internal fast reduction routines */
8054 
8055 PACKED_REDUCTION_METHOD_T
8056 __kmp_determine_reduction_method(
8057     ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8058     void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8059     kmp_critical_name *lck) {
8060 
  // Default reduction method: critical construct ( lck != NULL, like in
  // current PAROPT )
  // If ( reduce_data != NULL && reduce_func != NULL ): the tree-reduction
  // method can be selected by the RTL
  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
  // can be selected by the RTL
  // Finally, it's up to the OpenMP RTL to decide which method to select among
  // those generated by PAROPT.
8069 
8070   PACKED_REDUCTION_METHOD_T retval;
8071 
8072   int team_size;
8073 
8074   KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
8075   KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8076 
8077 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
8078   ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
8079 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8080 
8081   retval = critical_reduce_block;
8082 
  // another way of getting the team size (with one dynamic dereference) is
  // slower
8084   team_size = __kmp_get_team_num_threads(global_tid);
8085   if (team_size == 1) {
8086 
8087     retval = empty_reduce_block;
8088 
8089   } else {
8090 
8091     int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8092 
8093 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64
8094 
8095 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
8096     KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8097 
8098     int teamsize_cutoff = 4;
8099 
8100 #if KMP_MIC_SUPPORTED
8101     if (__kmp_mic_type != non_mic) {
8102       teamsize_cutoff = 8;
8103     }
8104 #endif
8105     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8106     if (tree_available) {
8107       if (team_size <= teamsize_cutoff) {
8108         if (atomic_available) {
8109           retval = atomic_reduce_block;
8110         }
8111       } else {
8112         retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8113       }
8114     } else if (atomic_available) {
8115       retval = atomic_reduce_block;
8116     }
8117 #else
8118 #error "Unknown or unsupported OS"
8119 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8120        // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8121 
8122 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8123 
8124 #if KMP_OS_LINUX || KMP_OS_WINDOWS || KMP_OS_HURD
8125 
8126     // basic tuning
8127 
8128     if (atomic_available) {
8129       if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8130         retval = atomic_reduce_block;
8131       }
8132     } // otherwise: use critical section
8133 
8134 #elif KMP_OS_DARWIN
8135 
8136     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8137     if (atomic_available && (num_vars <= 3)) {
8138       retval = atomic_reduce_block;
8139     } else if (tree_available) {
8140       if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8141           (reduce_size < (2000 * sizeof(kmp_real64)))) {
8142         retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8143       }
8144     } // otherwise: use critical section
8145 
8146 #else
8147 #error "Unknown or unsupported OS"
8148 #endif
8149 
8150 #else
8151 #error "Unknown or unsupported architecture"
8152 #endif
8153   }
8154 
8155   // KMP_FORCE_REDUCTION
8156 
8157   // If the team is serialized (team_size == 1), ignore the forced reduction
8158   // method and stay with the unsynchronized method (empty_reduce_block)
8159   if (__kmp_force_reduction_method != reduction_method_not_defined &&
8160       team_size != 1) {
8161 
8162     PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8163 
8164     int atomic_available, tree_available;
8165 
8166     switch ((forced_retval = __kmp_force_reduction_method)) {
8167     case critical_reduce_block:
8168       KMP_ASSERT(lck); // lck should be != 0
8169       break;
8170 
8171     case atomic_reduce_block:
8172       atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8173       if (!atomic_available) {
8174         KMP_WARNING(RedMethodNotSupported, "atomic");
8175         forced_retval = critical_reduce_block;
8176       }
8177       break;
8178 
8179     case tree_reduce_block:
8180       tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8181       if (!tree_available) {
8182         KMP_WARNING(RedMethodNotSupported, "tree");
8183         forced_retval = critical_reduce_block;
8184       } else {
8185 #if KMP_FAST_REDUCTION_BARRIER
8186         forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8187 #endif
8188       }
8189       break;
8190 
8191     default:
8192       KMP_ASSERT(0); // "unsupported method specified"
8193     }
8194 
8195     retval = forced_retval;
8196   }
8197 
8198   KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8199 
8200 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
8201 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8202 
8203   return (retval);
8204 }
8205 
8206 // this function is for testing set/get/determine reduce method
8207 kmp_int32 __kmp_get_reduce_method(void) {
8208   return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8209 }
8210 
8211 #if OMP_50_ENABLED
8212 
8213 // Soft pause sets up threads to ignore blocktime and just go to sleep.
8214 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
8215 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8216 
8217 // Hard pause shuts down the runtime completely.  Resume happens naturally when
8218 // OpenMP is used subsequently.
8219 void __kmp_hard_pause() {
8220   __kmp_pause_status = kmp_hard_paused;
8221   __kmp_internal_end_thread(-1);
8222 }
8223 
8224 // Soft resume sets __kmp_pause_status, and wakes up all threads.
8225 void __kmp_resume_if_soft_paused() {
8226   if (__kmp_pause_status == kmp_soft_paused) {
8227     __kmp_pause_status = kmp_not_paused;
8228 
8229     for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8230       kmp_info_t *thread = __kmp_threads[gtid];
8231       if (thread) { // Wake it if sleeping
8232         kmp_flag_64 fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
8233         if (fl.is_sleeping())
8234           fl.resume(gtid);
8235         else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8236           __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8237         } else { // thread holds the lock and may sleep soon
8238           do { // until either the thread sleeps, or we can get the lock
8239             if (fl.is_sleeping()) {
8240               fl.resume(gtid);
8241               break;
8242             } else if (__kmp_try_suspend_mx(thread)) {
8243               __kmp_unlock_suspend_mx(thread);
8244               break;
8245             }
8246           } while (1);
8247         }
8248       }
8249     }
8250   }
8251 }
8252 
8253 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
8254 // TODO: add warning messages
8255 int __kmp_pause_resource(kmp_pause_status_t level) {
8256   if (level == kmp_not_paused) { // requesting resume
8257     if (__kmp_pause_status == kmp_not_paused) {
8258       // error message about runtime not being paused, so can't resume
8259       return 1;
8260     } else {
8261       KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
8262                        __kmp_pause_status == kmp_hard_paused);
8263       __kmp_pause_status = kmp_not_paused;
8264       return 0;
8265     }
8266   } else if (level == kmp_soft_paused) { // requesting soft pause
8267     if (__kmp_pause_status != kmp_not_paused) {
8268       // error message about already being paused
8269       return 1;
8270     } else {
8271       __kmp_soft_pause();
8272       return 0;
8273     }
8274   } else if (level == kmp_hard_paused) { // requesting hard pause
8275     if (__kmp_pause_status != kmp_not_paused) {
8276       // error message about already being paused
8277       return 1;
8278     } else {
8279       __kmp_hard_pause();
8280       return 0;
8281     }
8282   } else {
8283     // error message about invalid level
8284     return 1;
8285   }
8286 }
8287 
8288 #endif // OMP_50_ENABLED
8289