1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 //                     The LLVM Compiler Infrastructure
8 //
9 // This file is dual licensed under the MIT and the University of Illinois Open
10 // Source Licenses. See LICENSE.txt for details.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "kmp.h"
15 #include "kmp_affinity.h"
16 #include "kmp_atomic.h"
17 #include "kmp_environment.h"
18 #include "kmp_error.h"
19 #include "kmp_i18n.h"
20 #include "kmp_io.h"
21 #include "kmp_itt.h"
22 #include "kmp_settings.h"
23 #include "kmp_stats.h"
24 #include "kmp_str.h"
25 #include "kmp_wait_release.h"
26 #include "kmp_wrapper_getpid.h"
27 
28 #if OMPT_SUPPORT
29 #include "ompt-specific.h"
30 #endif
31 
32 /* these are temporary issues to be dealt with */
33 #define KMP_USE_PRCTL 0
34 
35 #if KMP_OS_WINDOWS
36 #include <process.h>
37 #endif
38 
39 #include "tsan_annotations.h"
40 
41 #if defined(KMP_GOMP_COMPAT)
42 char const __kmp_version_alt_comp[] =
43     KMP_VERSION_PREFIX "alternative compiler support: yes";
44 #endif /* defined(KMP_GOMP_COMPAT) */
45 
46 char const __kmp_version_omp_api[] = KMP_VERSION_PREFIX "API version: "
47 #if OMP_50_ENABLED
48                                                         "5.0 (201611)";
49 #elif OMP_45_ENABLED
50                                                         "4.5 (201511)";
51 #elif OMP_40_ENABLED
52                                                         "4.0 (201307)";
53 #else
54                                                         "3.1 (201107)";
55 #endif
56 
57 #ifdef KMP_DEBUG
58 char const __kmp_version_lock[] =
59     KMP_VERSION_PREFIX "lock type: run time selectable";
60 #endif /* KMP_DEBUG */
61 
62 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
63 
64 /* ------------------------------------------------------------------------ */
65 
66 kmp_info_t __kmp_monitor;
67 
68 /* Forward declarations */
69 
70 void __kmp_cleanup(void);
71 
72 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
73                                   int gtid);
74 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
75                                   kmp_internal_control_t *new_icvs,
76                                   ident_t *loc);
77 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
78 static void __kmp_partition_places(kmp_team_t *team,
79                                    int update_master_only = 0);
80 #endif
81 static void __kmp_do_serial_initialize(void);
82 void __kmp_fork_barrier(int gtid, int tid);
83 void __kmp_join_barrier(int gtid);
84 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
85                           kmp_internal_control_t *new_icvs, ident_t *loc);
86 
87 #ifdef USE_LOAD_BALANCE
88 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
89 #endif
90 
91 static int __kmp_expand_threads(int nWish, int nNeed);
92 #if KMP_OS_WINDOWS
93 static int __kmp_unregister_root_other_thread(int gtid);
94 #endif
95 static void __kmp_unregister_library(void); // called by __kmp_internal_end()
96 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
97 static kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
98 
99 /* Calculate the identifier of the current thread */
100 /* fast (and somewhat portable) way to get unique identifier of executing
101    thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
102 int __kmp_get_global_thread_id() {
103   int i;
104   kmp_info_t **other_threads;
105   size_t stack_data;
106   char *stack_addr;
107   size_t stack_size;
108   char *stack_base;
109 
110   KA_TRACE(
111       1000,
112       ("*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
113        __kmp_nth, __kmp_all_nth));
114 
  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
     a parallel region, this returns KMP_GTID_DNE to force serial_initialize by
     the caller. Every call site must either handle KMP_GTID_DNE or guarantee
     __kmp_init_gtid for this to work. */
119 
120   if (!TCR_4(__kmp_init_gtid))
121     return KMP_GTID_DNE;
122 
123 #ifdef KMP_TDATA_GTID
124   if (TCR_4(__kmp_gtid_mode) >= 3) {
125     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
126     return __kmp_gtid;
127   }
128 #endif
129   if (TCR_4(__kmp_gtid_mode) >= 2) {
130     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
131     return __kmp_gtid_get_specific();
132   }
133   KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
134 
135   stack_addr = (char *)&stack_data;
136   other_threads = __kmp_threads;
137 
138   /* ATT: The code below is a source of potential bugs due to unsynchronized
139      access to __kmp_threads array. For example:
140      1. Current thread loads other_threads[i] to thr and checks it, it is
141         non-NULL.
142      2. Current thread is suspended by OS.
143      3. Another thread unregisters and finishes (debug versions of free()
144         may fill memory with something like 0xEF).
145      4. Current thread is resumed.
146      5. Current thread reads junk from *thr.
147      TODO: Fix it.  --ln  */
148 
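  /* Search the registered threads for the one whose recorded stack range
     contains the address of a local variable; only the executing thread's
     stack can contain it. */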
149   for (i = 0; i < __kmp_threads_capacity; i++) {
150 
151     kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
152     if (!thr)
153       continue;
154 
155     stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
156     stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
157 
158     /* stack grows down -- search through all of the active threads */
159 
160     if (stack_addr <= stack_base) {
161       size_t stack_diff = stack_base - stack_addr;
162 
163       if (stack_diff <= stack_size) {
164         /* The only way we can be closer than the allocated */
165         /* stack size is if we are running on this thread. */
166         KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
167         return i;
168       }
169     }
170   }
171 
172   /* get specific to try and determine our gtid */
173   KA_TRACE(1000,
174            ("*** __kmp_get_global_thread_id: internal alg. failed to find "
175             "thread, using TLS\n"));
176   i = __kmp_gtid_get_specific();
177 
178   /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */
179 
  /* if we haven't been assigned a gtid, return the error value */
181   if (i < 0)
182     return i;
183 
184   /* dynamically updated stack window for uber threads to avoid get_specific
185      call */
186   if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
187     KMP_FATAL(StackOverflow, i);
188   }
189 
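  /* Widen the recorded stack window to cover the current stack address: if the
     address lies above the recorded base, raise the base and grow the size by
     the difference; otherwise extend the size so that the window reaches down
     to the current address. */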
190   stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
191   if (stack_addr > stack_base) {
192     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
193     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
194             other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
195                 stack_base);
196   } else {
197     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
198             stack_base - stack_addr);
199   }
200 
201   /* Reprint stack bounds for ubermaster since they have been refined */
202   if (__kmp_storage_map) {
203     char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
204     char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
205     __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
206                                  other_threads[i]->th.th_info.ds.ds_stacksize,
207                                  "th_%d stack (refinement)", i);
208   }
209   return i;
210 }
211 
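/* Like __kmp_get_global_thread_id(), but if the calling thread has not been
   assigned a gtid yet, register it as a new root (performing serial
   initialization first if needed) so that a valid gtid is always returned. */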
212 int __kmp_get_global_thread_id_reg() {
213   int gtid;
214 
215   if (!__kmp_init_serial) {
216     gtid = KMP_GTID_DNE;
217   } else
218 #ifdef KMP_TDATA_GTID
219       if (TCR_4(__kmp_gtid_mode) >= 3) {
220     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
221     gtid = __kmp_gtid;
222   } else
223 #endif
224       if (TCR_4(__kmp_gtid_mode) >= 2) {
225     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
226     gtid = __kmp_gtid_get_specific();
227   } else {
228     KA_TRACE(1000,
229              ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
230     gtid = __kmp_get_global_thread_id();
231   }
232 
233   /* we must be a new uber master sibling thread */
234   if (gtid == KMP_GTID_DNE) {
235     KA_TRACE(10,
236              ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
237               "Registering a new gtid.\n"));
238     __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
239     if (!__kmp_init_serial) {
240       __kmp_do_serial_initialize();
241       gtid = __kmp_gtid_get_specific();
242     } else {
243       gtid = __kmp_register_root(FALSE);
244     }
245     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
246     /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
247   }
248 
249   KMP_DEBUG_ASSERT(gtid >= 0);
250 
251   return gtid;
252 }
253 
254 /* caller must hold forkjoin_lock */
255 void __kmp_check_stack_overlap(kmp_info_t *th) {
256   int f;
257   char *stack_beg = NULL;
258   char *stack_end = NULL;
259   int gtid;
260 
261   KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
262   if (__kmp_storage_map) {
263     stack_end = (char *)th->th.th_info.ds.ds_stackbase;
264     stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
265 
266     gtid = __kmp_gtid_from_thread(th);
267 
268     if (gtid == KMP_GTID_MONITOR) {
269       __kmp_print_storage_map_gtid(
270           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
271           "th_%s stack (%s)", "mon",
272           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
273     } else {
274       __kmp_print_storage_map_gtid(
275           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
276           "th_%d stack (%s)", gtid,
277           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
278     }
279   }
280 
281   /* No point in checking ubermaster threads since they use refinement and
282    * cannot overlap */
283   gtid = __kmp_gtid_from_thread(th);
284   if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
285     KA_TRACE(10,
286              ("__kmp_check_stack_overlap: performing extensive checking\n"));
287     if (stack_beg == NULL) {
288       stack_end = (char *)th->th.th_info.ds.ds_stackbase;
289       stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
290     }
291 
292     for (f = 0; f < __kmp_threads_capacity; f++) {
293       kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
294 
295       if (f_th && f_th != th) {
296         char *other_stack_end =
297             (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
298         char *other_stack_beg =
299             other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
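        // The stacks overlap if either boundary of this thread's stack falls
        // strictly inside the other thread's stack range.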
300         if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
301             (stack_end > other_stack_beg && stack_end < other_stack_end)) {
302 
303           /* Print the other stack values before the abort */
304           if (__kmp_storage_map)
305             __kmp_print_storage_map_gtid(
306                 -1, other_stack_beg, other_stack_end,
307                 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
308                 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
309 
310           __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
311                       __kmp_msg_null);
312         }
313       }
314     }
315   }
316   KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
317 }
318 
319 /* ------------------------------------------------------------------------ */
320 
321 void __kmp_infinite_loop(void) {
322   static int done = FALSE;
323 
324   while (!done) {
325     KMP_YIELD(1);
326   }
327 }
328 
329 #define MAX_MESSAGE 512
330 
331 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
332                                   char const *format, ...) {
333   char buffer[MAX_MESSAGE];
334   va_list ap;
335 
336   va_start(ap, format);
337   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
338                p2, (unsigned long)size, format);
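  // The caller's format string was embedded into the composite buffer via the
  // %s above; the caller's arguments (ap) are expanded when that buffer is
  // used as the format string for __kmp_vprintf() below.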
339   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
340   __kmp_vprintf(kmp_err, buffer, ap);
341 #if KMP_PRINT_DATA_PLACEMENT
342   int node;
343   if (gtid >= 0) {
344     if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
345       if (__kmp_storage_map_verbose) {
346         node = __kmp_get_host_node(p1);
347         if (node < 0) /* doesn't work, so don't try this next time */
348           __kmp_storage_map_verbose = FALSE;
349         else {
350           char *last;
351           int lastNode;
352           int localProc = __kmp_get_cpu_from_gtid(gtid);
353 
354           const int page_size = KMP_GET_PAGE_SIZE();
355 
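          // Round p1 down to the start of its page and p2 down to the start of
          // the last page in the [p1, p2) range, so the node queries below
          // operate on whole pages.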
356           p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
357           p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
358           if (localProc >= 0)
359             __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
360                                  localProc >> 1);
361           else
362             __kmp_printf_no_lock("  GTID %d\n", gtid);
363 #if KMP_USE_PRCTL
364           /* The more elaborate format is disabled for now because of the prctl
365            * hanging bug. */
366           do {
            last = (char *)p1;
368             lastNode = node;
369             /* This loop collates adjacent pages with the same host node. */
370             do {
              p1 = (char *)p1 + page_size;
372             } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
373             __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
374                                  lastNode);
375           } while (p1 <= p2);
376 #else
377           __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
378                                (char *)p1 + (page_size - 1),
379                                __kmp_get_host_node(p1));
380           if (p1 < p2) {
381             __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
382                                  (char *)p2 + (page_size - 1),
383                                  __kmp_get_host_node(p2));
384           }
385 #endif
386         }
387       }
388     } else
389       __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
390   }
391 #endif /* KMP_PRINT_DATA_PLACEMENT */
392   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
393 }
394 
395 void __kmp_warn(char const *format, ...) {
396   char buffer[MAX_MESSAGE];
397   va_list ap;
398 
399   if (__kmp_generate_warnings == kmp_warnings_off) {
400     return;
401   }
402 
403   va_start(ap, format);
404 
405   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
406   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
407   __kmp_vprintf(kmp_err, buffer, ap);
408   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
409 
410   va_end(ap);
411 }
412 
413 void __kmp_abort_process() {
414   // Later threads may stall here, but that's ok because abort() will kill them.
415   __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
416 
417   if (__kmp_debug_buf) {
418     __kmp_dump_debug_buffer();
419   }
420 
421   if (KMP_OS_WINDOWS) {
422     // Let other threads know of abnormal termination and prevent deadlock
423     // if abort happened during library initialization or shutdown
424     __kmp_global.g.g_abort = SIGABRT;
425 
    /* On Windows* OS, abort() by default causes a pop-up error box, which
       stalls nightly testing. Unfortunately, we cannot reliably suppress the
       pop-up error boxes. _set_abort_behavior() works well, but this function
       is not available in VS7 (this is not a problem for the DLL, but it is a
       problem for the static OpenMP RTL). SetErrorMode (and so the timelimit
       utility) does not help, at least in some versions of MS C RTL.

       The following sequence seems to be the only way to simulate abort() and
       avoid the pop-up error box. */
435     raise(SIGABRT);
436     _exit(3); // Just in case, if signal ignored, exit anyway.
437   } else {
438     abort();
439   }
440 
441   __kmp_infinite_loop();
442   __kmp_release_bootstrap_lock(&__kmp_exit_lock);
443 
444 } // __kmp_abort_process
445 
446 void __kmp_abort_thread(void) {
447   // TODO: Eliminate g_abort global variable and this function.
448   // In case of abort just call abort(), it will kill all the threads.
449   __kmp_infinite_loop();
450 } // __kmp_abort_thread
451 
452 /* Print out the storage map for the major kmp_info_t thread data structures
453    that are allocated together. */
454 
455 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
456   __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
457                                gtid);
458 
459   __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
460                                sizeof(kmp_desc_t), "th_%d.th_info", gtid);
461 
462   __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
463                                sizeof(kmp_local_t), "th_%d.th_local", gtid);
464 
465   __kmp_print_storage_map_gtid(
466       gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
467       sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
468 
469   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
470                                &thr->th.th_bar[bs_plain_barrier + 1],
471                                sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
472                                gtid);
473 
474   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
475                                &thr->th.th_bar[bs_forkjoin_barrier + 1],
476                                sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
477                                gtid);
478 
479 #if KMP_FAST_REDUCTION_BARRIER
480   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
481                                &thr->th.th_bar[bs_reduction_barrier + 1],
482                                sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
483                                gtid);
484 #endif // KMP_FAST_REDUCTION_BARRIER
485 }
486 
487 /* Print out the storage map for the major kmp_team_t team data structures
488    that are allocated together. */
489 
490 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
491                                          int team_id, int num_thr) {
492   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
493   __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
494                                header, team_id);
495 
496   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
497                                &team->t.t_bar[bs_last_barrier],
498                                sizeof(kmp_balign_team_t) * bs_last_barrier,
499                                "%s_%d.t_bar", header, team_id);
500 
501   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
502                                &team->t.t_bar[bs_plain_barrier + 1],
503                                sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
504                                header, team_id);
505 
506   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
507                                &team->t.t_bar[bs_forkjoin_barrier + 1],
508                                sizeof(kmp_balign_team_t),
509                                "%s_%d.t_bar[forkjoin]", header, team_id);
510 
511 #if KMP_FAST_REDUCTION_BARRIER
512   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
513                                &team->t.t_bar[bs_reduction_barrier + 1],
514                                sizeof(kmp_balign_team_t),
515                                "%s_%d.t_bar[reduction]", header, team_id);
516 #endif // KMP_FAST_REDUCTION_BARRIER
517 
518   __kmp_print_storage_map_gtid(
519       -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
520       sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
521 
522   __kmp_print_storage_map_gtid(
523       -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
524       sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
525 
526   __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
527                                &team->t.t_disp_buffer[num_disp_buff],
528                                sizeof(dispatch_shared_info_t) * num_disp_buff,
529                                "%s_%d.t_disp_buffer", header, team_id);
530 
531   __kmp_print_storage_map_gtid(-1, &team->t.t_taskq, &team->t.t_copypriv_data,
532                                sizeof(kmp_taskq_t), "%s_%d.t_taskq", header,
533                                team_id);
534 }
535 
536 static void __kmp_init_allocator() {}
537 static void __kmp_fini_allocator() {}
538 
539 /* ------------------------------------------------------------------------ */
540 
541 #ifdef KMP_DYNAMIC_LIB
542 #if KMP_OS_WINDOWS
543 
544 static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
545   // TODO: Change to __kmp_break_bootstrap_lock().
546   __kmp_init_bootstrap_lock(lck); // make the lock released
547 }
548 
549 static void __kmp_reset_locks_on_process_detach(int gtid_req) {
550   int i;
551   int thread_count;
552 
  // PROCESS_DETACH is expected to be called by a thread that executes
  // ProcessExit() or FreeLibrary(). The OS terminates the other threads (except
  // the one calling ProcessExit or FreeLibrary), so it might seem safe to
  // access __kmp_threads[] without taking the forkjoin_lock. In fact, however,
  // some threads may still be alive here, even though they are about to be
  // terminated. The entries in the array with ds_thread==0 are the most
  // suspicious, so accessing __kmp_threads[] may not be safe after all.
560 
561   // TODO: does it make sense to check __kmp_roots[] ?
562 
563   // Let's check that there are no other alive threads registered with the OMP
564   // lib.
565   while (1) {
566     thread_count = 0;
567     for (i = 0; i < __kmp_threads_capacity; ++i) {
568       if (!__kmp_threads)
569         continue;
570       kmp_info_t *th = __kmp_threads[i];
571       if (th == NULL)
572         continue;
573       int gtid = th->th.th_info.ds.ds_gtid;
574       if (gtid == gtid_req)
575         continue;
576       if (gtid < 0)
577         continue;
578       DWORD exit_val;
579       int alive = __kmp_is_thread_alive(th, &exit_val);
580       if (alive) {
581         ++thread_count;
582       }
583     }
584     if (thread_count == 0)
585       break; // success
586   }
587 
588   // Assume that I'm alone. Now it might be safe to check and reset locks.
589   // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
590   __kmp_reset_lock(&__kmp_forkjoin_lock);
591 #ifdef KMP_DEBUG
592   __kmp_reset_lock(&__kmp_stdio_lock);
593 #endif // KMP_DEBUG
594 }
595 
596 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
597   //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
598 
599   switch (fdwReason) {
600 
601   case DLL_PROCESS_ATTACH:
602     KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
603 
604     return TRUE;
605 
606   case DLL_PROCESS_DETACH:
607     KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
608 
609     if (lpReserved != NULL) {
610       // lpReserved is used for telling the difference:
611       //   lpReserved == NULL when FreeLibrary() was called,
612       //   lpReserved != NULL when the process terminates.
613       // When FreeLibrary() is called, worker threads remain alive. So they will
614       // release the forkjoin lock by themselves. When the process terminates,
615       // worker threads disappear triggering the problem of unreleased forkjoin
616       // lock as described below.
617 
618       // A worker thread can take the forkjoin lock. The problem comes up if
619       // that worker thread becomes dead before it releases the forkjoin lock.
620       // The forkjoin lock remains taken, while the thread executing
621       // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try
622       // to take the forkjoin lock and will always fail, so that the application
623       // will never finish [normally]. This scenario is possible if
      // __kmpc_end() has not been executed. This is not a corner case; these
      // are common situations:
626       // - the main function was compiled by an alternative compiler;
627       // - the main function was compiled by icl but without /Qopenmp
628       //   (application with plugins);
629       // - application terminates by calling C exit(), Fortran CALL EXIT() or
630       //   Fortran STOP.
      // - an alive foreign thread prevented __kmpc_end from doing cleanup.
632       //
633       // This is a hack to work around the problem.
634       // TODO: !!! figure out something better.
635       __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
636     }
637 
638     __kmp_internal_end_library(__kmp_gtid_get_specific());
639 
640     return TRUE;
641 
642   case DLL_THREAD_ATTACH:
643     KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
644 
    /* if we want to register new sibling threads every time, call
     * __kmp_get_gtid() here */
647     return TRUE;
648 
649   case DLL_THREAD_DETACH:
650     KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
651 
652     __kmp_internal_end_thread(__kmp_gtid_get_specific());
653     return TRUE;
654   }
655 
656   return TRUE;
657 }
658 
659 #endif /* KMP_OS_WINDOWS */
660 #endif /* KMP_DYNAMIC_LIB */
661 
662 /* Change the library type to "status" and return the old type */
663 /* called from within initialization routines where __kmp_initz_lock is held */
664 int __kmp_change_library(int status) {
665   int old_status;
666 
667   old_status = __kmp_yield_init &
668                1; // check whether KMP_LIBRARY=throughput (even init count)
669 
670   if (status) {
671     __kmp_yield_init |= 1; // throughput => turnaround (odd init count)
672   } else {
673     __kmp_yield_init &= ~1; // turnaround => throughput (even init count)
674   }
675 
676   return old_status; // return previous setting of whether
677   // KMP_LIBRARY=throughput
678 }
679 
680 /* __kmp_parallel_deo -- Wait until it's our turn. */
681 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
682   int gtid = *gtid_ref;
683 #ifdef BUILD_PARALLEL_ORDERED
684   kmp_team_t *team = __kmp_team_from_gtid(gtid);
685 #endif /* BUILD_PARALLEL_ORDERED */
686 
687   if (__kmp_env_consistency_check) {
688     if (__kmp_threads[gtid]->th.th_root->r.r_active)
689 #if KMP_USE_DYNAMIC_LOCK
690       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
691 #else
692       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
693 #endif
694   }
695 #ifdef BUILD_PARALLEL_ORDERED
696   if (!team->t.t_serialized) {
697     KMP_MB();
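    /* Spin until t_ordered.dt.t_value equals this thread's tid, i.e. until the
       previous thread in the team hands the ordered section over via
       __kmp_parallel_dxo(). */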
698     KMP_WAIT_YIELD(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid),
699                    KMP_EQ, NULL);
700     KMP_MB();
701   }
702 #endif /* BUILD_PARALLEL_ORDERED */
703 }
704 
705 /* __kmp_parallel_dxo -- Signal the next task. */
706 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
707   int gtid = *gtid_ref;
708 #ifdef BUILD_PARALLEL_ORDERED
709   int tid = __kmp_tid_from_gtid(gtid);
710   kmp_team_t *team = __kmp_team_from_gtid(gtid);
711 #endif /* BUILD_PARALLEL_ORDERED */
712 
713   if (__kmp_env_consistency_check) {
714     if (__kmp_threads[gtid]->th.th_root->r.r_active)
715       __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
716   }
717 #ifdef BUILD_PARALLEL_ORDERED
718   if (!team->t.t_serialized) {
719     KMP_MB(); /* Flush all pending memory write invalidates.  */
720 
721     /* use the tid of the next thread in this team */
722     /* TODO replace with general release procedure */
723     team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
724 
725     KMP_MB(); /* Flush all pending memory write invalidates.  */
726   }
727 #endif /* BUILD_PARALLEL_ORDERED */
728 }
729 
730 /* ------------------------------------------------------------------------ */
731 /* The BARRIER for a SINGLE process section is always explicit   */
732 
733 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
734   int status;
735   kmp_info_t *th;
736   kmp_team_t *team;
737 
738   if (!TCR_4(__kmp_init_parallel))
739     __kmp_parallel_initialize();
740 
741   th = __kmp_threads[gtid];
742   team = th->th.th_team;
743   status = 0;
744 
745   th->th.th_ident = id_ref;
746 
747   if (team->t.t_serialized) {
748     status = 1;
749   } else {
750     kmp_int32 old_this = th->th.th_local.this_construct;
751 
752     ++th->th.th_local.this_construct;
753     /* try to set team count to thread count--success means thread got the
754        single block */
755     /* TODO: Should this be acquire or release? */
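    /* The thread whose compare-and-swap advances t_construct from the previous
       construct count to its own this_construct value claims the single block;
       for all other threads the CAS fails (or is skipped) and status stays 0. */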
756     if (team->t.t_construct == old_this) {
757       status = KMP_COMPARE_AND_STORE_ACQ32(&team->t.t_construct, old_this,
758                                            th->th.th_local.this_construct);
759     }
760 #if USE_ITT_BUILD
761     if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
762         KMP_MASTER_GTID(gtid) &&
763 #if OMP_40_ENABLED
764         th->th.th_teams_microtask == NULL &&
765 #endif
766         team->t.t_active_level ==
767             1) { // Only report metadata by master of active team at level 1
768       __kmp_itt_metadata_single(id_ref);
769     }
770 #endif /* USE_ITT_BUILD */
771   }
772 
773   if (__kmp_env_consistency_check) {
774     if (status && push_ws) {
775       __kmp_push_workshare(gtid, ct_psingle, id_ref);
776     } else {
777       __kmp_check_workshare(gtid, ct_psingle, id_ref);
778     }
779   }
780 #if USE_ITT_BUILD
781   if (status) {
782     __kmp_itt_single_start(gtid);
783   }
784 #endif /* USE_ITT_BUILD */
785   return status;
786 }
787 
788 void __kmp_exit_single(int gtid) {
789 #if USE_ITT_BUILD
790   __kmp_itt_single_end(gtid);
791 #endif /* USE_ITT_BUILD */
792   if (__kmp_env_consistency_check)
793     __kmp_pop_workshare(gtid, ct_psingle, NULL);
794 }
795 
/* Determine if we can go parallel or must use a serialized parallel region and
 * how many threads we can use.
 * set_nthreads is the number of threads requested for the team.
 * Returns 1 if we should serialize or only use one thread,
 * otherwise the number of threads to use.
 * The forkjoin lock is held by the caller. */
802 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
803                                  int master_tid, int set_nthreads
804 #if OMP_40_ENABLED
805                                  ,
806                                  int enter_teams
807 #endif /* OMP_40_ENABLED */
808                                  ) {
809   int capacity;
810   int new_nthreads;
811   KMP_DEBUG_ASSERT(__kmp_init_serial);
812   KMP_DEBUG_ASSERT(root && parent_team);
813 
814   // If dyn-var is set, dynamically adjust the number of desired threads,
815   // according to the method specified by dynamic_mode.
816   new_nthreads = set_nthreads;
817   if (!get__dynamic_2(parent_team, master_tid)) {
818     ;
819   }
820 #ifdef USE_LOAD_BALANCE
821   else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
822     new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
823     if (new_nthreads == 1) {
824       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
825                     "reservation to 1 thread\n",
826                     master_tid));
827       return 1;
828     }
829     if (new_nthreads < set_nthreads) {
830       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
831                     "reservation to %d threads\n",
832                     master_tid, new_nthreads));
833     }
834   }
835 #endif /* USE_LOAD_BALANCE */
836   else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
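    // Cap the team at the number of procs not already in use: subtract all
    // currently live threads, then add back the ones this root will reuse
    // (just the master if the root is active, otherwise its whole hot team).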
837     new_nthreads = __kmp_avail_proc - __kmp_nth +
838                    (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
839     if (new_nthreads <= 1) {
840       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
841                     "reservation to 1 thread\n",
842                     master_tid));
843       return 1;
844     }
845     if (new_nthreads < set_nthreads) {
846       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
847                     "reservation to %d threads\n",
848                     master_tid, new_nthreads));
849     } else {
850       new_nthreads = set_nthreads;
851     }
852   } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
853     if (set_nthreads > 2) {
854       new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
855       new_nthreads = (new_nthreads % set_nthreads) + 1;
856       if (new_nthreads == 1) {
857         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
858                       "reservation to 1 thread\n",
859                       master_tid));
860         return 1;
861       }
862       if (new_nthreads < set_nthreads) {
863         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
864                       "reservation to %d threads\n",
865                       master_tid, new_nthreads));
866       }
867     }
868   } else {
869     KMP_ASSERT(0);
870   }
871 
872   // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
873   if (__kmp_nth + new_nthreads -
874           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
875       __kmp_max_nth) {
876     int tl_nthreads = __kmp_max_nth - __kmp_nth +
877                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
878     if (tl_nthreads <= 0) {
879       tl_nthreads = 1;
880     }
881 
882     // If dyn-var is false, emit a 1-time warning.
883     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
884       __kmp_reserve_warn = 1;
885       __kmp_msg(kmp_ms_warning,
886                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
887                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
888     }
889     if (tl_nthreads == 1) {
890       KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
891                     "reduced reservation to 1 thread\n",
892                     master_tid));
893       return 1;
894     }
895     KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
896                   "reservation to %d threads\n",
897                   master_tid, tl_nthreads));
898     new_nthreads = tl_nthreads;
899   }
900 
901   // Respect OMP_THREAD_LIMIT
902   if (root->r.r_cg_nthreads + new_nthreads -
903           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
904       __kmp_cg_max_nth) {
905     int tl_nthreads = __kmp_cg_max_nth - root->r.r_cg_nthreads +
906                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
907     if (tl_nthreads <= 0) {
908       tl_nthreads = 1;
909     }
910 
911     // If dyn-var is false, emit a 1-time warning.
912     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
913       __kmp_reserve_warn = 1;
914       __kmp_msg(kmp_ms_warning,
915                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
916                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
917     }
918     if (tl_nthreads == 1) {
919       KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
920                     "reduced reservation to 1 thread\n",
921                     master_tid));
922       return 1;
923     }
924     KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
925                   "reservation to %d threads\n",
926                   master_tid, tl_nthreads));
927     new_nthreads = tl_nthreads;
928   }
929 
930   // Check if the threads array is large enough, or needs expanding.
931   // See comment in __kmp_register_root() about the adjustment if
932   // __kmp_threads[0] == NULL.
933   capacity = __kmp_threads_capacity;
934   if (TCR_PTR(__kmp_threads[0]) == NULL) {
935     --capacity;
936   }
937   if (__kmp_nth + new_nthreads -
938           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
939       capacity) {
940     // Expand the threads array.
941     int slotsRequired = __kmp_nth + new_nthreads -
942                         (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
943                         capacity;
944     int slotsAdded = __kmp_expand_threads(slotsRequired, slotsRequired);
945     if (slotsAdded < slotsRequired) {
946       // The threads array was not expanded enough.
947       new_nthreads -= (slotsRequired - slotsAdded);
948       KMP_ASSERT(new_nthreads >= 1);
949 
950       // If dyn-var is false, emit a 1-time warning.
951       if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
952         __kmp_reserve_warn = 1;
953         if (__kmp_tp_cached) {
954           __kmp_msg(kmp_ms_warning,
955                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
956                     KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
957                     KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
958         } else {
959           __kmp_msg(kmp_ms_warning,
960                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
961                     KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
962         }
963       }
964     }
965   }
966 
967 #ifdef KMP_DEBUG
968   if (new_nthreads == 1) {
969     KC_TRACE(10,
970              ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
971               "dead roots and rechecking; requested %d threads\n",
972               __kmp_get_gtid(), set_nthreads));
973   } else {
974     KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
975                   " %d threads\n",
976                   __kmp_get_gtid(), new_nthreads, set_nthreads));
977   }
978 #endif // KMP_DEBUG
979   return new_nthreads;
980 }
981 
/* Allocate threads from the thread pool and assign them to the new team. We are
   assured that there are enough threads available, because we checked that
   earlier while holding the forkjoin lock. */
985 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
986                                     kmp_info_t *master_th, int master_gtid) {
987   int i;
988   int use_hot_team;
989 
990   KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
991   KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
992   KMP_MB();
993 
994   /* first, let's setup the master thread */
995   master_th->th.th_info.ds.ds_tid = 0;
996   master_th->th.th_team = team;
997   master_th->th.th_team_nproc = team->t.t_nproc;
998   master_th->th.th_team_master = master_th;
999   master_th->th.th_team_serialized = FALSE;
1000   master_th->th.th_dispatch = &team->t.t_dispatch[0];
1001 
/* determine whether we can reuse the optimized hot team
   instead of forking new worker threads */
1003 #if KMP_NESTED_HOT_TEAMS
1004   use_hot_team = 0;
1005   kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
1006   if (hot_teams) { // hot teams array is not allocated if
1007     // KMP_HOT_TEAMS_MAX_LEVEL=0
1008     int level = team->t.t_active_level - 1; // index in array of hot teams
1009     if (master_th->th.th_teams_microtask) { // are we inside the teams?
1010       if (master_th->th.th_teams_size.nteams > 1) {
1011         ++level; // level was not increased in teams construct for
1012         // team_of_masters
1013       }
1014       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1015           master_th->th.th_teams_level == team->t.t_level) {
1016         ++level; // level was not increased in teams construct for
1017         // team_of_workers before the parallel
1018       } // team->t.t_level will be increased inside parallel
1019     }
1020     if (level < __kmp_hot_teams_max_level) {
1021       if (hot_teams[level].hot_team) {
1022         // hot team has already been allocated for given level
1023         KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
1024         use_hot_team = 1; // the team is ready to use
1025       } else {
1026         use_hot_team = 0; // AC: threads are not allocated yet
1027         hot_teams[level].hot_team = team; // remember new hot team
1028         hot_teams[level].hot_team_nth = team->t.t_nproc;
1029       }
1030     } else {
1031       use_hot_team = 0;
1032     }
1033   }
1034 #else
1035   use_hot_team = team == root->r.r_hot_team;
1036 #endif
1037   if (!use_hot_team) {
1038 
1039     /* install the master thread */
1040     team->t.t_threads[0] = master_th;
1041     __kmp_initialize_info(master_th, team, 0, master_gtid);
1042 
1043     /* now, install the worker threads */
1044     for (i = 1; i < team->t.t_nproc; i++) {
1045 
1046       /* fork or reallocate a new thread and install it in team */
1047       kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1048       team->t.t_threads[i] = thr;
1049       KMP_DEBUG_ASSERT(thr);
1050       KMP_DEBUG_ASSERT(thr->th.th_team == team);
1051       /* align team and thread arrived states */
1052       KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1053                     "T#%d(%d:%d) join =%llu, plain=%llu\n",
1054                     __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1055                     __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1056                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1057                     team->t.t_bar[bs_plain_barrier].b_arrived));
1058 #if OMP_40_ENABLED
1059       thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1060       thr->th.th_teams_level = master_th->th.th_teams_level;
1061       thr->th.th_teams_size = master_th->th.th_teams_size;
1062 #endif
1063       { // Initialize threads' barrier data.
1064         int b;
1065         kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1066         for (b = 0; b < bs_last_barrier; ++b) {
1067           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1068           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1069 #if USE_DEBUGGER
1070           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1071 #endif
1072         }
1073       }
1074     }
1075 
1076 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1077     __kmp_partition_places(team);
1078 #endif
1079   }
1080 
1081   KMP_MB();
1082 }
1083 
1084 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1085 // Propagate any changes to the floating point control registers out to the team
1086 // We try to avoid unnecessary writes to the relevant cache line in the team
1087 // structure, so we don't make changes unless they are needed.
1088 inline static void propagateFPControl(kmp_team_t *team) {
1089   if (__kmp_inherit_fp_control) {
1090     kmp_int16 x87_fpu_control_word;
1091     kmp_uint32 mxcsr;
1092 
1093     // Get master values of FPU control flags (both X87 and vector)
1094     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1095     __kmp_store_mxcsr(&mxcsr);
1096     mxcsr &= KMP_X86_MXCSR_MASK;
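    // KMP_X86_MXCSR_MASK is assumed to drop the MXCSR status flags so that
    // only the control bits are compared against (and stored in) the team.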
1097 
1098     // There is no point looking at t_fp_control_saved here.
1099     // If it is TRUE, we still have to update the values if they are different
1100     // from those we now have. If it is FALSE we didn't save anything yet, but
1101     // our objective is the same. We have to ensure that the values in the team
1102     // are the same as those we have.
1103     // So, this code achieves what we need whether or not t_fp_control_saved is
1104     // true. By checking whether the value needs updating we avoid unnecessary
1105     // writes that would put the cache-line into a written state, causing all
1106     // threads in the team to have to read it again.
1107     KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1108     KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1109     // Although we don't use this value, other code in the runtime wants to know
1110     // whether it should restore them. So we must ensure it is correct.
1111     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1112   } else {
1113     // Similarly here. Don't write to this cache-line in the team structure
1114     // unless we have to.
1115     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1116   }
1117 }
1118 
1119 // Do the opposite, setting the hardware registers to the updated values from
1120 // the team.
1121 inline static void updateHWFPControl(kmp_team_t *team) {
1122   if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
    // Only reset the fp control regs if they have been changed in
    // the parallel region that we are exiting.
1125     kmp_int16 x87_fpu_control_word;
1126     kmp_uint32 mxcsr;
1127     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1128     __kmp_store_mxcsr(&mxcsr);
1129     mxcsr &= KMP_X86_MXCSR_MASK;
1130 
1131     if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1132       __kmp_clear_x87_fpu_status_word();
1133       __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1134     }
1135 
1136     if (team->t.t_mxcsr != mxcsr) {
1137       __kmp_load_mxcsr(&team->t.t_mxcsr);
1138     }
1139   }
1140 }
1141 #else
1142 #define propagateFPControl(x) ((void)0)
1143 #define updateHWFPControl(x) ((void)0)
1144 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1145 
1146 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1147                                      int realloc); // forward declaration
1148 
/* Run a parallel region that has been serialized, so it runs only in a team of
   the single master thread. */
1151 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1152   kmp_info_t *this_thr;
1153   kmp_team_t *serial_team;
1154 
1155   KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1156 
1157   /* Skip all this code for autopar serialized loops since it results in
1158      unacceptable overhead */
1159   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1160     return;
1161 
1162   if (!TCR_4(__kmp_init_parallel))
1163     __kmp_parallel_initialize();
1164 
1165   this_thr = __kmp_threads[global_tid];
1166   serial_team = this_thr->th.th_serial_team;
1167 
1168   /* utilize the serialized team held by this thread */
1169   KMP_DEBUG_ASSERT(serial_team);
1170   KMP_MB();
1171 
1172   if (__kmp_tasking_mode != tskm_immediate_exec) {
1173     KMP_DEBUG_ASSERT(
1174         this_thr->th.th_task_team ==
1175         this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1176     KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1177                      NULL);
1178     KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1179                   "team %p, new task_team = NULL\n",
1180                   global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1181     this_thr->th.th_task_team = NULL;
1182   }
1183 
1184 #if OMP_40_ENABLED
1185   kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1186   if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1187     proc_bind = proc_bind_false;
1188   } else if (proc_bind == proc_bind_default) {
1189     // No proc_bind clause was specified, so use the current value
1190     // of proc-bind-var for this parallel region.
1191     proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1192   }
1193   // Reset for next parallel region
1194   this_thr->th.th_set_proc_bind = proc_bind_default;
1195 #endif /* OMP_40_ENABLED */
1196 
1197 #if OMPT_SUPPORT
1198   ompt_data_t ompt_parallel_data;
1199   ompt_parallel_data.ptr = NULL;
1200   ompt_data_t *implicit_task_data;
1201   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1202   if (ompt_enabled.enabled &&
1203       this_thr->th.ompt_thread_info.state != omp_state_overhead) {
1204 
1205     ompt_task_info_t *parent_task_info;
1206     parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1207 
1208     parent_task_info->frame.enter_frame = OMPT_GET_FRAME_ADDRESS(1);
1209     if (ompt_enabled.ompt_callback_parallel_begin) {
1210       int team_size = 1;
1211 
1212       ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1213           &(parent_task_info->task_data), &(parent_task_info->frame),
1214           &ompt_parallel_data, team_size, ompt_invoker_program, codeptr);
1215     }
1216   }
1217 #endif // OMPT_SUPPORT
1218 
1219   if (this_thr->th.th_team != serial_team) {
1220     // Nested level will be an index in the nested nthreads array
1221     int level = this_thr->th.th_team->t.t_level;
1222 
1223     if (serial_team->t.t_serialized) {
      /* this serial team was already used
         TODO: increase performance by making these locks more specific */
1226       kmp_team_t *new_team;
1227 
1228       __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1229 
1230       new_team = __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1231 #if OMPT_SUPPORT
1232                                      ompt_parallel_data,
1233 #endif
1234 #if OMP_40_ENABLED
1235                                      proc_bind,
1236 #endif
1237                                      &this_thr->th.th_current_task->td_icvs,
1238                                      0 USE_NESTED_HOT_ARG(NULL));
1239       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1240       KMP_ASSERT(new_team);
1241 
1242       /* setup new serialized team and install it */
1243       new_team->t.t_threads[0] = this_thr;
1244       new_team->t.t_parent = this_thr->th.th_team;
1245       serial_team = new_team;
1246       this_thr->th.th_serial_team = serial_team;
1247 
1248       KF_TRACE(
1249           10,
1250           ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1251            global_tid, serial_team));
1252 
      /* TODO: the above breaks the requirement that even if we run out of
         resources, we can still guarantee that serialized teams are OK, since
         we may need to allocate a new one */
1256     } else {
1257       KF_TRACE(
1258           10,
1259           ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1260            global_tid, serial_team));
1261     }
1262 
1263     /* we have to initialize this serial team */
1264     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1265     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1266     KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1267     serial_team->t.t_ident = loc;
1268     serial_team->t.t_serialized = 1;
1269     serial_team->t.t_nproc = 1;
1270     serial_team->t.t_parent = this_thr->th.th_team;
1271     serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1272     this_thr->th.th_team = serial_team;
1273     serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1274 
    KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1276                   this_thr->th.th_current_task));
1277     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1278     this_thr->th.th_current_task->td_flags.executing = 0;
1279 
1280     __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1281 
1282     /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1283        implicit task for each serialized task represented by
1284        team->t.t_serialized? */
1285     copy_icvs(&this_thr->th.th_current_task->td_icvs,
1286               &this_thr->th.th_current_task->td_parent->td_icvs);
1287 
1288     // Thread value exists in the nested nthreads array for the next nested
1289     // level
1290     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1291       this_thr->th.th_current_task->td_icvs.nproc =
1292           __kmp_nested_nth.nth[level + 1];
1293     }
1294 
1295 #if OMP_40_ENABLED
1296     if (__kmp_nested_proc_bind.used &&
1297         (level + 1 < __kmp_nested_proc_bind.used)) {
1298       this_thr->th.th_current_task->td_icvs.proc_bind =
1299           __kmp_nested_proc_bind.bind_types[level + 1];
1300     }
1301 #endif /* OMP_40_ENABLED */
1302 
1303 #if USE_DEBUGGER
1304     serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1305 #endif
1306     this_thr->th.th_info.ds.ds_tid = 0;
1307 
1308     /* set thread cache values */
1309     this_thr->th.th_team_nproc = 1;
1310     this_thr->th.th_team_master = this_thr;
1311     this_thr->th.th_team_serialized = 1;
1312 
1313     serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1314     serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1315 
1316     propagateFPControl(serial_team);
1317 
1318     /* check if we need to allocate dispatch buffers stack */
1319     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1320     if (!serial_team->t.t_dispatch->th_disp_buffer) {
1321       serial_team->t.t_dispatch->th_disp_buffer =
1322           (dispatch_private_info_t *)__kmp_allocate(
1323               sizeof(dispatch_private_info_t));
1324     }
1325     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1326 
1327     KMP_MB();
1328 
1329   } else {
1330     /* this serialized team is already being used,
1331      * that's fine, just add another nested level */
1332     KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1333     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1334     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1335     ++serial_team->t.t_serialized;
1336     this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1337 
1338     // Nested level will be an index in the nested nthreads array
1339     int level = this_thr->th.th_team->t.t_level;
1340     // Thread value exists in the nested nthreads array for the next nested
1341     // level
1342     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1343       this_thr->th.th_current_task->td_icvs.nproc =
1344           __kmp_nested_nth.nth[level + 1];
1345     }
1346     serial_team->t.t_level++;
1347     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1348                   "of serial team %p to %d\n",
1349                   global_tid, serial_team, serial_team->t.t_level));
1350 
1351     /* allocate/push dispatch buffers stack */
1352     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1353     {
1354       dispatch_private_info_t *disp_buffer =
1355           (dispatch_private_info_t *)__kmp_allocate(
1356               sizeof(dispatch_private_info_t));
1357       disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1358       serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1359     }
1360     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1361 
1362     KMP_MB();
1363   }
1364 #if OMP_40_ENABLED
1365   KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1366 #endif
1367 
1368   if (__kmp_env_consistency_check)
1369     __kmp_push_parallel(global_tid, NULL);
1370 #if OMPT_SUPPORT
1371   serial_team->t.ompt_team_info.master_return_address = codeptr;
1372   if (ompt_enabled.enabled &&
1373       this_thr->th.ompt_thread_info.state != omp_state_overhead) {
1374     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = OMPT_GET_FRAME_ADDRESS(1);
1375 
1376     ompt_lw_taskteam_t lw_taskteam;
1377     __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1378                             &ompt_parallel_data, codeptr);
1379 
1380     __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
    // don't use lw_taskteam after linking; its content was swapped
1382 
1383     /* OMPT implicit task begin */
1384     implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
1385     if (ompt_enabled.ompt_callback_implicit_task) {
1386       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1387           ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1388           OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid));
1389     }
1390 
1391     /* OMPT state */
1392     this_thr->th.ompt_thread_info.state = omp_state_work_parallel;
1393     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = OMPT_GET_FRAME_ADDRESS(1);
1394   }
1395 #endif
1396 }
1397 
1398 /* most of the work for a fork */
1399 /* return true if we really went parallel, false if serialized */
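/* Overall flow: determine the number of threads for the region (num_threads
   clause, ICVs, nesting limits, dynamic adjustment); if only one thread will
   be used, serialize the region and invoke the microtask directly on the
   master thread; otherwise reserve threads under the forkjoin lock, allocate
   (or reuse a hot) team, set up ICVs and arguments, release the workers and
   invoke the microtask for the master thread. */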
1400 int __kmp_fork_call(ident_t *loc, int gtid,
1401                     enum fork_context_e call_context, // Intel, GNU, ...
1402                     kmp_int32 argc, microtask_t microtask, launch_t invoker,
1403 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1404 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1405                     va_list *ap
1406 #else
1407                     va_list ap
1408 #endif
1409                     ) {
1410   void **argv;
1411   int i;
1412   int master_tid;
1413   int master_this_cons;
1414   kmp_team_t *team;
1415   kmp_team_t *parent_team;
1416   kmp_info_t *master_th;
1417   kmp_root_t *root;
1418   int nthreads;
1419   int master_active;
1420   int master_set_numthreads;
1421   int level;
1422 #if OMP_40_ENABLED
1423   int active_level;
1424   int teams_level;
1425 #endif
1426 #if KMP_NESTED_HOT_TEAMS
1427   kmp_hot_team_ptr_t **p_hot_teams;
1428 #endif
1429   { // KMP_TIME_BLOCK
1430     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1431     KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1432 
1433     KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1434     if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
      /* Some systems prefer the stack for the root thread(s) to start with
         some gap from the parent stack to prevent false sharing. */
1437       void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1438       /* These 2 lines below are so this does not get optimized out */
1439       if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1440         __kmp_stkpadding += (short)((kmp_int64)dummy);
1441     }
1442 
1443     /* initialize if needed */
1444     KMP_DEBUG_ASSERT(
1445         __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1446     if (!TCR_4(__kmp_init_parallel))
1447       __kmp_parallel_initialize();
1448 
1449     /* setup current data */
1450     master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1451     // shutdown
1452     parent_team = master_th->th.th_team;
1453     master_tid = master_th->th.th_info.ds.ds_tid;
1454     master_this_cons = master_th->th.th_local.this_construct;
1455     root = master_th->th.th_root;
1456     master_active = root->r.r_active;
1457     master_set_numthreads = master_th->th.th_set_nproc;
1458 
1459 #if OMPT_SUPPORT
1460     ompt_data_t ompt_parallel_data;
1461     ompt_parallel_data.ptr = NULL;
1462     ompt_data_t *parent_task_data;
1463     ompt_frame_t *ompt_frame;
1464     ompt_data_t *implicit_task_data;
1465     void *return_address = NULL;
1466 
1467     if (ompt_enabled.enabled) {
1468       __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1469                                     NULL, NULL);
1470       return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1471     }
1472 #endif
1473 
1474     // Nested level will be an index in the nested nthreads array
1475     level = parent_team->t.t_level;
1476     // used to launch non-serial teams even if nested is not allowed
1477     active_level = parent_team->t.t_active_level;
1478 #if OMP_40_ENABLED
    // needed to check nesting inside teams constructs
1480     teams_level = master_th->th.th_teams_level;
1481 #endif
1482 #if KMP_NESTED_HOT_TEAMS
1483     p_hot_teams = &master_th->th.th_hot_teams;
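    // Lazily allocate the per-thread array of hot team descriptors, one entry
    // per nesting level up to __kmp_hot_teams_max_level.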
1484     if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1485       *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1486           sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1487       (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
      // it is either the actual hot team or not needed (when active_level > 0)
1489       (*p_hot_teams)[0].hot_team_nth = 1;
1490     }
1491 #endif
1492 
1493 #if OMPT_SUPPORT
1494     if (ompt_enabled.enabled) {
1495       if (ompt_enabled.ompt_callback_parallel_begin) {
1496         int team_size = master_set_numthreads
1497                             ? master_set_numthreads
1498                             : get__nproc_2(parent_team, master_tid);
1499         ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1500             parent_task_data, ompt_frame, &ompt_parallel_data, team_size,
1501             OMPT_INVOKER(call_context), return_address);
1502       }
1503       master_th->th.ompt_thread_info.state = omp_state_overhead;
1504     }
1505 #endif
1506 
1507     master_th->th.th_ident = loc;
1508 
1509 #if OMP_40_ENABLED
1510     if (master_th->th.th_teams_microtask && ap &&
1511         microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
      // AC: This is the start of a parallel region that is nested inside a
      // teams construct. The team is actual (hot); all workers are ready at
      // the fork barrier. No lock is needed to initialize the team a bit,
      // then release the workers.
1515       parent_team->t.t_ident = loc;
1516       __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1517       parent_team->t.t_argc = argc;
1518       argv = (void **)parent_team->t.t_argv;
1519       for (i = argc - 1; i >= 0; --i)
1520 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1521 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1522         *argv++ = va_arg(*ap, void *);
1523 #else
1524         *argv++ = va_arg(ap, void *);
1525 #endif
      // Increment our nested depth level, but do not increase the
      // serialization count
1527       if (parent_team == master_th->th.th_serial_team) {
1528         // AC: we are in serialized parallel
1529         __kmpc_serialized_parallel(loc, gtid);
1530         KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
        // AC: needed so that inquiry functions work
        // correctly; will restore at join time
1533         parent_team->t.t_serialized--;
1534 #if OMPT_SUPPORT
1535         void *dummy;
1536         void **exit_runtime_p;
1537 
1538         ompt_lw_taskteam_t lw_taskteam;
1539 
1540         if (ompt_enabled.enabled) {
1541           __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1542                                   &ompt_parallel_data, return_address);
1543           exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_frame);
1544 
1545           __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
          // don't use lw_taskteam after linking. content was swapped
1547 
1548           /* OMPT implicit task begin */
1549           implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1550           if (ompt_enabled.ompt_callback_implicit_task) {
1551             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1552                 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1553                 implicit_task_data, 1, __kmp_tid_from_gtid(gtid));
1554           }
1555 
1556           /* OMPT state */
1557           master_th->th.ompt_thread_info.state = omp_state_work_parallel;
1558         } else {
1559           exit_runtime_p = &dummy;
1560         }
1561 #endif
1562 
1563         {
1564           KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1565           KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1566           __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1567 #if OMPT_SUPPORT
1568                                  ,
1569                                  exit_runtime_p
1570 #endif
1571                                  );
1572         }
1573 
1574 #if OMPT_SUPPORT
1575         *exit_runtime_p = NULL;
1576         if (ompt_enabled.enabled) {
1577           OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = NULL;
1578           if (ompt_enabled.ompt_callback_implicit_task) {
1579             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1580                 ompt_scope_end, NULL, implicit_task_data, 1,
1581                 __kmp_tid_from_gtid(gtid));
1582           }
1583           __ompt_lw_taskteam_unlink(master_th);
1584 
1585           if (ompt_enabled.ompt_callback_parallel_end) {
1586             ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1587                 OMPT_CUR_TEAM_DATA(master_th), OMPT_CUR_TASK_DATA(master_th),
1588                 OMPT_INVOKER(call_context), return_address);
1589           }
1590           master_th->th.ompt_thread_info.state = omp_state_overhead;
1591         }
1592 #endif
1593         return TRUE;
1594       }
1595 
1596       parent_team->t.t_pkfn = microtask;
1597       parent_team->t.t_invoke = invoker;
1598       KMP_TEST_THEN_INC32((kmp_int32 *)&root->r.r_in_parallel);
1599       parent_team->t.t_active_level++;
1600       parent_team->t.t_level++;
1601 
1602       /* Change number of threads in the team if requested */
1603       if (master_set_numthreads) { // The parallel has num_threads clause
1604         if (master_set_numthreads < master_th->th.th_teams_size.nth) {
          // AC: can only reduce the number of threads dynamically, can't
          // increase it
1606           kmp_info_t **other_threads = parent_team->t.t_threads;
1607           parent_team->t.t_nproc = master_set_numthreads;
1608           for (i = 0; i < master_set_numthreads; ++i) {
1609             other_threads[i]->th.th_team_nproc = master_set_numthreads;
1610           }
          // Keep the extra threads hot in the team for possible subsequent
          // parallel regions
1612         }
1613         master_th->th.th_set_nproc = 0;
1614       }
1615 
1616 #if USE_DEBUGGER
1617       if (__kmp_debugging) { // Let debugger override number of threads.
1618         int nth = __kmp_omp_num_threads(loc);
1619         if (nth > 0) { // 0 means debugger doesn't want to change num threads
1620           master_set_numthreads = nth;
1621         }
1622       }
1623 #endif
1624 
1625       KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1626                     "master_th=%p, gtid=%d\n",
1627                     root, parent_team, master_th, gtid));
1628       __kmp_internal_fork(loc, gtid, parent_team);
1629       KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1630                     "master_th=%p, gtid=%d\n",
1631                     root, parent_team, master_th, gtid));
1632 
1633       /* Invoke microtask for MASTER thread */
1634       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1635                     parent_team->t.t_id, parent_team->t.t_pkfn));
1636 
1637       {
1638         KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1639         KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1640         if (!parent_team->t.t_invoke(gtid)) {
1641           KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
1642         }
1643       }
1644       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1645                     parent_team->t.t_id, parent_team->t.t_pkfn));
1646       KMP_MB(); /* Flush all pending memory write invalidates.  */
1647 
1648       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1649 
1650       return TRUE;
1651     } // Parallel closely nested in teams construct
1652 #endif /* OMP_40_ENABLED */
1653 
1654 #if KMP_DEBUG
1655     if (__kmp_tasking_mode != tskm_immediate_exec) {
1656       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1657                        parent_team->t.t_task_team[master_th->th.th_task_state]);
1658     }
1659 #endif
1660 
1661     if (parent_team->t.t_active_level >=
1662         master_th->th.th_current_task->td_icvs.max_active_levels) {
1663       nthreads = 1;
1664     } else {
1665 #if OMP_40_ENABLED
1666       int enter_teams = ((ap == NULL && active_level == 0) ||
1667                          (ap && teams_level > 0 && teams_level == level));
1668 #endif
1669       nthreads =
1670           master_set_numthreads
1671               ? master_set_numthreads
1672               : get__nproc_2(
1673                     parent_team,
1674                     master_tid); // TODO: get nproc directly from current task
1675 
      // Check if we need to take the forkjoin lock (not needed for a
      // serialized parallel outside of a teams construct). This code was moved
      // here from __kmp_reserve_threads() to speed up nested serialized
      // parallels.
1679       if (nthreads > 1) {
1680         if ((!get__nested(master_th) && (root->r.r_in_parallel
1681 #if OMP_40_ENABLED
1682                                          && !enter_teams
1683 #endif /* OMP_40_ENABLED */
1684                                          )) ||
1685             (__kmp_library == library_serial)) {
1686           KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1687                         " threads\n",
1688                         gtid, nthreads));
1689           nthreads = 1;
1690         }
1691       }
1692       if (nthreads > 1) {
1693         /* determine how many new threads we can use */
1694         __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1695         nthreads = __kmp_reserve_threads(
1696             root, parent_team, master_tid, nthreads
1697 #if OMP_40_ENABLED
            /* AC: If we execute teams from a parallel region (on the host),
               then the teams should be created, but each can have only 1
               thread if nesting is disabled. If teams is called from a serial
               region, then the teams and their threads should be created
               regardless of the nesting setting. */
1703             ,
1704             enter_teams
1705 #endif /* OMP_40_ENABLED */
1706             );
1707         if (nthreads == 1) {
          // Free the lock for single-thread execution here; for multi-thread
          // execution it will be freed later, after the team of threads has
          // been created and initialized
1711           __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1712         }
1713       }
1714     }
1715     KMP_DEBUG_ASSERT(nthreads > 0);
1716 
1717     // If we temporarily changed the set number of threads then restore it now
1718     master_th->th.th_set_nproc = 0;
1719 
1720     /* create a serialized parallel region? */
1721     if (nthreads == 1) {
1722 /* josh todo: hypothetical question: what do we do for OS X*? */
1723 #if KMP_OS_LINUX &&                                                            \
1724     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1725       void *args[argc];
1726 #else
1727       void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1728 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1729           KMP_ARCH_AARCH64) */
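      // 'args' is scratch storage for the outlined-function arguments, used
      // when the serialized microtask is invoked directly on the master
      // thread below.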
1730 
1731       KA_TRACE(20,
1732                ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1733 
1734       __kmpc_serialized_parallel(loc, gtid);
1735 
1736       if (call_context == fork_context_intel) {
1737         /* TODO this sucks, use the compiler itself to pass args! :) */
1738         master_th->th.th_serial_team->t.t_ident = loc;
1739 #if OMP_40_ENABLED
1740         if (!ap) {
1741           // revert change made in __kmpc_serialized_parallel()
1742           master_th->th.th_serial_team->t.t_level--;
1743 // Get args from parent team for teams construct
1744 
1745 #if OMPT_SUPPORT
1746           void *dummy;
1747           void **exit_runtime_p;
1748           ompt_task_info_t *task_info;
1749 
1750           ompt_lw_taskteam_t lw_taskteam;
1751 
1752           if (ompt_enabled.enabled) {
1753             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1754                                     &ompt_parallel_data, return_address);
1755 
1756             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
            // don't use lw_taskteam after linking. content was swapped
1758 
1759             task_info = OMPT_CUR_TASK_INFO(master_th);
1760             exit_runtime_p = &(task_info->frame.exit_frame);
1761             if (ompt_enabled.ompt_callback_implicit_task) {
1762               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1763                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1764                   &(task_info->task_data), 1, __kmp_tid_from_gtid(gtid));
1765             }
1766 
1767             /* OMPT state */
1768             master_th->th.ompt_thread_info.state = omp_state_work_parallel;
1769           } else {
1770             exit_runtime_p = &dummy;
1771           }
1772 #endif
1773 
1774           {
1775             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1776             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1777             __kmp_invoke_microtask(microtask, gtid, 0, argc,
1778                                    parent_team->t.t_argv
1779 #if OMPT_SUPPORT
1780                                    ,
1781                                    exit_runtime_p
1782 #endif
1783                                    );
1784           }
1785 
1786 #if OMPT_SUPPORT
1787           if (ompt_enabled.enabled) {
            *exit_runtime_p = NULL;
1789             if (ompt_enabled.ompt_callback_implicit_task) {
1790               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1791                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1792                   __kmp_tid_from_gtid(gtid));
1793             }
1794 
1795             __ompt_lw_taskteam_unlink(master_th);
1796             if (ompt_enabled.ompt_callback_parallel_end) {
1797               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1798                   OMPT_CUR_TEAM_DATA(master_th), parent_task_data,
1799                   OMPT_INVOKER(call_context), return_address);
1800             }
1801             master_th->th.ompt_thread_info.state = omp_state_overhead;
1802           }
1803 #endif
1804         } else if (microtask == (microtask_t)__kmp_teams_master) {
1805           KMP_DEBUG_ASSERT(master_th->th.th_team ==
1806                            master_th->th.th_serial_team);
1807           team = master_th->th.th_team;
1808           // team->t.t_pkfn = microtask;
1809           team->t.t_invoke = invoker;
1810           __kmp_alloc_argv_entries(argc, team, TRUE);
1811           team->t.t_argc = argc;
1812           argv = (void **)team->t.t_argv;
1813           if (ap) {
1814             for (i = argc - 1; i >= 0; --i)
1815 // TODO: revert workaround for Intel(R) 64 tracker #96
1816 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1817               *argv++ = va_arg(*ap, void *);
1818 #else
1819               *argv++ = va_arg(ap, void *);
1820 #endif
1821           } else {
1822             for (i = 0; i < argc; ++i)
1823               // Get args from parent team for teams construct
1824               argv[i] = parent_team->t.t_argv[i];
1825           }
1826           // AC: revert change made in __kmpc_serialized_parallel()
1827           //     because initial code in teams should have level=0
1828           team->t.t_level--;
1829           // AC: call special invoker for outer "parallel" of teams construct
1830           {
1831             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1832             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1833             invoker(gtid);
1834           }
1835         } else {
1836 #endif /* OMP_40_ENABLED */
1837           argv = args;
1838           for (i = argc - 1; i >= 0; --i)
1839 // TODO: revert workaround for Intel(R) 64 tracker #96
1840 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1841             *argv++ = va_arg(*ap, void *);
1842 #else
1843           *argv++ = va_arg(ap, void *);
1844 #endif
1845           KMP_MB();
1846 
1847 #if OMPT_SUPPORT
1848           void *dummy;
1849           void **exit_runtime_p;
1850           ompt_task_info_t *task_info;
1851 
1852           ompt_lw_taskteam_t lw_taskteam;
1853 
1854           if (ompt_enabled.enabled) {
1855             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1856                                     &ompt_parallel_data, return_address);
1857             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
            // don't use lw_taskteam after linking. content was swapped
1859             task_info = OMPT_CUR_TASK_INFO(master_th);
1860             exit_runtime_p = &(task_info->frame.exit_frame);
1861 
1862             /* OMPT implicit task begin */
1863             implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1864             if (ompt_enabled.ompt_callback_implicit_task) {
1865               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1866                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1867                   implicit_task_data, 1, __kmp_tid_from_gtid(gtid));
1868             }
1869 
1870             /* OMPT state */
1871             master_th->th.ompt_thread_info.state = omp_state_work_parallel;
1872           } else {
1873             exit_runtime_p = &dummy;
1874           }
1875 #endif
1876 
1877           {
1878             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1879             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1880             __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1881 #if OMPT_SUPPORT
1882                                    ,
1883                                    exit_runtime_p
1884 #endif
1885                                    );
1886           }
1887 
1888 #if OMPT_SUPPORT
1889           if (ompt_enabled.enabled) {
1890             *exit_runtime_p = NULL;
1891             if (ompt_enabled.ompt_callback_implicit_task) {
1892               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1893                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1894                   __kmp_tid_from_gtid(gtid));
1895             }
1896 
1897             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1898             __ompt_lw_taskteam_unlink(master_th);
1899             if (ompt_enabled.ompt_callback_parallel_end) {
1900               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1901                   &ompt_parallel_data, parent_task_data,
1902                   OMPT_INVOKER(call_context), return_address);
1903             }
1904             master_th->th.ompt_thread_info.state = omp_state_overhead;
1905           }
1906 #endif
1907 #if OMP_40_ENABLED
1908         }
1909 #endif /* OMP_40_ENABLED */
1910       } else if (call_context == fork_context_gnu) {
1911 #if OMPT_SUPPORT
1912         ompt_lw_taskteam_t lwt;
1913         __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1914                                 return_address);
1915 
1916         lwt.ompt_task_info.frame.exit_frame = NULL;
1917         __ompt_lw_taskteam_link(&lwt, master_th, 1);
// don't use lw_taskteam after linking. content was swapped
1919 #endif
1920 
1921         // we were called from GNU native code
1922         KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1923         return FALSE;
1924       } else {
1925         KMP_ASSERT2(call_context < fork_context_last,
1926                     "__kmp_fork_call: unknown fork_context parameter");
1927       }
1928 
1929       KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1930       KMP_MB();
1931       return FALSE;
1932     }
1933 
    // GEH: only modify the executing flag in the case when not serialized;
    //      the serialized case is handled in __kmpc_serialized_parallel
1936     KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1937                   "curtask=%p, curtask_max_aclevel=%d\n",
1938                   parent_team->t.t_active_level, master_th,
1939                   master_th->th.th_current_task,
1940                   master_th->th.th_current_task->td_icvs.max_active_levels));
1941     // TODO: GEH - cannot do this assertion because root thread not set up as
1942     // executing
1943     // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1944     master_th->th.th_current_task->td_flags.executing = 0;
1945 
1946 #if OMP_40_ENABLED
1947     if (!master_th->th.th_teams_microtask || level > teams_level)
1948 #endif /* OMP_40_ENABLED */
1949     {
1950       /* Increment our nested depth level */
1951       KMP_TEST_THEN_INC32((kmp_int32 *)&root->r.r_in_parallel);
1952     }
1953 
1954     // See if we need to make a copy of the ICVs.
1955     int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1956     if ((level + 1 < __kmp_nested_nth.used) &&
1957         (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1958       nthreads_icv = __kmp_nested_nth.nth[level + 1];
1959     } else {
1960       nthreads_icv = 0; // don't update
1961     }
1962 
1963 #if OMP_40_ENABLED
    // Figure out the proc_bind policy for the new team.
1965     kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1966     kmp_proc_bind_t proc_bind_icv =
1967         proc_bind_default; // proc_bind_default means don't update
1968     if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1969       proc_bind = proc_bind_false;
1970     } else {
1971       if (proc_bind == proc_bind_default) {
1972         // No proc_bind clause specified; use current proc-bind-var for this
1973         // parallel region
1974         proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1975       }
      /* else: The proc_bind policy was specified explicitly on the parallel
         construct's proc_bind clause. This overrides proc-bind-var for this
         parallel region, but does not change proc-bind-var. */
      // Figure out the value of proc-bind-var for the child threads.
1980       if ((level + 1 < __kmp_nested_proc_bind.used) &&
1981           (__kmp_nested_proc_bind.bind_types[level + 1] !=
1982            master_th->th.th_current_task->td_icvs.proc_bind)) {
1983         proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1984       }
1985     }
1986 
1987     // Reset for next parallel region
1988     master_th->th.th_set_proc_bind = proc_bind_default;
1989 #endif /* OMP_40_ENABLED */
1990 
1991     if ((nthreads_icv > 0)
1992 #if OMP_40_ENABLED
1993         || (proc_bind_icv != proc_bind_default)
1994 #endif /* OMP_40_ENABLED */
1995             ) {
1996       kmp_internal_control_t new_icvs;
1997       copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
1998       new_icvs.next = NULL;
1999       if (nthreads_icv > 0) {
2000         new_icvs.nproc = nthreads_icv;
2001       }
2002 
2003 #if OMP_40_ENABLED
2004       if (proc_bind_icv != proc_bind_default) {
2005         new_icvs.proc_bind = proc_bind_icv;
2006       }
2007 #endif /* OMP_40_ENABLED */
2008 
2009       /* allocate a new parallel team */
2010       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2011       team = __kmp_allocate_team(root, nthreads, nthreads,
2012 #if OMPT_SUPPORT
2013                                  ompt_parallel_data,
2014 #endif
2015 #if OMP_40_ENABLED
2016                                  proc_bind,
2017 #endif
2018                                  &new_icvs, argc USE_NESTED_HOT_ARG(master_th));
2019     } else {
2020       /* allocate a new parallel team */
2021       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2022       team = __kmp_allocate_team(root, nthreads, nthreads,
2023 #if OMPT_SUPPORT
2024                                  ompt_parallel_data,
2025 #endif
2026 #if OMP_40_ENABLED
2027                                  proc_bind,
2028 #endif
2029                                  &master_th->th.th_current_task->td_icvs,
2030                                  argc USE_NESTED_HOT_ARG(master_th));
2031     }
2032     KF_TRACE(
2033         10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2034 
2035     /* setup the new team */
2036     KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2037     KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2038     KMP_CHECK_UPDATE(team->t.t_ident, loc);
2039     KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2040     KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2041 #if OMPT_SUPPORT
2042     KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2043                           return_address);
2044 #endif
2045     KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2046 // TODO: parent_team->t.t_level == INT_MAX ???
2047 #if OMP_40_ENABLED
2048     if (!master_th->th.th_teams_microtask || level > teams_level) {
2049 #endif /* OMP_40_ENABLED */
2050       int new_level = parent_team->t.t_level + 1;
2051       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2052       new_level = parent_team->t.t_active_level + 1;
2053       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2054 #if OMP_40_ENABLED
2055     } else {
2056       // AC: Do not increase parallel level at start of the teams construct
2057       int new_level = parent_team->t.t_level;
2058       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2059       new_level = parent_team->t.t_active_level;
2060       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2061     }
2062 #endif /* OMP_40_ENABLED */
2063     kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2064     // set master's schedule as new run-time schedule
2065     KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2066 
2067 #if OMP_40_ENABLED
2068     KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2069 #endif
2070 
2071     // Update the floating point rounding in the team if required.
2072     propagateFPControl(team);
2073 
2074     if (__kmp_tasking_mode != tskm_immediate_exec) {
      // Set the master's task team to the team's task team. Unless this is a
      // hot team, it should be NULL.
2077       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2078                        parent_team->t.t_task_team[master_th->th.th_task_state]);
2079       KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
2080                     "%p, new task_team %p / team %p\n",
2081                     __kmp_gtid_from_thread(master_th),
2082                     master_th->th.th_task_team, parent_team,
2083                     team->t.t_task_team[master_th->th.th_task_state], team));
2084 
2085       if (active_level || master_th->th.th_task_team) {
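        // Entering a nested parallel region (or the master already has a task
        // team): save the master's current task_state on its memo stack so it
        // can be restored at the matching join.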
2086         // Take a memo of master's task_state
2087         KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2088         if (master_th->th.th_task_state_top >=
2089             master_th->th.th_task_state_stack_sz) { // increase size
2090           kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2091           kmp_uint8 *old_stack, *new_stack;
2092           kmp_uint32 i;
2093           new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2094           for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2095             new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2096           }
2097           for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2098                ++i) { // zero-init rest of stack
2099             new_stack[i] = 0;
2100           }
2101           old_stack = master_th->th.th_task_state_memo_stack;
2102           master_th->th.th_task_state_memo_stack = new_stack;
2103           master_th->th.th_task_state_stack_sz = new_size;
2104           __kmp_free(old_stack);
2105         }
2106         // Store master's task_state on stack
2107         master_th->th
2108             .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2109             master_th->th.th_task_state;
2110         master_th->th.th_task_state_top++;
2111 #if KMP_NESTED_HOT_TEAMS
2112         if (team == master_th->th.th_hot_teams[active_level].hot_team) {
2113           // Restore master's nested state if nested hot team
2114           master_th->th.th_task_state =
2115               master_th->th
2116                   .th_task_state_memo_stack[master_th->th.th_task_state_top];
2117         } else {
2118 #endif
2119           master_th->th.th_task_state = 0;
2120 #if KMP_NESTED_HOT_TEAMS
2121         }
2122 #endif
2123       }
2124 #if !KMP_NESTED_HOT_TEAMS
2125       KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2126                        (team == root->r.r_hot_team));
2127 #endif
2128     }
2129 
2130     KA_TRACE(
2131         20,
2132         ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2133          gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2134          team->t.t_nproc));
2135     KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2136                      (team->t.t_master_tid == 0 &&
2137                       (team->t.t_parent == root->r.r_root_team ||
2138                        team->t.t_parent->t.t_serialized)));
2139     KMP_MB();
2140 
2141     /* now, setup the arguments */
2142     argv = (void **)team->t.t_argv;
2143 #if OMP_40_ENABLED
2144     if (ap) {
2145 #endif /* OMP_40_ENABLED */
2146       for (i = argc - 1; i >= 0; --i) {
2147 // TODO: revert workaround for Intel(R) 64 tracker #96
2148 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
2149         void *new_argv = va_arg(*ap, void *);
2150 #else
2151       void *new_argv = va_arg(ap, void *);
2152 #endif
2153         KMP_CHECK_UPDATE(*argv, new_argv);
2154         argv++;
2155       }
2156 #if OMP_40_ENABLED
2157     } else {
2158       for (i = 0; i < argc; ++i) {
2159         // Get args from parent team for teams construct
2160         KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2161       }
2162     }
2163 #endif /* OMP_40_ENABLED */
2164 
2165     /* now actually fork the threads */
2166     KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2167     if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2168       root->r.r_active = TRUE;
2169 
2170     __kmp_fork_team_threads(root, team, master_th, gtid);
2171     __kmp_setup_icv_copy(team, nthreads,
2172                          &master_th->th.th_current_task->td_icvs, loc);
2173 
2174 #if OMPT_SUPPORT
2175     master_th->th.ompt_thread_info.state = omp_state_work_parallel;
2176 #endif
2177 
2178     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
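    // (The forkjoin lock was acquired before __kmp_reserve_threads(); it can
    // be released now that the team is fully set up.)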
2179 
2180 #if USE_ITT_BUILD
2181     if (team->t.t_active_level == 1 // only report frames at level 1
2182 #if OMP_40_ENABLED
2183         && !master_th->th.th_teams_microtask // not in teams construct
2184 #endif /* OMP_40_ENABLED */
2185         ) {
2186 #if USE_ITT_NOTIFY
2187       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2188           (__kmp_forkjoin_frames_mode == 3 ||
2189            __kmp_forkjoin_frames_mode == 1)) {
2190         kmp_uint64 tmp_time = 0;
2191         if (__itt_get_timestamp_ptr)
2192           tmp_time = __itt_get_timestamp();
2193         // Internal fork - report frame begin
2194         master_th->th.th_frame_time = tmp_time;
2195         if (__kmp_forkjoin_frames_mode == 3)
2196           team->t.t_region_time = tmp_time;
2197       } else
2198 // only one notification scheme (either "submit" or "forking/joined", not both)
2199 #endif /* USE_ITT_NOTIFY */
2200           if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2201               __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2202         // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2203         __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2204       }
2205     }
2206 #endif /* USE_ITT_BUILD */
2207 
2208     /* now go on and do the work */
2209     KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2210     KMP_MB();
2211     KF_TRACE(10,
2212              ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2213               root, team, master_th, gtid));
2214 
2215 #if USE_ITT_BUILD
2216     if (__itt_stack_caller_create_ptr) {
2217       team->t.t_stack_id =
2218           __kmp_itt_stack_caller_create(); // create new stack stitching id
2219       // before entering fork barrier
2220     }
2221 #endif /* USE_ITT_BUILD */
2222 
2223 #if OMP_40_ENABLED
    // AC: skip __kmp_internal_fork for the teams construct; let only the
    // master threads execute
2226     if (ap)
2227 #endif /* OMP_40_ENABLED */
2228     {
2229       __kmp_internal_fork(loc, gtid, team);
2230       KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2231                     "master_th=%p, gtid=%d\n",
2232                     root, team, master_th, gtid));
2233     }
2234 
2235     if (call_context == fork_context_gnu) {
2236       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2237       return TRUE;
2238     }
2239 
2240     /* Invoke microtask for MASTER thread */
2241     KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2242                   team->t.t_id, team->t.t_pkfn));
2243   } // END of timer KMP_fork_call block
2244 
2245   {
2246     KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
2247     KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
2248     if (!team->t.t_invoke(gtid)) {
2249       KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
2250     }
2251   }
2252   KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2253                 team->t.t_id, team->t.t_pkfn));
2254   KMP_MB(); /* Flush all pending memory write invalidates.  */
2255 
2256   KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2257 
2258 #if OMPT_SUPPORT
2259   if (ompt_enabled.enabled) {
2260     master_th->th.ompt_thread_info.state = omp_state_overhead;
2261   }
2262 #endif
2263 
2264   return TRUE;
2265 }
2266 
2267 #if OMPT_SUPPORT
2268 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2269                                             kmp_team_t *team) {
2270   // restore state outside the region
2271   thread->th.ompt_thread_info.state =
2272       ((team->t.t_serialized) ? omp_state_work_serial
2273                               : omp_state_work_parallel);
2274 }
2275 
2276 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2277                                    kmp_team_t *team, ompt_data_t *parallel_data,
2278                                    fork_context_e fork_context, void *codeptr) {
2279   ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2280   if (ompt_enabled.ompt_callback_parallel_end) {
2281     ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2282         parallel_data, &(task_info->task_data), OMPT_INVOKER(fork_context),
2283         codeptr);
2284   }
2285 
2286   task_info->frame.enter_frame = NULL;
2287   __kmp_join_restore_state(thread, team);
2288 }
2289 #endif
2290 
2291 void __kmp_join_call(ident_t *loc, int gtid
2292 #if OMPT_SUPPORT
2293                      ,
2294                      enum fork_context_e fork_context
2295 #endif
2296 #if OMP_40_ENABLED
2297                      ,
2298                      int exit_teams
2299 #endif /* OMP_40_ENABLED */
2300                      ) {
2301   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2302   kmp_team_t *team;
2303   kmp_team_t *parent_team;
2304   kmp_info_t *master_th;
2305   kmp_root_t *root;
2306   int master_active;
2307   int i;
2308 
2309   KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2310 
2311   /* setup current data */
2312   master_th = __kmp_threads[gtid];
2313   root = master_th->th.th_root;
2314   team = master_th->th.th_team;
2315   parent_team = team->t.t_parent;
2316 
2317   master_th->th.th_ident = loc;
2318 
2319 #if OMPT_SUPPORT
2320   if (ompt_enabled.enabled) {
2321     master_th->th.ompt_thread_info.state = omp_state_overhead;
2322   }
2323 #endif
2324 
2325 #if KMP_DEBUG
2326   if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2327     KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2328                   "th_task_team = %p\n",
2329                   __kmp_gtid_from_thread(master_th), team,
2330                   team->t.t_task_team[master_th->th.th_task_state],
2331                   master_th->th.th_task_team));
2332     KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2333                      team->t.t_task_team[master_th->th.th_task_state]);
2334   }
2335 #endif
2336 
2337   if (team->t.t_serialized) {
2338 #if OMP_40_ENABLED
2339     if (master_th->th.th_teams_microtask) {
2340       // We are in teams construct
2341       int level = team->t.t_level;
2342       int tlevel = master_th->th.th_teams_level;
2343       if (level == tlevel) {
        // AC: we haven't incremented it earlier at the start of the teams
        //     construct, so do it here, at the end of the teams construct
2346         team->t.t_level++;
2347       } else if (level == tlevel + 1) {
2348         // AC: we are exiting parallel inside teams, need to increment
2349         // serialization in order to restore it in the next call to
2350         // __kmpc_end_serialized_parallel
2351         team->t.t_serialized++;
2352       }
2353     }
2354 #endif /* OMP_40_ENABLED */
2355     __kmpc_end_serialized_parallel(loc, gtid);
2356 
2357 #if OMPT_SUPPORT
2358     if (ompt_enabled.enabled) {
2359       __kmp_join_restore_state(master_th, parent_team);
2360     }
2361 #endif
2362 
2363     return;
2364   }
2365 
2366   master_active = team->t.t_master_active;
2367 
2368 #if OMP_40_ENABLED
2369   if (!exit_teams)
2370 #endif /* OMP_40_ENABLED */
2371   {
    // AC: No barrier for the internal teams at exit from the teams construct,
    //     but there is a barrier for the external team (league).
2374     __kmp_internal_join(loc, gtid, team);
2375   }
2376 #if OMP_40_ENABLED
2377   else {
    master_th->th.th_task_state =
        0; // AC: no tasking in teams (outside of any parallel region)
2380   }
2381 #endif /* OMP_40_ENABLED */
2382 
2383   KMP_MB();
2384 
2385 #if OMPT_SUPPORT
2386   ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2387   void *codeptr = team->t.ompt_team_info.master_return_address;
2388 #endif
2389 
2390 #if USE_ITT_BUILD
2391   if (__itt_stack_caller_create_ptr) {
2392     __kmp_itt_stack_caller_destroy(
2393         (__itt_caller)team->t
2394             .t_stack_id); // destroy the stack stitching id after join barrier
2395   }
2396 
2397   // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2398   if (team->t.t_active_level == 1
2399 #if OMP_40_ENABLED
2400       && !master_th->th.th_teams_microtask /* not in teams construct */
2401 #endif /* OMP_40_ENABLED */
2402       ) {
2403     master_th->th.th_ident = loc;
2404     // only one notification scheme (either "submit" or "forking/joined", not
2405     // both)
2406     if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2407         __kmp_forkjoin_frames_mode == 3)
2408       __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2409                              master_th->th.th_frame_time, 0, loc,
2410                              master_th->th.th_team_nproc, 1);
2411     else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2412              !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2413       __kmp_itt_region_joined(gtid);
2414   } // active_level == 1
2415 #endif /* USE_ITT_BUILD */
2416 
2417 #if OMP_40_ENABLED
2418   if (master_th->th.th_teams_microtask && !exit_teams &&
2419       team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2420       team->t.t_level == master_th->th.th_teams_level + 1) {
    // AC: We need to leave the team structure intact at the end of a parallel
    // region inside the teams construct, so that the same (hot) team is reused
    // by the next parallel region; only adjust the nesting levels
2424 
2425     /* Decrement our nested depth level */
2426     team->t.t_level--;
2427     team->t.t_active_level--;
2428     KMP_TEST_THEN_DEC32((kmp_int32 *)&root->r.r_in_parallel);
2429 
2430     /* Restore number of threads in the team if needed */
2431     if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2432       int old_num = master_th->th.th_team_nproc;
2433       int new_num = master_th->th.th_teams_size.nth;
2434       kmp_info_t **other_threads = team->t.t_threads;
2435       team->t.t_nproc = new_num;
2436       for (i = 0; i < old_num; ++i) {
2437         other_threads[i]->th.th_team_nproc = new_num;
2438       }
      // Adjust the states of the unused threads of the team
2440       for (i = old_num; i < new_num; ++i) {
2441         // Re-initialize thread's barrier data.
2442         int b;
2443         kmp_balign_t *balign = other_threads[i]->th.th_bar;
2444         for (b = 0; b < bs_last_barrier; ++b) {
2445           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2446           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2447 #if USE_DEBUGGER
2448           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2449 #endif
2450         }
2451         if (__kmp_tasking_mode != tskm_immediate_exec) {
2452           // Synchronize thread's task state
2453           other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2454         }
2455       }
2456     }
2457 
2458 #if OMPT_SUPPORT
2459     if (ompt_enabled.enabled) {
2460       __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
2461                       codeptr);
2462     }
2463 #endif
2464 
2465     return;
2466   }
2467 #endif /* OMP_40_ENABLED */
2468 
2469   /* do cleanup and restore the parent team */
2470   master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2471   master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2472 
2473   master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2474 
2475   /* jc: The following lock has instructions with REL and ACQ semantics,
2476      separating the parallel user code called in this parallel region
2477      from the serial user code called after this function returns. */
2478   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2479 
2480 #if OMP_40_ENABLED
2481   if (!master_th->th.th_teams_microtask ||
2482       team->t.t_level > master_th->th.th_teams_level)
2483 #endif /* OMP_40_ENABLED */
2484   {
2485     /* Decrement our nested depth level */
2486     KMP_TEST_THEN_DEC32((kmp_int32 *)&root->r.r_in_parallel);
2487   }
2488   KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2489 
2490 #if OMPT_SUPPORT
2491   if (ompt_enabled.enabled) {
2492     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2493     if (ompt_enabled.ompt_callback_implicit_task) {
2494       int ompt_team_size = team->t.t_nproc;
2495       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2496           ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2497           __kmp_tid_from_gtid(gtid));
2498     }
2499 
2500     task_info->frame.exit_frame = NULL;
2501     task_info->task_data = ompt_data_none;
2502   }
2503 #endif
2504 
2505   KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2506                 master_th, team));
2507   __kmp_pop_current_task_from_thread(master_th);
2508 
2509 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
2510   // Restore master thread's partition.
2511   master_th->th.th_first_place = team->t.t_first_place;
2512   master_th->th.th_last_place = team->t.t_last_place;
2513 #endif /* OMP_40_ENABLED */
2514 
2515   updateHWFPControl(team);
2516 
2517   if (root->r.r_active != master_active)
2518     root->r.r_active = master_active;
2519 
2520   __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2521                             master_th)); // this will free worker threads
2522 
2523   /* this race was fun to find. make sure the following is in the critical
2524      region otherwise assertions may fail occasionally since the old team may be
2525      reallocated and the hierarchy appears inconsistent. it is actually safe to
2526      run and won't cause any bugs, but will cause those assertion failures. it's
2527      only one deref&assign so might as well put this in the critical region */
2528   master_th->th.th_team = parent_team;
2529   master_th->th.th_team_nproc = parent_team->t.t_nproc;
2530   master_th->th.th_team_master = parent_team->t.t_threads[0];
2531   master_th->th.th_team_serialized = parent_team->t.t_serialized;
2532 
2533   /* restore serialized team, if need be */
2534   if (parent_team->t.t_serialized &&
2535       parent_team != master_th->th.th_serial_team &&
2536       parent_team != root->r.r_root_team) {
2537     __kmp_free_team(root,
2538                     master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2539     master_th->th.th_serial_team = parent_team;
2540   }
2541 
2542   if (__kmp_tasking_mode != tskm_immediate_exec) {
2543     if (master_th->th.th_task_state_top >
2544         0) { // Restore task state from memo stack
2545       KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2546       // Remember master's state if we re-use this nested hot team
2547       master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2548           master_th->th.th_task_state;
2549       --master_th->th.th_task_state_top; // pop
2550       // Now restore state at this level
2551       master_th->th.th_task_state =
2552           master_th->th
2553               .th_task_state_memo_stack[master_th->th.th_task_state_top];
2554     }
2555     // Copy the task team from the parent team to the master thread
2556     master_th->th.th_task_team =
2557         parent_team->t.t_task_team[master_th->th.th_task_state];
2558     KA_TRACE(20,
2559              ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2560               __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2561               parent_team));
2562   }
2563 
2564   // TODO: GEH - cannot do this assertion because root thread not set up as
2565   // executing
2566   // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2567   master_th->th.th_current_task->td_flags.executing = 1;
2568 
2569   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2570 
2571 #if OMPT_SUPPORT
2572   if (ompt_enabled.enabled) {
2573     __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
2574                     codeptr);
2575   }
2576 #endif
2577 
2578   KMP_MB();
2579   KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2580 }
2581 
2582 /* Check whether we should push an internal control record onto the
2583    serial team stack.  If so, do it.  */
2584 void __kmp_save_internal_controls(kmp_info_t *thread) {
2585 
2586   if (thread->th.th_team != thread->th.th_serial_team) {
2587     return;
2588   }
2589   if (thread->th.th_team->t.t_serialized > 1) {
2590     int push = 0;
2591 
2592     if (thread->th.th_team->t.t_control_stack_top == NULL) {
2593       push = 1;
2594     } else {
2595       if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2596           thread->th.th_team->t.t_serialized) {
2597         push = 1;
2598       }
2599     }
2600     if (push) { /* push a record on the serial team's stack */
2601       kmp_internal_control_t *control =
2602           (kmp_internal_control_t *)__kmp_allocate(
2603               sizeof(kmp_internal_control_t));
2604 
2605       copy_icvs(control, &thread->th.th_current_task->td_icvs);
2606 
2607       control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2608 
2609       control->next = thread->th.th_team->t.t_control_stack_top;
2610       thread->th.th_team->t.t_control_stack_top = control;
2611     }
2612   }
2613 }
2614 
2615 /* Changes set_nproc */
2616 void __kmp_set_num_threads(int new_nth, int gtid) {
2617   kmp_info_t *thread;
2618   kmp_root_t *root;
2619 
2620   KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2621   KMP_DEBUG_ASSERT(__kmp_init_serial);
2622 
2623   if (new_nth < 1)
2624     new_nth = 1;
2625   else if (new_nth > __kmp_max_nth)
2626     new_nth = __kmp_max_nth;
2627 
2628   KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2629   thread = __kmp_threads[gtid];
2630 
2631   __kmp_save_internal_controls(thread);
2632 
2633   set__nproc(thread, new_nth);
2634 
2635   // If this omp_set_num_threads() call will cause the hot team size to be
2636   // reduced (in the absence of a num_threads clause), then reduce it now,
2637   // rather than waiting for the next parallel region.
2638   root = thread->th.th_root;
2639   if (__kmp_init_parallel && (!root->r.r_active) &&
2640       (root->r.r_hot_team->t.t_nproc > new_nth)
2641 #if KMP_NESTED_HOT_TEAMS
2642       && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2643 #endif
2644       ) {
2645     kmp_team_t *hot_team = root->r.r_hot_team;
2646     int f;
2647 
2648     __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2649 
2650     // Release the extra threads we don't need any more.
2651     for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2652       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2653       if (__kmp_tasking_mode != tskm_immediate_exec) {
        // When decreasing the team size, threads no longer in the team should
        // unreference the task team.
2656         hot_team->t.t_threads[f]->th.th_task_team = NULL;
2657       }
2658       __kmp_free_thread(hot_team->t.t_threads[f]);
2659       hot_team->t.t_threads[f] = NULL;
2660     }
2661     hot_team->t.t_nproc = new_nth;
2662 #if KMP_NESTED_HOT_TEAMS
2663     if (thread->th.th_hot_teams) {
2664       KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2665       thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2666     }
2667 #endif
2668 
2669     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2670 
2671     // Update the t_nproc field in the threads that are still active.
2672     for (f = 0; f < new_nth; f++) {
2673       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2674       hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2675     }
    // Special flag to indicate that omp_set_num_threads() was called
2677     hot_team->t.t_size_changed = -1;
2678   }
2679 }
2680 
2681 /* Changes max_active_levels */
2682 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2683   kmp_info_t *thread;
2684 
2685   KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2686                 "%d = (%d)\n",
2687                 gtid, max_active_levels));
2688   KMP_DEBUG_ASSERT(__kmp_init_serial);
2689 
2690   // validate max_active_levels
2691   if (max_active_levels < 0) {
2692     KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2693     // We ignore this call if the user has specified a negative value.
2694     // The current setting won't be changed. The last valid setting will be
2695     // used. A warning will be issued (if warnings are allowed as controlled by
2696     // the KMP_WARNINGS env var).
2697     KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2698                   "max_active_levels for thread %d = (%d)\n",
2699                   gtid, max_active_levels));
2700     return;
2701   }
2702   if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2703     // it's OK, the max_active_levels is within the valid range: [ 0;
2704     // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2705     // We allow a zero value. (implementation defined behavior)
2706   } else {
2707     KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2708                 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2709     max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2710     // Current upper limit is MAX_INT. (implementation defined behavior)
2711     // If the input exceeds the upper limit, we correct the input to be the
2712     // upper limit. (implementation defined behavior)
    // Actually, the flow should never get here while the limit is MAX_INT.
2714   }
2715   KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2716                 "max_active_levels for thread %d = (%d)\n",
2717                 gtid, max_active_levels));
2718 
2719   thread = __kmp_threads[gtid];
2720 
2721   __kmp_save_internal_controls(thread);
2722 
2723   set__max_active_levels(thread, max_active_levels);
2724 }
2725 
2726 /* Gets max_active_levels */
2727 int __kmp_get_max_active_levels(int gtid) {
2728   kmp_info_t *thread;
2729 
2730   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2731   KMP_DEBUG_ASSERT(__kmp_init_serial);
2732 
2733   thread = __kmp_threads[gtid];
2734   KMP_DEBUG_ASSERT(thread->th.th_current_task);
2735   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2736                 "curtask_maxaclevel=%d\n",
2737                 gtid, thread->th.th_current_task,
2738                 thread->th.th_current_task->td_icvs.max_active_levels));
2739   return thread->th.th_current_task->td_icvs.max_active_levels;
2740 }
2741 
2742 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2743 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2744   kmp_info_t *thread;
2745   //    kmp_team_t *team;
2746 
2747   KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2748                 gtid, (int)kind, chunk));
2749   KMP_DEBUG_ASSERT(__kmp_init_serial);
2750 
2751   // Check if the kind parameter is valid, correct if needed.
2752   // Valid parameters should fit in one of two intervals - standard or extended:
2753   //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2754   // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
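  // That is, accept only kinds strictly inside (kmp_sched_lower,
  // kmp_sched_upper_std) or strictly inside (kmp_sched_lower_ext,
  // kmp_sched_upper); everything else falls through to the warning below.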
2755   if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2756       (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2757     // TODO: Hint needs attention in case we change the default schedule.
2758     __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2759               KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2760               __kmp_msg_null);
2761     kind = kmp_sched_default;
2762     chunk = 0; // ignore chunk value in case of bad kind
2763   }
2764 
2765   thread = __kmp_threads[gtid];
2766 
2767   __kmp_save_internal_controls(thread);
2768 
2769   if (kind < kmp_sched_upper_std) {
2770     if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
      // distinguish static chunked vs. unchunked: chunk should be invalid to
      // indicate an unchunked schedule (which is the default)
2773       thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2774     } else {
2775       thread->th.th_current_task->td_icvs.sched.r_sched_type =
2776           __kmp_sch_map[kind - kmp_sched_lower - 1];
2777     }
2778   } else {
2779     //    __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2780     //    kmp_sched_lower - 2 ];
2781     thread->th.th_current_task->td_icvs.sched.r_sched_type =
2782         __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2783                       kmp_sched_lower - 2];
2784   }
2785   if (kind == kmp_sched_auto || chunk < 1) {
2786     // ignore parameter chunk for schedule auto
2787     thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2788   } else {
2789     thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2790   }
2791 }
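
// Illustrative only (not part of the runtime): user code that is expected to
// reach __kmp_set_schedule above, assuming the standard omp_set_schedule
// entry point forwards here:
//
//   #include <omp.h>
//   omp_set_schedule(omp_sched_dynamic, 4); // kind/chunk stored in the ICVs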
2792 
2793 /* Gets def_sched_var ICV values */
2794 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2795   kmp_info_t *thread;
2796   enum sched_type th_type;
2797 
2798   KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2799   KMP_DEBUG_ASSERT(__kmp_init_serial);
2800 
2801   thread = __kmp_threads[gtid];
2802 
2803   th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2804 
2805   switch (th_type) {
2806   case kmp_sch_static:
2807   case kmp_sch_static_greedy:
2808   case kmp_sch_static_balanced:
2809     *kind = kmp_sched_static;
    *chunk = 0; // chunk was not set; report zero to indicate this
2811     return;
2812   case kmp_sch_static_chunked:
2813     *kind = kmp_sched_static;
2814     break;
2815   case kmp_sch_dynamic_chunked:
2816     *kind = kmp_sched_dynamic;
2817     break;
2818   case kmp_sch_guided_chunked:
2819   case kmp_sch_guided_iterative_chunked:
2820   case kmp_sch_guided_analytical_chunked:
2821     *kind = kmp_sched_guided;
2822     break;
2823   case kmp_sch_auto:
2824     *kind = kmp_sched_auto;
2825     break;
2826   case kmp_sch_trapezoidal:
2827     *kind = kmp_sched_trapezoidal;
2828     break;
2829 #if KMP_STATIC_STEAL_ENABLED
2830   case kmp_sch_static_steal:
2831     *kind = kmp_sched_static_steal;
2832     break;
2833 #endif
2834   default:
2835     KMP_FATAL(UnknownSchedulingType, th_type);
2836   }
2837 
2838   *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2839 }
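
// Illustrative only: the matching user-level query, assuming omp_get_schedule
// forwards to __kmp_get_schedule above:
//
//   omp_sched_t kind;
//   int chunk;
//   omp_get_schedule(&kind, &chunk); // e.g. kind==omp_sched_dynamic, chunk==4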
2840 
2841 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2842 
2843   int ii, dd;
2844   kmp_team_t *team;
2845   kmp_info_t *thr;
2846 
2847   KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2848   KMP_DEBUG_ASSERT(__kmp_init_serial);
2849 
2850   // validate level
2851   if (level == 0)
2852     return 0;
2853   if (level < 0)
2854     return -1;
2855   thr = __kmp_threads[gtid];
2856   team = thr->th.th_team;
2857   ii = team->t.t_level;
2858   if (level > ii)
2859     return -1;
2860 
2861 #if OMP_40_ENABLED
2862   if (thr->th.th_teams_microtask) {
    // AC: we are in a teams region where multiple nested teams have the same
    // level
2864     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2865     if (level <=
        tlevel) { // otherwise the usual algorithm works (teams untouched)
2867       KMP_DEBUG_ASSERT(ii >= tlevel);
      // AC: Since we need to pass through the teams league, we artificially
      // increase ii
2870       if (ii == tlevel) {
2871         ii += 2; // three teams have same level
2872       } else {
2873         ii++; // two teams have same level
2874       }
2875     }
2876   }
2877 #endif
2878 
2879   if (ii == level)
2880     return __kmp_tid_from_gtid(gtid);
2881 
2882   dd = team->t.t_serialized;
2883   level++;
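  // Walk up the chain of parent teams, consuming serialized nesting levels
  // (t_serialized) along the way, until we reach the team at the requested
  // level.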
2884   while (ii > level) {
2885     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2886     }
2887     if ((team->t.t_serialized) && (!dd)) {
2888       team = team->t.t_parent;
2889       continue;
2890     }
2891     if (ii > level) {
2892       team = team->t.t_parent;
2893       dd = team->t.t_serialized;
2894       ii--;
2895     }
2896   }
2897 
2898   return (dd > 1) ? (0) : (team->t.t_master_tid);
2899 }
2900 
2901 int __kmp_get_team_size(int gtid, int level) {
2902 
2903   int ii, dd;
2904   kmp_team_t *team;
2905   kmp_info_t *thr;
2906 
2907   KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2908   KMP_DEBUG_ASSERT(__kmp_init_serial);
2909 
2910   // validate level
2911   if (level == 0)
2912     return 1;
2913   if (level < 0)
2914     return -1;
2915   thr = __kmp_threads[gtid];
2916   team = thr->th.th_team;
2917   ii = team->t.t_level;
2918   if (level > ii)
2919     return -1;
2920 
2921 #if OMP_40_ENABLED
2922   if (thr->th.th_teams_microtask) {
    // AC: we are in a teams region where multiple nested teams have the same
    // level
2924     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2925     if (level <=
        tlevel) { // otherwise the usual algorithm works (teams untouched)
2927       KMP_DEBUG_ASSERT(ii >= tlevel);
      // AC: Since we need to pass through the teams league, we artificially
      // increase ii
2930       if (ii == tlevel) {
2931         ii += 2; // three teams have same level
2932       } else {
2933         ii++; // two teams have same level
2934       }
2935     }
2936   }
2937 #endif
2938 
2939   while (ii > level) {
2940     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2941     }
2942     if (team->t.t_serialized && (!dd)) {
2943       team = team->t.t_parent;
2944       continue;
2945     }
2946     if (ii > level) {
2947       team = team->t.t_parent;
2948       ii--;
2949     }
2950   }
2951 
2952   return team->t.t_nproc;
2953 }
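
// Illustrative only: the two routines above back the standard nesting queries,
// assuming omp_get_ancestor_thread_num/omp_get_team_size forward here:
//
//   #pragma omp parallel num_threads(2)
//   {
//     int tid1 = omp_get_ancestor_thread_num(1); // thread num at level 1
//     int sz1 = omp_get_team_size(1);            // team size at level 1
//   }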
2954 
2955 kmp_r_sched_t __kmp_get_schedule_global() {
  // This routine exists because the pairs (__kmp_sched, __kmp_chunk) and
  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
  // independently, so the up-to-date schedule can be obtained here.
2959 
2960   kmp_r_sched_t r_sched;
2961 
  // Create the schedule from the 4 globals __kmp_sched, __kmp_chunk,
  // __kmp_static, and __kmp_guided. __kmp_sched should keep its original
  // value so that the user can set KMP_SCHEDULE multiple times and thus have
  // different run-time schedules in different roots (even in OMP 2.5).
2966   if (__kmp_sched == kmp_sch_static) {
2967     // replace STATIC with more detailed schedule (balanced or greedy)
2968     r_sched.r_sched_type = __kmp_static;
2969   } else if (__kmp_sched == kmp_sch_guided_chunked) {
2970     // replace GUIDED with more detailed schedule (iterative or analytical)
2971     r_sched.r_sched_type = __kmp_guided;
2972   } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2973     r_sched.r_sched_type = __kmp_sched;
2974   }
2975 
2976   if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
    // __kmp_chunk may be wrong here (if it was never set)
2978     r_sched.chunk = KMP_DEFAULT_CHUNK;
2979   } else {
2980     r_sched.chunk = __kmp_chunk;
2981   }
2982 
2983   return r_sched;
2984 }
2985 
/* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
   at least argc number of *t_argv entries for the requested team. */
2988 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
2989 
2990   KMP_DEBUG_ASSERT(team);
2991   if (!realloc || argc > team->t.t_max_argc) {
2992 
2993     KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
2994                    "current entries=%d\n",
2995                    team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
2996     /* if previously allocated heap space for args, free them */
2997     if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
2998       __kmp_free((void *)team->t.t_argv);
2999 
3000     if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3001       /* use unused space in the cache line for arguments */
3002       team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3003       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3004                      "argv entries\n",
3005                      team->t.t_id, team->t.t_max_argc));
3006       team->t.t_argv = &team->t.t_inline_argv[0];
3007       if (__kmp_storage_map) {
3008         __kmp_print_storage_map_gtid(
3009             -1, &team->t.t_inline_argv[0],
3010             &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3011             (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3012             team->t.t_id);
3013       }
3014     } else {
3015       /* allocate space for arguments in the heap */
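      /* Sizing policy: small requests round up to KMP_MIN_MALLOC_ARGV_ENTRIES;
         larger requests reserve 2 * argc entries so repeated growth does not
         force a reallocation on every call. */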
3016       team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3017                                ? KMP_MIN_MALLOC_ARGV_ENTRIES
3018                                : 2 * argc;
3019       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3020                      "argv entries\n",
3021                      team->t.t_id, team->t.t_max_argc));
3022       team->t.t_argv =
3023           (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3024       if (__kmp_storage_map) {
3025         __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3026                                      &team->t.t_argv[team->t.t_max_argc],
3027                                      sizeof(void *) * team->t.t_max_argc,
3028                                      "team_%d.t_argv", team->t.t_id);
3029       }
3030     }
3031   }
3032 }
3033 
3034 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3035   int i;
3036   int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3037   team->t.t_threads =
3038       (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3039   team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3040       sizeof(dispatch_shared_info_t) * num_disp_buff);
3041   team->t.t_dispatch =
3042       (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3043   team->t.t_implicit_task_taskdata =
3044       (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3045   team->t.t_max_nproc = max_nth;
3046 
3047   /* setup dispatch buffers */
3048   for (i = 0; i < num_disp_buff; ++i) {
3049     team->t.t_disp_buffer[i].buffer_index = i;
3050 #if OMP_45_ENABLED
3051     team->t.t_disp_buffer[i].doacross_buf_idx = i;
3052 #endif
3053   }
3054 }
3055 
3056 static void __kmp_free_team_arrays(kmp_team_t *team) {
3057   /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3058   int i;
3059   for (i = 0; i < team->t.t_max_nproc; ++i) {
3060     if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3061       __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3062       team->t.t_dispatch[i].th_disp_buffer = NULL;
3063     }
3064   }
3065   __kmp_free(team->t.t_threads);
3066   __kmp_free(team->t.t_disp_buffer);
3067   __kmp_free(team->t.t_dispatch);
3068   __kmp_free(team->t.t_implicit_task_taskdata);
3069   team->t.t_threads = NULL;
3070   team->t.t_disp_buffer = NULL;
3071   team->t.t_dispatch = NULL;
3072   team->t.t_implicit_task_taskdata = 0;
3073 }
3074 
3075 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3076   kmp_info_t **oldThreads = team->t.t_threads;
3077 
3078   __kmp_free(team->t.t_disp_buffer);
3079   __kmp_free(team->t.t_dispatch);
3080   __kmp_free(team->t.t_implicit_task_taskdata);
3081   __kmp_allocate_team_arrays(team, max_nth);
3082 
3083   KMP_MEMCPY(team->t.t_threads, oldThreads,
3084              team->t.t_nproc * sizeof(kmp_info_t *));
3085 
3086   __kmp_free(oldThreads);
3087 }
3088 
3089 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3090 
3091   kmp_r_sched_t r_sched =
3092       __kmp_get_schedule_global(); // get current state of scheduling globals
3093 
3094 #if OMP_40_ENABLED
3095   KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3096 #endif /* OMP_40_ENABLED */
3097 
3098   kmp_internal_control_t g_icvs = {
3099     0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3100     (kmp_int8)__kmp_dflt_nested, // int nested; //internal control
3101     // for nested parallelism (per thread)
3102     (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3103     // adjustment of threads (per thread)
3104     (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3105     // whether blocktime is explicitly set
3106     __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3107 #if KMP_USE_MONITOR
3108     __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3109 // intervals
3110 #endif
3111     __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3112     // next parallel region (per thread)
3113     // (use a max ub on value if __kmp_parallel_initialize not called yet)
3114     __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3115     // for max_active_levels
3116     r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3117 // {sched,chunk} pair
3118 #if OMP_40_ENABLED
3119     __kmp_nested_proc_bind.bind_types[0],
3120     __kmp_default_device,
3121 #endif /* OMP_40_ENABLED */
3122     NULL // struct kmp_internal_control *next;
3123   };
3124 
3125   return g_icvs;
3126 }
3127 
3128 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3129 
3130   kmp_internal_control_t gx_icvs;
3131   gx_icvs.serial_nesting_level =
      0; // probably =team->t.t_serialized, as in __kmp_save_internal_controls
3133   copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3134   gx_icvs.next = NULL;
3135 
3136   return gx_icvs;
3137 }
3138 
3139 static void __kmp_initialize_root(kmp_root_t *root) {
3140   int f;
3141   kmp_team_t *root_team;
3142   kmp_team_t *hot_team;
3143   int hot_team_max_nth;
3144   kmp_r_sched_t r_sched =
3145       __kmp_get_schedule_global(); // get current state of scheduling globals
3146   kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3147   KMP_DEBUG_ASSERT(root);
3148   KMP_ASSERT(!root->r.r_begin);
3149 
3150   /* setup the root state structure */
3151   __kmp_init_lock(&root->r.r_begin_lock);
3152   root->r.r_begin = FALSE;
3153   root->r.r_active = FALSE;
3154   root->r.r_in_parallel = 0;
3155   root->r.r_blocktime = __kmp_dflt_blocktime;
3156   root->r.r_nested = __kmp_dflt_nested;
3157   root->r.r_cg_nthreads = 1;
3158 
3159   /* setup the root team for this task */
3160   /* allocate the root team structure */
3161   KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3162 
3163   root_team =
3164       __kmp_allocate_team(root,
3165                           1, // new_nproc
3166                           1, // max_nproc
3167 #if OMPT_SUPPORT
3168                           ompt_data_none, // root parallel id
3169 #endif
3170 #if OMP_40_ENABLED
3171                           __kmp_nested_proc_bind.bind_types[0],
3172 #endif
3173                           &r_icvs,
3174                           0 // argc
3175                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3176                           );
3177 #if USE_DEBUGGER
3178   // Non-NULL value should be assigned to make the debugger display the root
3179   // team.
3180   TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3181 #endif
3182 
3183   KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3184 
3185   root->r.r_root_team = root_team;
3186   root_team->t.t_control_stack_top = NULL;
3187 
3188   /* initialize root team */
3189   root_team->t.t_threads[0] = NULL;
3190   root_team->t.t_nproc = 1;
3191   root_team->t.t_serialized = 1;
3192   // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3193   root_team->t.t_sched.sched = r_sched.sched;
3194   KA_TRACE(
3195       20,
3196       ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3197        root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3198 
3199   /* setup the  hot team for this task */
3200   /* allocate the hot team structure */
3201   KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3202 
3203   hot_team =
3204       __kmp_allocate_team(root,
3205                           1, // new_nproc
3206                           __kmp_dflt_team_nth_ub * 2, // max_nproc
3207 #if OMPT_SUPPORT
3208                           ompt_data_none, // root parallel id
3209 #endif
3210 #if OMP_40_ENABLED
3211                           __kmp_nested_proc_bind.bind_types[0],
3212 #endif
3213                           &r_icvs,
3214                           0 // argc
3215                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3216                           );
3217   KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3218 
3219   root->r.r_hot_team = hot_team;
3220   root_team->t.t_control_stack_top = NULL;
3221 
3222   /* first-time initialization */
3223   hot_team->t.t_parent = root_team;
3224 
3225   /* initialize hot team */
3226   hot_team_max_nth = hot_team->t.t_max_nproc;
3227   for (f = 0; f < hot_team_max_nth; ++f) {
3228     hot_team->t.t_threads[f] = NULL;
3229   }
3230   hot_team->t.t_nproc = 1;
3231   // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3232   hot_team->t.t_sched.sched = r_sched.sched;
3233   hot_team->t.t_size_changed = 0;
3234 }
3235 
3236 #ifdef KMP_DEBUG
3237 
3238 typedef struct kmp_team_list_item {
3239   kmp_team_p const *entry;
3240   struct kmp_team_list_item *next;
3241 } kmp_team_list_item_t;
3242 typedef kmp_team_list_item_t *kmp_team_list_t;
3243 
3244 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3245     kmp_team_list_t list, // List of teams.
3246     kmp_team_p const *team // Team to add.
3247     ) {
3248 
3249   // List must terminate with item where both entry and next are NULL.
3250   // Team is added to the list only once.
3251   // List is sorted in ascending order by team id.
3252   // Team id is *not* a key.
3253 
3254   kmp_team_list_t l;
3255 
3256   KMP_DEBUG_ASSERT(list != NULL);
3257   if (team == NULL) {
3258     return;
3259   }
3260 
3261   __kmp_print_structure_team_accum(list, team->t.t_parent);
3262   __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3263 
3264   // Search list for the team.
3265   l = list;
3266   while (l->next != NULL && l->entry != team) {
3267     l = l->next;
3268   }
3269   if (l->next != NULL) {
3270     return; // Team has been added before, exit.
3271   }
3272 
3273   // Team is not found. Search list again for insertion point.
3274   l = list;
3275   while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3276     l = l->next;
3277   }
3278 
3279   // Insert team.
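  // The new node gets a copy of the current node, and the current node is then
  // overwritten in place with the new team; this effectively inserts before
  // 'l' without needing a pointer to the previous list item.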
3280   {
3281     kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3282         sizeof(kmp_team_list_item_t));
3283     *item = *l;
3284     l->entry = team;
3285     l->next = item;
3286   }
3287 }
3288 
static void __kmp_print_structure_team(char const *title,
                                       kmp_team_p const *team) {
3292   __kmp_printf("%s", title);
3293   if (team != NULL) {
3294     __kmp_printf("%2x %p\n", team->t.t_id, team);
3295   } else {
3296     __kmp_printf(" - (nil)\n");
3297   }
3298 }
3299 
3300 static void __kmp_print_structure_thread(char const *title,
3301                                          kmp_info_p const *thread) {
3302   __kmp_printf("%s", title);
3303   if (thread != NULL) {
3304     __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3305   } else {
3306     __kmp_printf(" - (nil)\n");
3307   }
3308 }
3309 
3310 void __kmp_print_structure(void) {
3311 
3312   kmp_team_list_t list;
3313 
3314   // Initialize list of teams.
3315   list =
3316       (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3317   list->entry = NULL;
3318   list->next = NULL;
3319 
3320   __kmp_printf("\n------------------------------\nGlobal Thread "
3321                "Table\n------------------------------\n");
3322   {
3323     int gtid;
3324     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3325       __kmp_printf("%2d", gtid);
3326       if (__kmp_threads != NULL) {
3327         __kmp_printf(" %p", __kmp_threads[gtid]);
3328       }
3329       if (__kmp_root != NULL) {
3330         __kmp_printf(" %p", __kmp_root[gtid]);
3331       }
3332       __kmp_printf("\n");
3333     }
3334   }
3335 
3336   // Print out __kmp_threads array.
3337   __kmp_printf("\n------------------------------\nThreads\n--------------------"
3338                "----------\n");
3339   if (__kmp_threads != NULL) {
3340     int gtid;
3341     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3342       kmp_info_t const *thread = __kmp_threads[gtid];
3343       if (thread != NULL) {
3344         __kmp_printf("GTID %2d %p:\n", gtid, thread);
3345         __kmp_printf("    Our Root:        %p\n", thread->th.th_root);
3346         __kmp_print_structure_team("    Our Team:     ", thread->th.th_team);
3347         __kmp_print_structure_team("    Serial Team:  ",
3348                                    thread->th.th_serial_team);
3349         __kmp_printf("    Threads:      %2d\n", thread->th.th_team_nproc);
3350         __kmp_print_structure_thread("    Master:       ",
3351                                      thread->th.th_team_master);
3352         __kmp_printf("    Serialized?:  %2d\n", thread->th.th_team_serialized);
3353         __kmp_printf("    Set NProc:    %2d\n", thread->th.th_set_nproc);
3354 #if OMP_40_ENABLED
3355         __kmp_printf("    Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3356 #endif
3357         __kmp_print_structure_thread("    Next in pool: ",
3358                                      thread->th.th_next_pool);
3359         __kmp_printf("\n");
3360         __kmp_print_structure_team_accum(list, thread->th.th_team);
3361         __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3362       }
3363     }
3364   } else {
3365     __kmp_printf("Threads array is not allocated.\n");
3366   }
3367 
3368   // Print out __kmp_root array.
3369   __kmp_printf("\n------------------------------\nUbers\n----------------------"
3370                "--------\n");
3371   if (__kmp_root != NULL) {
3372     int gtid;
3373     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3374       kmp_root_t const *root = __kmp_root[gtid];
3375       if (root != NULL) {
3376         __kmp_printf("GTID %2d %p:\n", gtid, root);
3377         __kmp_print_structure_team("    Root Team:    ", root->r.r_root_team);
3378         __kmp_print_structure_team("    Hot Team:     ", root->r.r_hot_team);
3379         __kmp_print_structure_thread("    Uber Thread:  ",
3380                                      root->r.r_uber_thread);
3381         __kmp_printf("    Active?:      %2d\n", root->r.r_active);
3382         __kmp_printf("    Nested?:      %2d\n", root->r.r_nested);
3383         __kmp_printf("    In Parallel:  %2d\n", root->r.r_in_parallel);
3384         __kmp_printf("\n");
3385         __kmp_print_structure_team_accum(list, root->r.r_root_team);
3386         __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3387       }
3388     }
3389   } else {
3390     __kmp_printf("Ubers array is not allocated.\n");
3391   }
3392 
3393   __kmp_printf("\n------------------------------\nTeams\n----------------------"
3394                "--------\n");
3395   while (list->next != NULL) {
3396     kmp_team_p const *team = list->entry;
3397     int i;
3398     __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3399     __kmp_print_structure_team("    Parent Team:      ", team->t.t_parent);
3400     __kmp_printf("    Master TID:       %2d\n", team->t.t_master_tid);
3401     __kmp_printf("    Max threads:      %2d\n", team->t.t_max_nproc);
3402     __kmp_printf("    Levels of serial: %2d\n", team->t.t_serialized);
3403     __kmp_printf("    Number threads:   %2d\n", team->t.t_nproc);
3404     for (i = 0; i < team->t.t_nproc; ++i) {
3405       __kmp_printf("    Thread %2d:      ", i);
3406       __kmp_print_structure_thread("", team->t.t_threads[i]);
3407     }
3408     __kmp_print_structure_team("    Next in pool:     ", team->t.t_next_pool);
3409     __kmp_printf("\n");
3410     list = list->next;
3411   }
3412 
3413   // Print out __kmp_thread_pool and __kmp_team_pool.
3414   __kmp_printf("\n------------------------------\nPools\n----------------------"
3415                "--------\n");
3416   __kmp_print_structure_thread("Thread pool:          ",
3417                                CCAST(kmp_info_t *, __kmp_thread_pool));
3418   __kmp_print_structure_team("Team pool:            ",
3419                              CCAST(kmp_team_t *, __kmp_team_pool));
3420   __kmp_printf("\n");
3421 
3422   // Free team list.
3423   while (list != NULL) {
3424     kmp_team_list_item_t *item = list;
3425     list = list->next;
3426     KMP_INTERNAL_FREE(item);
3427   }
3428 }
3429 
3430 #endif
3431 
3432 //---------------------------------------------------------------------------
3433 //  Stuff for per-thread fast random number generator
3434 //  Table of primes
3435 static const unsigned __kmp_primes[] = {
3436     0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3437     0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3438     0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3439     0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3440     0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3441     0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3442     0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3443     0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3444     0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3445     0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3446     0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3447 
3448 //---------------------------------------------------------------------------
3449 //  __kmp_get_random: Get a random number using a linear congruential method.
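//  Per-thread recurrence: x_{n+1} = a * x_n + 1 (modulo the width of
//  unsigned), where the multiplier 'a' is chosen from __kmp_primes in
//  __kmp_init_random below; the high 16 bits of x are returned.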
3450 unsigned short __kmp_get_random(kmp_info_t *thread) {
3451   unsigned x = thread->th.th_x;
3452   unsigned short r = x >> 16;
3453 
3454   thread->th.th_x = x * thread->th.th_a + 1;
3455 
3456   KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3457                 thread->th.th_info.ds.ds_tid, r));
3458 
3459   return r;
3460 }
3461 //--------------------------------------------------------
3462 // __kmp_init_random: Initialize a random number generator
3463 void __kmp_init_random(kmp_info_t *thread) {
3464   unsigned seed = thread->th.th_info.ds.ds_tid;
3465 
3466   thread->th.th_a =
3467       __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3468   thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3469   KA_TRACE(30,
3470            ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3471 }
3472 
3473 #if KMP_OS_WINDOWS
3474 /* reclaim array entries for root threads that are already dead, returns number
3475  * reclaimed */
3476 static int __kmp_reclaim_dead_roots(void) {
3477   int i, r = 0;
3478 
3479   for (i = 0; i < __kmp_threads_capacity; ++i) {
3480     if (KMP_UBER_GTID(i) &&
3481         !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3482         !__kmp_root[i]
3483              ->r.r_active) { // AC: reclaim only roots died in non-active state
3484       r += __kmp_unregister_root_other_thread(i);
3485     }
3486   }
3487   return r;
3488 }
3489 #endif
3490 
3491 /* This function attempts to create free entries in __kmp_threads and
3492    __kmp_root, and returns the number of free entries generated.
3493 
3494    For Windows* OS static library, the first mechanism used is to reclaim array
3495    entries for root threads that are already dead.
3496 
   On all platforms, expansion is attempted on the arrays __kmp_threads and
3498    __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3499    capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3500    threadprivate cache array has been created. Synchronization with
3501    __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3502 
3503    After any dead root reclamation, if the clipping value allows array expansion
3504    to result in the generation of a total of nWish free slots, the function does
3505    that expansion. If not, but the clipping value allows array expansion to
3506    result in the generation of a total of nNeed free slots, the function does
3507    that expansion. Otherwise, nothing is done beyond the possible initial root
3508    thread reclamation. However, if nNeed is zero, a best-effort attempt is made
3509    to fulfil nWish as far as possible, i.e. the function will attempt to create
3510    as many free slots as possible up to nWish.
3511 
3512    If any argument is negative, the behavior is undefined. */
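/* Worked example (illustrative, assuming no threadprivate cache so the clip
   value is __kmp_sys_max_nth, taken here as 16): with __kmp_threads_capacity
   == 4, a call __kmp_expand_threads(10, 2) doubles the capacity 4 -> 8 -> 16,
   creating 12 free slots, which satisfies nWish == 10. */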
3513 static int __kmp_expand_threads(int nWish, int nNeed) {
3514   int added = 0;
3515   int old_tp_cached;
3516   int __kmp_actual_max_nth;
3517 
3518   if (nNeed > nWish) /* normalize the arguments */
3519     nWish = nNeed;
3520 #if KMP_OS_WINDOWS && !defined KMP_DYNAMIC_LIB
3521   /* only for Windows static library */
3522   /* reclaim array entries for root threads that are already dead */
3523   added = __kmp_reclaim_dead_roots();
3524 
3525   if (nNeed) {
3526     nNeed -= added;
3527     if (nNeed < 0)
3528       nNeed = 0;
3529   }
3530   if (nWish) {
3531     nWish -= added;
3532     if (nWish < 0)
3533       nWish = 0;
3534   }
3535 #endif
3536   if (nWish <= 0)
3537     return added;
3538 
3539   while (1) {
3540     int nTarget;
3541     int minimumRequiredCapacity;
3542     int newCapacity;
3543     kmp_info_t **newThreads;
3544     kmp_root_t **newRoot;
3545 
3546     // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3547     // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3548     // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3549     // > __kmp_max_nth in one of two ways:
3550     //
3551     // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
    //    may not be reused by another thread, so we may need to increase
3553     //    __kmp_threads_capacity to __kmp_max_nth + 1.
3554     //
3555     // 2) New foreign root(s) are encountered.  We always register new foreign
3556     //    roots. This may cause a smaller # of threads to be allocated at
3557     //    subsequent parallel regions, but the worker threads hang around (and
3558     //    eventually go to sleep) and need slots in the __kmp_threads[] array.
3559     //
3560     // Anyway, that is the reason for moving the check to see if
3561     // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3562     // instead of having it performed here. -BB
3563     old_tp_cached = __kmp_tp_cached;
3564     __kmp_actual_max_nth =
3565         old_tp_cached ? __kmp_tp_capacity : __kmp_sys_max_nth;
3566     KMP_DEBUG_ASSERT(__kmp_actual_max_nth >= __kmp_threads_capacity);
3567 
3568     /* compute expansion headroom to check if we can expand and whether to aim
3569        for nWish or nNeed */
3570     nTarget = nWish;
3571     if (__kmp_actual_max_nth - __kmp_threads_capacity < nTarget) {
3572       /* can't fulfil nWish, so try nNeed */
3573       if (nNeed) {
3574         nTarget = nNeed;
3575         if (__kmp_actual_max_nth - __kmp_threads_capacity < nTarget) {
3576           /* possible expansion too small -- give up */
3577           break;
3578         }
3579       } else {
3580         /* best-effort */
3581         nTarget = __kmp_actual_max_nth - __kmp_threads_capacity;
3582         if (!nTarget) {
          /* cannot expand at all -- give up */
3584           break;
3585         }
3586       }
3587     }
3588     minimumRequiredCapacity = __kmp_threads_capacity + nTarget;
3589 
3590     newCapacity = __kmp_threads_capacity;
3591     do {
3592       newCapacity = newCapacity <= (__kmp_actual_max_nth >> 1)
3593                         ? (newCapacity << 1)
3594                         : __kmp_actual_max_nth;
3595     } while (newCapacity < minimumRequiredCapacity);
3596     newThreads = (kmp_info_t **)__kmp_allocate(
3597         (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity +
3598         CACHE_LINE);
3599     newRoot = (kmp_root_t **)((char *)newThreads +
3600                               sizeof(kmp_info_t *) * newCapacity);
3601     KMP_MEMCPY(newThreads, __kmp_threads,
3602                __kmp_threads_capacity * sizeof(kmp_info_t *));
3603     KMP_MEMCPY(newRoot, __kmp_root,
3604                __kmp_threads_capacity * sizeof(kmp_root_t *));
3605     memset(newThreads + __kmp_threads_capacity, 0,
3606            (newCapacity - __kmp_threads_capacity) * sizeof(kmp_info_t *));
3607     memset(newRoot + __kmp_threads_capacity, 0,
3608            (newCapacity - __kmp_threads_capacity) * sizeof(kmp_root_t *));
3609 
3610     if (!old_tp_cached && __kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3611       /* __kmp_tp_cached has changed, i.e. __kmpc_threadprivate_cached has
3612          allocated a threadprivate cache while we were allocating the expanded
3613          array, and our new capacity is larger than the threadprivate cache
3614          capacity, so we should deallocate the expanded arrays and try again.
3615          This is the first check of a double-check pair. */
3616       __kmp_free(newThreads);
3617       continue; /* start over and try again */
3618     }
3619     __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3620     if (!old_tp_cached && __kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
      /* Same check as above, but this time with the lock held so we can be
         certain whether we will succeed. */
3623       __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3624       __kmp_free(newThreads);
3625       continue; /* start over and try again */
3626     } else {
3627       /* success */
3628       // __kmp_free( __kmp_threads ); // ATT: It leads to crash. Need to be
3629       // investigated.
3630       *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3631       *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3632       added += newCapacity - __kmp_threads_capacity;
3633       *(volatile int *)&__kmp_threads_capacity = newCapacity;
3634       __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3635       break; /* succeeded, so we can exit the loop */
3636     }
3637   }
3638   return added;
3639 }
3640 
3641 /* Register the current thread as a root thread and obtain our gtid. We must
   have the __kmp_initz_lock held at this point. The argument is TRUE only if
   we are the thread that calls from __kmp_do_serial_initialize() */
3644 int __kmp_register_root(int initial_thread) {
3645   kmp_info_t *root_thread;
3646   kmp_root_t *root;
3647   int gtid;
3648   int capacity;
3649   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3650   KA_TRACE(20, ("__kmp_register_root: entered\n"));
3651   KMP_MB();
3652 
  /* 2007-03-02:
     If the initial thread did not invoke the OpenMP RTL yet, and this thread
     is not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity"
     condition does not work as expected -- it may return false (meaning there
     is at least one empty slot in the __kmp_threads array), while the only
     free slot may be #0, which is reserved for the initial thread and so
     cannot be used for this one. The following code works around this bug.

     However, the right solution seems to be not to reserve slot #0 for the
     initial thread because:
     (1) there is no magic in slot #0,
     (2) we cannot detect the initial thread reliably (the first thread that
         performs serial initialization may not be the real initial thread).
  */
3667   capacity = __kmp_threads_capacity;
3668   if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3669     --capacity;
3670   }
3671 
3672   /* see if there are too many threads */
3673   if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1, 1)) {
3674     if (__kmp_tp_cached) {
3675       __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3676                   KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3677                   KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3678     } else {
3679       __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3680                   __kmp_msg_null);
3681     }
3682   }
3683 
3684   /* find an available thread slot */
  /* Don't reassign the zero slot since we need that to only be used by the
     initial thread */
3687   for (gtid = (initial_thread ? 0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL;
3688        gtid++)
3689     ;
3690   KA_TRACE(1,
3691            ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3692   KMP_ASSERT(gtid < __kmp_threads_capacity);
3693 
3694   /* update global accounting */
3695   __kmp_all_nth++;
3696   TCW_4(__kmp_nth, __kmp_nth + 1);
3697 
3698   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3699   // numbers of procs, and method #2 (keyed API call) for higher numbers.
3700   if (__kmp_adjust_gtid_mode) {
3701     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3702       if (TCR_4(__kmp_gtid_mode) != 2) {
3703         TCW_4(__kmp_gtid_mode, 2);
3704       }
3705     } else {
3706       if (TCR_4(__kmp_gtid_mode) != 1) {
3707         TCW_4(__kmp_gtid_mode, 1);
3708       }
3709     }
3710   }
3711 
3712 #ifdef KMP_ADJUST_BLOCKTIME
3713   /* Adjust blocktime to zero if necessary            */
3714   /* Middle initialization might not have occurred yet */
3715   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3716     if (__kmp_nth > __kmp_avail_proc) {
3717       __kmp_zero_bt = TRUE;
3718     }
3719   }
3720 #endif /* KMP_ADJUST_BLOCKTIME */
3721 
3722   /* setup this new hierarchy */
3723   if (!(root = __kmp_root[gtid])) {
3724     root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3725     KMP_DEBUG_ASSERT(!root->r.r_root_team);
3726   }
3727 
3728 #if KMP_STATS_ENABLED
3729   // Initialize stats as soon as possible (right after gtid assignment).
3730   __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3731   KMP_START_EXPLICIT_TIMER(OMP_worker_thread_life);
3732   KMP_SET_THREAD_STATE(SERIAL_REGION);
3733   KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3734 #endif
3735   __kmp_initialize_root(root);
3736 
3737   /* setup new root thread structure */
3738   if (root->r.r_uber_thread) {
3739     root_thread = root->r.r_uber_thread;
3740   } else {
3741     root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3742     if (__kmp_storage_map) {
3743       __kmp_print_thread_storage_map(root_thread, gtid);
3744     }
3745     root_thread->th.th_info.ds.ds_gtid = gtid;
3746 #if OMPT_SUPPORT
3747     root_thread->th.ompt_thread_info.thread_data.ptr = NULL;
3748 #endif
3749     root_thread->th.th_root = root;
3750     if (__kmp_env_consistency_check) {
3751       root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3752     }
3753 #if USE_FAST_MEMORY
3754     __kmp_initialize_fast_memory(root_thread);
3755 #endif /* USE_FAST_MEMORY */
3756 
3757 #if KMP_USE_BGET
3758     KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3759     __kmp_initialize_bget(root_thread);
3760 #endif
3761     __kmp_init_random(root_thread); // Initialize random number generator
3762   }
3763 
3764   /* setup the serial team held in reserve by the root thread */
3765   if (!root_thread->th.th_serial_team) {
3766     kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3767     KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3768     root_thread->th.th_serial_team =
3769         __kmp_allocate_team(root, 1, 1,
3770 #if OMPT_SUPPORT
3771                             ompt_data_none, // root parallel id
3772 #endif
3773 #if OMP_40_ENABLED
3774                             proc_bind_default,
3775 #endif
3776                             &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3777   }
3778   KMP_ASSERT(root_thread->th.th_serial_team);
3779   KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3780                 root_thread->th.th_serial_team));
3781 
3782   /* drop root_thread into place */
3783   TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3784 
3785   root->r.r_root_team->t.t_threads[0] = root_thread;
3786   root->r.r_hot_team->t.t_threads[0] = root_thread;
3787   root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
  // AC: the team is created in reserve, not for execution (it is unused for
  // now).
3789   root_thread->th.th_serial_team->t.t_serialized = 0;
3790   root->r.r_uber_thread = root_thread;
3791 
3792   /* initialize the thread, get it ready to go */
3793   __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3794   TCW_4(__kmp_init_gtid, TRUE);
3795 
3796   /* prepare the master thread for get_gtid() */
3797   __kmp_gtid_set_specific(gtid);
3798 
3799 #if USE_ITT_BUILD
3800   __kmp_itt_thread_name(gtid);
3801 #endif /* USE_ITT_BUILD */
3802 
3803 #ifdef KMP_TDATA_GTID
3804   __kmp_gtid = gtid;
3805 #endif
3806   __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3807   KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3808 
3809   KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3810                 "plain=%u\n",
3811                 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3812                 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3813                 KMP_INIT_BARRIER_STATE));
3814   { // Initialize barrier data.
3815     int b;
3816     for (b = 0; b < bs_last_barrier; ++b) {
3817       root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3818 #if USE_DEBUGGER
3819       root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3820 #endif
3821     }
3822   }
3823   KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3824                    KMP_INIT_BARRIER_STATE);
3825 
3826 #if KMP_AFFINITY_SUPPORTED
3827 #if OMP_40_ENABLED
3828   root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3829   root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3830   root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3831   root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3832 #endif
3833 
3834   if (TCR_4(__kmp_init_middle)) {
3835     __kmp_affinity_set_init_mask(gtid, TRUE);
3836   }
3837 #endif /* KMP_AFFINITY_SUPPORTED */
3838 
3839   __kmp_root_counter++;
3840 
3841 #if OMPT_SUPPORT
3842   if (!initial_thread && ompt_enabled.enabled) {
3843 
3844     ompt_thread_t *root_thread = ompt_get_thread();
3845 
3846     ompt_set_thread_state(root_thread, omp_state_overhead);
3847 
3848     if (ompt_enabled.ompt_callback_thread_begin) {
3849       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3850           ompt_thread_initial, __ompt_get_thread_data_internal());
3851     }
3852     ompt_data_t *task_data;
3853     __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
3854     if (ompt_enabled.ompt_callback_task_create) {
3855       ompt_callbacks.ompt_callback(ompt_callback_task_create)(
3856           NULL, NULL, task_data, ompt_task_initial, 0, NULL);
3857       // initial task has nothing to return to
3858     }
3859 
3860     ompt_set_thread_state(root_thread, omp_state_work_serial);
3861   }
3862 #endif
3863 
3864   KMP_MB();
3865   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3866 
3867   return gtid;
3868 }
3869 
3870 #if KMP_NESTED_HOT_TEAMS
3871 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3872                                 const int max_level) {
3873   int i, n, nth;
3874   kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3875   if (!hot_teams || !hot_teams[level].hot_team) {
3876     return 0;
3877   }
3878   KMP_DEBUG_ASSERT(level < max_level);
3879   kmp_team_t *team = hot_teams[level].hot_team;
3880   nth = hot_teams[level].hot_team_nth;
3881   n = nth - 1; // master is not freed
3882   if (level < max_level - 1) {
3883     for (i = 0; i < nth; ++i) {
3884       kmp_info_t *th = team->t.t_threads[i];
3885       n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3886       if (i > 0 && th->th.th_hot_teams) {
3887         __kmp_free(th->th.th_hot_teams);
3888         th->th.th_hot_teams = NULL;
3889       }
3890     }
3891   }
3892   __kmp_free_team(root, team, NULL);
3893   return n;
3894 }
3895 #endif
3896 
// Resets a root thread and clears its root and hot teams.
3898 // Returns the number of __kmp_threads entries directly and indirectly freed.
3899 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3900   kmp_team_t *root_team = root->r.r_root_team;
3901   kmp_team_t *hot_team = root->r.r_hot_team;
3902   int n = hot_team->t.t_nproc;
3903   int i;
3904 
3905   KMP_DEBUG_ASSERT(!root->r.r_active);
3906 
3907   root->r.r_root_team = NULL;
3908   root->r.r_hot_team = NULL;
3909   // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
  // before calling __kmp_free_team().
3911   __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3912 #if KMP_NESTED_HOT_TEAMS
3913   if (__kmp_hot_teams_max_level >
3914       0) { // need to free nested hot teams and their threads if any
3915     for (i = 0; i < hot_team->t.t_nproc; ++i) {
3916       kmp_info_t *th = hot_team->t.t_threads[i];
3917       if (__kmp_hot_teams_max_level > 1) {
3918         n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3919       }
3920       if (th->th.th_hot_teams) {
3921         __kmp_free(th->th.th_hot_teams);
3922         th->th.th_hot_teams = NULL;
3923       }
3924     }
3925   }
3926 #endif
3927   __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3928 
3929   // Before we can reap the thread, we need to make certain that all other
3930   // threads in the teams that had this root as ancestor have stopped trying to
3931   // steal tasks.
3932   if (__kmp_tasking_mode != tskm_immediate_exec) {
3933     __kmp_wait_to_unref_task_teams();
3934   }
3935 
3936 #if KMP_OS_WINDOWS
3937   /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3938   KA_TRACE(
3939       10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3940            "\n",
3941            (LPVOID) & (root->r.r_uber_thread->th),
3942            root->r.r_uber_thread->th.th_info.ds.ds_thread));
3943   __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3944 #endif /* KMP_OS_WINDOWS */
3945 
3946 #if OMPT_SUPPORT
3947   if (ompt_enabled.ompt_callback_thread_end) {
3948     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3949         &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3950   }
3951 #endif
3952 
3953   TCW_4(__kmp_nth,
3954         __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3955   root->r.r_cg_nthreads--;
3956 
3957   __kmp_reap_thread(root->r.r_uber_thread, 1);
3958 
  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
  // it instead of freeing it.
3961   root->r.r_uber_thread = NULL;
3962   /* mark root as no longer in use */
3963   root->r.r_begin = FALSE;
3964 
3965   return n;
3966 }
3967 
3968 void __kmp_unregister_root_current_thread(int gtid) {
3969   KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
  /* This lock should be OK, since unregister_root_current_thread is never
     called during an abort, only during a normal close. Furthermore, if you
     have the forkjoin lock, you should never try to get the initz lock. */
3973   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3974   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
3975     KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
3976                   "exiting T#%d\n",
3977                   gtid));
3978     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3979     return;
3980   }
3981   kmp_root_t *root = __kmp_root[gtid];
3982 
3983   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3984   KMP_ASSERT(KMP_UBER_GTID(gtid));
3985   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3986   KMP_ASSERT(root->r.r_active == FALSE);
3987 
3988   KMP_MB();
3989 
3990 #if OMP_45_ENABLED
3991   kmp_info_t *thread = __kmp_threads[gtid];
3992   kmp_team_t *team = thread->th.th_team;
3993   kmp_task_team_t *task_team = thread->th.th_task_team;
3994 
3995   // we need to wait for the proxy tasks before finishing the thread
3996   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
3997 #if OMPT_SUPPORT
3998     // the runtime is shutting down so we won't report any events
3999     thread->th.ompt_thread_info.state = omp_state_undefined;
4000 #endif
4001     __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4002   }
4003 #endif
4004 
4005   __kmp_reset_root(gtid, root);
4006 
4007   /* free up this thread slot */
4008   __kmp_gtid_set_specific(KMP_GTID_DNE);
4009 #ifdef KMP_TDATA_GTID
4010   __kmp_gtid = KMP_GTID_DNE;
4011 #endif
4012 
4013   KMP_MB();
4014   KC_TRACE(10,
4015            ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4016 
4017   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4018 }
4019 
4020 #if KMP_OS_WINDOWS
4021 /* __kmp_forkjoin_lock must be already held
4022    Unregisters a root thread that is not the current thread.  Returns the number
4023    of __kmp_threads entries freed as a result. */
4024 static int __kmp_unregister_root_other_thread(int gtid) {
4025   kmp_root_t *root = __kmp_root[gtid];
4026   int r;
4027 
4028   KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4029   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4030   KMP_ASSERT(KMP_UBER_GTID(gtid));
4031   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4032   KMP_ASSERT(root->r.r_active == FALSE);
4033 
4034   r = __kmp_reset_root(gtid, root);
4035   KC_TRACE(10,
4036            ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4037   return r;
4038 }
4039 #endif
4040 
4041 #if KMP_DEBUG
4042 void __kmp_task_info() {
4043 
4044   kmp_int32 gtid = __kmp_entry_gtid();
4045   kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4046   kmp_info_t *this_thr = __kmp_threads[gtid];
4047   kmp_team_t *steam = this_thr->th.th_serial_team;
4048   kmp_team_t *team = this_thr->th.th_team;
4049 
4050   __kmp_printf("__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p curtask=%p "
4051                "ptask=%p\n",
4052                gtid, tid, this_thr, team, this_thr->th.th_current_task,
4053                team->t.t_implicit_task_taskdata[tid].td_parent);
4054 }
4055 #endif // KMP_DEBUG
4056 
4057 /* TODO optimize with one big memclr, take out what isn't needed, split
4058    responsibility to workers as much as possible, and delay initialization of
4059    features as much as possible  */
4060 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4061                                   int tid, int gtid) {
4062   /* this_thr->th.th_info.ds.ds_gtid is setup in
4063      kmp_allocate_thread/create_worker.
4064      this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4065   kmp_info_t *master = team->t.t_threads[0];
4066   KMP_DEBUG_ASSERT(this_thr != NULL);
4067   KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4068   KMP_DEBUG_ASSERT(team);
4069   KMP_DEBUG_ASSERT(team->t.t_threads);
4070   KMP_DEBUG_ASSERT(team->t.t_dispatch);
4071   KMP_DEBUG_ASSERT(master);
4072   KMP_DEBUG_ASSERT(master->th.th_root);
4073 
4074   KMP_MB();
4075 
4076   TCW_SYNC_PTR(this_thr->th.th_team, team);
4077 
4078   this_thr->th.th_info.ds.ds_tid = tid;
4079   this_thr->th.th_set_nproc = 0;
4080   if (__kmp_tasking_mode != tskm_immediate_exec)
4081     // When tasking is possible, threads are not safe to reap until they are
4082     // done tasking; this will be set when tasking code is exited in wait
4083     this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4084   else // no tasking --> always safe to reap
4085     this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4086 #if OMP_40_ENABLED
4087   this_thr->th.th_set_proc_bind = proc_bind_default;
4088 #if KMP_AFFINITY_SUPPORTED
4089   this_thr->th.th_new_place = this_thr->th.th_current_place;
4090 #endif
4091 #endif
4092   this_thr->th.th_root = master->th.th_root;
4093 
4094   /* setup the thread's cache of the team structure */
4095   this_thr->th.th_team_nproc = team->t.t_nproc;
4096   this_thr->th.th_team_master = master;
4097   this_thr->th.th_team_serialized = team->t.t_serialized;
4098   TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4099 
4100   KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4101 
4102   KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4103                 tid, gtid, this_thr, this_thr->th.th_current_task));
4104 
4105   __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4106                            team, tid, TRUE);
4107 
4108   KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4109                 tid, gtid, this_thr, this_thr->th.th_current_task));
4110   // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4111   // __kmp_initialize_team()?
4112 
4113   /* TODO no worksharing in speculative threads */
4114   this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4115 
4116   this_thr->th.th_local.this_construct = 0;
4117 
4118   if (!this_thr->th.th_pri_common) {
4119     this_thr->th.th_pri_common =
4120         (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4121     if (__kmp_storage_map) {
4122       __kmp_print_storage_map_gtid(
4123           gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4124           sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4125     }
4126     this_thr->th.th_pri_head = NULL;
4127   }
4128 
4129   /* Initialize dynamic dispatch */
4130   {
4131     volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4132     // Use team max_nproc since this will never change for the team.
4133     size_t disp_size =
4134         sizeof(dispatch_private_info_t) *
4135         (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4136     KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4137                   team->t.t_max_nproc));
4138     KMP_ASSERT(dispatch);
4139     KMP_DEBUG_ASSERT(team->t.t_dispatch);
4140     KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4141 
4142     dispatch->th_disp_index = 0;
4143 #if OMP_45_ENABLED
4144     dispatch->th_doacross_buf_idx = 0;
4145 #endif
4146     if (!dispatch->th_disp_buffer) {
4147       dispatch->th_disp_buffer =
4148           (dispatch_private_info_t *)__kmp_allocate(disp_size);
4149 
4150       if (__kmp_storage_map) {
4151         __kmp_print_storage_map_gtid(
4152             gtid, &dispatch->th_disp_buffer[0],
4153             &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4154                                           ? 1
4155                                           : __kmp_dispatch_num_buffers],
4156             disp_size, "th_%d.th_dispatch.th_disp_buffer "
4157                        "(team_%d.t_dispatch[%d].th_disp_buffer)",
4158             gtid, team->t.t_id, gtid);
4159       }
4160     } else {
4161       memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4162     }
4163 
4164     dispatch->th_dispatch_pr_current = 0;
4165     dispatch->th_dispatch_sh_current = 0;
4166 
4167     dispatch->th_deo_fcn = 0; /* ORDERED     */
4168     dispatch->th_dxo_fcn = 0; /* END ORDERED */
4169   }
4170 
4171   this_thr->th.th_next_pool = NULL;
4172 
4173   if (!this_thr->th.th_task_state_memo_stack) {
4174     size_t i;
4175     this_thr->th.th_task_state_memo_stack =
4176         (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4177     this_thr->th.th_task_state_top = 0;
4178     this_thr->th.th_task_state_stack_sz = 4;
4179     for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4180          ++i) // zero init the stack
4181       this_thr->th.th_task_state_memo_stack[i] = 0;
4182   }
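  // The memo stack saves th_task_state values across nested parallel regions,
  // one entry per nesting level (the nested hot team path in
  // __kmp_allocate_team indexes it by level); 4 is just the initial capacity.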
4183 
4184   KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4185   KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4186 
4187   KMP_MB();
4188 }
4189 
4190 /* Allocate a new thread for the requesting team. This is only called from
4191    within a forkjoin critical section. We will first try to get an available
4192    thread from the thread pool. If none is available, we will fork a new one,
4193    assuming we are able to create one; this should be assured, as the caller
4194    is expected to check for that first. */
4195 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4196                                   int new_tid) {
4197   kmp_team_t *serial_team;
4198   kmp_info_t *new_thr;
4199   int new_gtid;
4200 
4201   KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4202   KMP_DEBUG_ASSERT(root && team);
4203 #if !KMP_NESTED_HOT_TEAMS
4204   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4205 #endif
4206   KMP_MB();
4207 
4208   /* first, try to get one from the thread pool */
4209   if (__kmp_thread_pool) {
4210 
4211     new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4212     __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4213     if (new_thr == __kmp_thread_pool_insert_pt) {
4214       __kmp_thread_pool_insert_pt = NULL;
4215     }
4216     TCW_4(new_thr->th.th_in_pool, FALSE);
4217     // Don't touch th_active_in_pool or th_active.
4218     // The worker thread adjusts those flags as it sleeps/awakens.
4219     __kmp_thread_pool_nth--;
4220 
4221     KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4222                   __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4223     KMP_ASSERT(!new_thr->th.th_team);
4224     KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4225     KMP_DEBUG_ASSERT(__kmp_thread_pool_nth >= 0);
4226 
4227     /* setup the thread structure */
4228     __kmp_initialize_info(new_thr, team, new_tid,
4229                           new_thr->th.th_info.ds.ds_gtid);
4230     KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4231 
4232     TCW_4(__kmp_nth, __kmp_nth + 1);
4233     root->r.r_cg_nthreads++;
4234 
4235     new_thr->th.th_task_state = 0;
4236     new_thr->th.th_task_state_top = 0;
4237     new_thr->th.th_task_state_stack_sz = 4;
4238 
4239 #ifdef KMP_ADJUST_BLOCKTIME
4240     /* Adjust blocktime back to zero if necessary */
4241     /* Middle initialization might not have occurred yet */
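    /* Once more threads are active than there are available processors,
       blocktime is forced to zero so idle threads yield rather than spin for
       the full blocktime interval. */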
4242     if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4243       if (__kmp_nth > __kmp_avail_proc) {
4244         __kmp_zero_bt = TRUE;
4245       }
4246     }
4247 #endif /* KMP_ADJUST_BLOCKTIME */
4248 
4249 #if KMP_DEBUG
4250     // If thread entered pool via __kmp_free_thread, wait_flag should !=
4251     // KMP_BARRIER_PARENT_FLAG.
4252     int b;
4253     kmp_balign_t *balign = new_thr->th.th_bar;
4254     for (b = 0; b < bs_last_barrier; ++b)
4255       KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4256 #endif
4257 
4258     KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4259                   __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4260 
4261     KMP_MB();
4262     return new_thr;
4263   }
4264 
4265   /* no thread available in the pool, so we'll fork a new one */
4266   KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4267   KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4268 
4269 #if KMP_USE_MONITOR
4270   // If this is the first worker thread the RTL is creating, then also
4271   // launch the monitor thread.  We try to do this as early as possible.
4272   if (!TCR_4(__kmp_init_monitor)) {
4273     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4274     if (!TCR_4(__kmp_init_monitor)) {
4275       KF_TRACE(10, ("before __kmp_create_monitor\n"));
4276       TCW_4(__kmp_init_monitor, 1);
4277       __kmp_create_monitor(&__kmp_monitor);
4278       KF_TRACE(10, ("after __kmp_create_monitor\n"));
4279 #if KMP_OS_WINDOWS
4280       // AC: wait until the monitor has started. This is a fix for CQ232808.
4281       // The reason is that if the library is loaded/unloaded in a loop with
4282       // small (parallel) work in between, then there is a high probability
4283       // that the monitor thread starts after the library shutdown. At shutdown
4284       // it is too late to cope with the problem, because when the master is in
4285       // DllMain (process detach) the monitor has no chance to start (it is
4286       // blocked), and the master has no means to inform the monitor that the
4287       // library has gone, because all the memory the monitor could access is
4288       // going to be released/reset.
4289       while (TCR_4(__kmp_init_monitor) < 2) {
4290         KMP_YIELD(TRUE);
4291       }
4292       KF_TRACE(10, ("after monitor thread has started\n"));
4293 #endif
4294     }
4295     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4296   }
4297 #endif
4298 
4299   KMP_MB();
4300   for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) {
4301     KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4302   }
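  /* Linear scan for the first unused gtid slot; the scan starts at 1 so a
     worker never takes slot 0, and the assert relies on the capacity check
     the caller is expected to have done already. */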
4303 
4304   /* allocate space for it. */
4305   new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4306 
4307   TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4308 
4309   if (__kmp_storage_map) {
4310     __kmp_print_thread_storage_map(new_thr, new_gtid);
4311   }
4312 
4313   // add the reserve serialized team, initialized from the team's master thread
4314   {
4315     kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4316     KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4317     new_thr->th.th_serial_team = serial_team =
4318         (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4319 #if OMPT_SUPPORT
4320                                           ompt_data_none, // root parallel id
4321 #endif
4322 #if OMP_40_ENABLED
4323                                           proc_bind_default,
4324 #endif
4325                                           &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
4326   }
4327   KMP_ASSERT(serial_team);
4328   serial_team->t.t_serialized = 0; // AC: the team is created in reserve, not
4329   // for execution (it is unused for now).
4330   serial_team->t.t_threads[0] = new_thr;
4331   KF_TRACE(10,
4332            ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4333             new_thr));
4334 
4335   /* setup the thread structures */
4336   __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4337 
4338 #if USE_FAST_MEMORY
4339   __kmp_initialize_fast_memory(new_thr);
4340 #endif /* USE_FAST_MEMORY */
4341 
4342 #if KMP_USE_BGET
4343   KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4344   __kmp_initialize_bget(new_thr);
4345 #endif
4346 
4347   __kmp_init_random(new_thr); // Initialize random number generator
4348 
4349   /* Initialize these only once when thread is grabbed for a team allocation */
4350   KA_TRACE(20,
4351            ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4352             __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4353 
4354   int b;
4355   kmp_balign_t *balign = new_thr->th.th_bar;
4356   for (b = 0; b < bs_last_barrier; ++b) {
4357     balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4358     balign[b].bb.team = NULL;
4359     balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4360     balign[b].bb.use_oncore_barrier = 0;
4361   }
4362 
4363   new_thr->th.th_spin_here = FALSE;
4364   new_thr->th.th_next_waiting = 0;
4365 
4366 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4367   new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4368   new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4369   new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4370   new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4371 #endif
4372 
4373   TCW_4(new_thr->th.th_in_pool, FALSE);
4374   new_thr->th.th_active_in_pool = FALSE;
4375   TCW_4(new_thr->th.th_active, TRUE);
4376 
4377   /* adjust the global counters */
4378   __kmp_all_nth++;
4379   __kmp_nth++;
4380 
4381   root->r.r_cg_nthreads++;
4382 
4383   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4384   // numbers of procs, and method #2 (keyed API call) for higher numbers.
4385   if (__kmp_adjust_gtid_mode) {
4386     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4387       if (TCR_4(__kmp_gtid_mode) != 2) {
4388         TCW_4(__kmp_gtid_mode, 2);
4389       }
4390     } else {
4391       if (TCR_4(__kmp_gtid_mode) != 1) {
4392         TCW_4(__kmp_gtid_mode, 1);
4393       }
4394     }
4395   }
4396 
4397 #ifdef KMP_ADJUST_BLOCKTIME
4398   /* Adjust blocktime back to zero if necessary       */
4399   /* Middle initialization might not have occurred yet */
4400   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4401     if (__kmp_nth > __kmp_avail_proc) {
4402       __kmp_zero_bt = TRUE;
4403     }
4404   }
4405 #endif /* KMP_ADJUST_BLOCKTIME */
4406 
4407   /* actually fork it and create the new worker thread */
4408   KF_TRACE(
4409       10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4410   __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4411   KF_TRACE(10,
4412            ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4413 
4414   KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4415                 new_gtid));
4416   KMP_MB();
4417   return new_thr;
4418 }
4419 
4420 /* Reinitialize team for reuse.
4421    The hot team code calls this routine at every fork barrier, so the EPCC
4422    barrier tests are extremely sensitive to changes in it, esp. writes to the
4423    team struct, which cause a cache invalidation in all threads.
4424    IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4425 static void __kmp_reinitialize_team(kmp_team_t *team,
4426                                     kmp_internal_control_t *new_icvs,
4427                                     ident_t *loc) {
4428   KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4429                 team->t.t_threads[0], team));
4430   KMP_DEBUG_ASSERT(team && new_icvs);
4431   KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
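  // KMP_CHECK_UPDATE only performs the store when the value actually changes,
  // so reusing a hot team does not dirty shared cache lines needlessly (see
  // the EPCC warning above).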
4432   KMP_CHECK_UPDATE(team->t.t_ident, loc);
4433 
4434   KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4435   // Copy ICVs to the master thread's implicit taskdata
4436   __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4437   copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4438 
4439   KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4440                 team->t.t_threads[0], team));
4441 }
4442 
4443 /* Initialize the team data structure.
4444    This assumes the t_threads and t_max_nproc are already set.
4445    Also, we don't touch the arguments */
4446 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4447                                   kmp_internal_control_t *new_icvs,
4448                                   ident_t *loc) {
4449   KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4450 
4451   /* verify */
4452   KMP_DEBUG_ASSERT(team);
4453   KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4454   KMP_DEBUG_ASSERT(team->t.t_threads);
4455   KMP_MB();
4456 
4457   team->t.t_master_tid = 0; /* not needed */
4458   /* team->t.t_master_bar;        not needed */
4459   team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4460   team->t.t_nproc = new_nproc;
4461 
4462   /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
4463   team->t.t_next_pool = NULL;
4464   /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4465    * up hot team */
4466 
4467   TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4468   team->t.t_invoke = NULL; /* not needed */
4469 
4470   // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
4471   team->t.t_sched.sched = new_icvs->sched.sched;
4472 
4473 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4474   team->t.t_fp_control_saved = FALSE; /* not needed */
4475   team->t.t_x87_fpu_control_word = 0; /* not needed */
4476   team->t.t_mxcsr = 0; /* not needed */
4477 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4478 
4479   team->t.t_construct = 0;
4480 
4481   team->t.t_ordered.dt.t_value = 0;
4482   team->t.t_master_active = FALSE;
4483 
4484   memset(&team->t.t_taskq, '\0', sizeof(kmp_taskq_t));
4485 
4486 #ifdef KMP_DEBUG
4487   team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4488 #endif
4489   team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4490 
4491   team->t.t_control_stack_top = NULL;
4492 
4493   __kmp_reinitialize_team(team, new_icvs, loc);
4494 
4495   KMP_MB();
4496   KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4497 }
4498 
4499 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4500 /* Saves old affinity mask in *old_mask (if non-NULL), then sets full mask. */
4501 static void
4502 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4503   if (KMP_AFFINITY_CAPABLE()) {
4504     int status;
4505     if (old_mask != NULL) {
4506       status = __kmp_get_system_affinity(old_mask, TRUE);
4507       int error = errno;
4508       if (status != 0) {
4509         __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4510                     __kmp_msg_null);
4511       }
4512     }
4513     __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4514   }
4515 }
4516 #endif
4517 
4518 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4519 
4520 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4521 // It calculates the worker + master thread's partition based upon the parent
4522 // thread's partition, and binds each worker to a place in its partition.
4523 // The master thread's partition should already include its current binding.
4524 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4525   // Copy the master thread's place partition to the team struct
4526   kmp_info_t *master_th = team->t.t_threads[0];
4527   KMP_DEBUG_ASSERT(master_th != NULL);
4528   kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4529   int first_place = master_th->th.th_first_place;
4530   int last_place = master_th->th.th_last_place;
4531   int masters_place = master_th->th.th_current_place;
4532   team->t.t_first_place = first_place;
4533   team->t.t_last_place = last_place;
4534 
4535   KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4536                 "bound to place %d partition = [%d,%d]\n",
4537                 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4538                 team->t.t_id, masters_place, first_place, last_place));
4539 
4540   switch (proc_bind) {
4541 
4542   case proc_bind_default:
4543     // serial teams might have the proc_bind policy set to proc_bind_default.
4544     // It doesn't matter, as we don't rebind the master thread for any policy.
4545     KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4546     break;
4547 
4548   case proc_bind_master: {
4549     int f;
4550     int n_th = team->t.t_nproc;
4551     for (f = 1; f < n_th; f++) {
4552       kmp_info_t *th = team->t.t_threads[f];
4553       KMP_DEBUG_ASSERT(th != NULL);
4554       th->th.th_first_place = first_place;
4555       th->th.th_last_place = last_place;
4556       th->th.th_new_place = masters_place;
4557 
4558       KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
4559                      "partition = [%d,%d]\n",
4560                      __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4561                      f, masters_place, first_place, last_place));
4562     }
4563   } break;
4564 
4565   case proc_bind_close: {
4566     int f;
4567     int n_th = team->t.t_nproc;
4568     int n_places;
4569     if (first_place <= last_place) {
4570       n_places = last_place - first_place + 1;
4571     } else {
4572       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4573     }
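    // Note that the master's place partition may wrap around the end of the
    // place list (first_place > last_place), in which case the count covers
    // the tail of the list plus the wrapped-around head.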
4574     if (n_th <= n_places) {
4575       int place = masters_place;
4576       for (f = 1; f < n_th; f++) {
4577         kmp_info_t *th = team->t.t_threads[f];
4578         KMP_DEBUG_ASSERT(th != NULL);
4579 
4580         if (place == last_place) {
4581           place = first_place;
4582         } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4583           place = 0;
4584         } else {
4585           place++;
4586         }
4587         th->th.th_first_place = first_place;
4588         th->th.th_last_place = last_place;
4589         th->th.th_new_place = place;
4590 
4591         KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4592                        "partition = [%d,%d]\n",
4593                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4594                        team->t.t_id, f, place, first_place, last_place));
4595       }
4596     } else {
4597       int S, rem, gap, s_count;
4598       S = n_th / n_places;
4599       s_count = 0;
4600       rem = n_th - (S * n_places);
4601       gap = rem > 0 ? n_places / rem : n_places;
4602       int place = masters_place;
4603       int gap_ct = gap;
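      // More threads than places: each place gets S = n_th / n_places threads,
      // and the rem leftover threads are handed out one extra apiece, every
      // `gap` places, so the surplus is spread evenly across the partition.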
4604       for (f = 0; f < n_th; f++) {
4605         kmp_info_t *th = team->t.t_threads[f];
4606         KMP_DEBUG_ASSERT(th != NULL);
4607 
4608         th->th.th_first_place = first_place;
4609         th->th.th_last_place = last_place;
4610         th->th.th_new_place = place;
4611         s_count++;
4612 
4613         if ((s_count == S) && rem && (gap_ct == gap)) {
4614           // do nothing, add an extra thread to place on next iteration
4615         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4616           // we added an extra thread to this place; move to next place
4617           if (place == last_place) {
4618             place = first_place;
4619           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4620             place = 0;
4621           } else {
4622             place++;
4623           }
4624           s_count = 0;
4625           gap_ct = 1;
4626           rem--;
4627         } else if (s_count == S) { // place full; don't add extra
4628           if (place == last_place) {
4629             place = first_place;
4630           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4631             place = 0;
4632           } else {
4633             place++;
4634           }
4635           gap_ct++;
4636           s_count = 0;
4637         }
4638 
4639         KA_TRACE(100,
4640                  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4641                   "partition = [%d,%d]\n",
4642                   __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4643                   th->th.th_new_place, first_place, last_place));
4644       }
4645       KMP_DEBUG_ASSERT(place == masters_place);
4646     }
4647   } break;
4648 
4649   case proc_bind_spread: {
4650     int f;
4651     int n_th = team->t.t_nproc;
4652     int n_places;
4653     int thidx;
4654     if (first_place <= last_place) {
4655       n_places = last_place - first_place + 1;
4656     } else {
4657       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4658     }
4659     if (n_th <= n_places) {
4660       int place = -1;
4661 
4662       if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4663         int S = n_places / n_th;
4664         int s_count, rem, gap, gap_ct;
4665 
4666         place = masters_place;
4667         rem = n_places - n_th * S;
4668         gap = rem ? n_th / rem : 1;
4669         gap_ct = gap;
4670         thidx = n_th;
4671         if (update_master_only == 1)
4672           thidx = 1;
4673         for (f = 0; f < thidx; f++) {
4674           kmp_info_t *th = team->t.t_threads[f];
4675           KMP_DEBUG_ASSERT(th != NULL);
4676 
4677           th->th.th_first_place = place;
4678           th->th.th_new_place = place;
4679           s_count = 1;
4680           while (s_count < S) {
4681             if (place == last_place) {
4682               place = first_place;
4683             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4684               place = 0;
4685             } else {
4686               place++;
4687             }
4688             s_count++;
4689           }
4690           if (rem && (gap_ct == gap)) {
4691             if (place == last_place) {
4692               place = first_place;
4693             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4694               place = 0;
4695             } else {
4696               place++;
4697             }
4698             rem--;
4699             gap_ct = 0;
4700           }
4701           th->th.th_last_place = place;
4702           gap_ct++;
4703 
4704           if (place == last_place) {
4705             place = first_place;
4706           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4707             place = 0;
4708           } else {
4709             place++;
4710           }
4711 
4712           KA_TRACE(100,
4713                    ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4714                     "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4715                     __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4716                     f, th->th.th_new_place, th->th.th_first_place,
4717                     th->th.th_last_place, __kmp_affinity_num_masks));
4718         }
4719       } else {
4720         /* With a uniform space of available computation places we can create
4721            T partitions of round(P/T) places each and put each thread into the
4722            first place of its partition. */
4723         double current = static_cast<double>(masters_place);
4724         double spacing =
4725             (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
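        // For example, assuming masters_place == 0, n_places == 8 and
        // n_th == 3: spacing == 3.0, giving partitions [0,2], [3,5] and [6,7]
        // (the last one gets clipped to n_places - 1 below).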
4726         int first, last;
4727         kmp_info_t *th;
4728 
4729         thidx = n_th + 1;
4730         if (update_master_only == 1)
4731           thidx = 1;
4732         for (f = 0; f < thidx; f++) {
4733           first = static_cast<int>(current);
4734           last = static_cast<int>(current + spacing) - 1;
4735           KMP_DEBUG_ASSERT(last >= first);
4736           if (first >= n_places) {
4737             if (masters_place) {
4738               first -= n_places;
4739               last -= n_places;
4740               if (first == (masters_place + 1)) {
4741                 KMP_DEBUG_ASSERT(f == n_th);
4742                 first--;
4743               }
4744               if (last == masters_place) {
4745                 KMP_DEBUG_ASSERT(f == (n_th - 1));
4746                 last--;
4747               }
4748             } else {
4749               KMP_DEBUG_ASSERT(f == n_th);
4750               first = 0;
4751               last = 0;
4752             }
4753           }
4754           if (last >= n_places) {
4755             last = (n_places - 1);
4756           }
4757           place = first;
4758           current += spacing;
4759           if (f < n_th) {
4760             KMP_DEBUG_ASSERT(0 <= first);
4761             KMP_DEBUG_ASSERT(n_places > first);
4762             KMP_DEBUG_ASSERT(0 <= last);
4763             KMP_DEBUG_ASSERT(n_places > last);
4764             KMP_DEBUG_ASSERT(last_place >= first_place);
4765             th = team->t.t_threads[f];
4766             KMP_DEBUG_ASSERT(th);
4767             th->th.th_first_place = first;
4768             th->th.th_new_place = place;
4769             th->th.th_last_place = last;
4770 
4771             KA_TRACE(100,
4772                      ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4773                       "partition = [%d,%d], spacing = %.4f\n",
4774                       __kmp_gtid_from_thread(team->t.t_threads[f]),
4775                       team->t.t_id, f, th->th.th_new_place,
4776                       th->th.th_first_place, th->th.th_last_place, spacing));
4777           }
4778         }
4779       }
4780       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4781     } else {
4782       int S, rem, gap, s_count;
4783       S = n_th / n_places;
4784       s_count = 0;
4785       rem = n_th - (S * n_places);
4786       gap = rem > 0 ? n_places / rem : n_places;
4787       int place = masters_place;
4788       int gap_ct = gap;
4789       thidx = n_th;
4790       if (update_master_only == 1)
4791         thidx = 1;
4792       for (f = 0; f < thidx; f++) {
4793         kmp_info_t *th = team->t.t_threads[f];
4794         KMP_DEBUG_ASSERT(th != NULL);
4795 
4796         th->th.th_first_place = place;
4797         th->th.th_last_place = place;
4798         th->th.th_new_place = place;
4799         s_count++;
4800 
4801         if ((s_count == S) && rem && (gap_ct == gap)) {
4802           // do nothing, add an extra thread to place on next iteration
4803         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4804           // we added an extra thread to this place; move on to next place
4805           if (place == last_place) {
4806             place = first_place;
4807           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4808             place = 0;
4809           } else {
4810             place++;
4811           }
4812           s_count = 0;
4813           gap_ct = 1;
4814           rem--;
4815         } else if (s_count == S) { // place is full; don't add extra thread
4816           if (place == last_place) {
4817             place = first_place;
4818           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4819             place = 0;
4820           } else {
4821             place++;
4822           }
4823           gap_ct++;
4824           s_count = 0;
4825         }
4826 
4827         KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4828                        "partition = [%d,%d]\n",
4829                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4830                        team->t.t_id, f, th->th.th_new_place,
4831                        th->th.th_first_place, th->th.th_last_place));
4832       }
4833       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4834     }
4835   } break;
4836 
4837   default:
4838     break;
4839   }
4840 
4841   KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4842 }
4843 
4844 #endif /* OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED */
4845 
4846 /* allocate a new team data structure to use.  take one off of the free pool if
4847    available */
4848 kmp_team_t *
4849 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4850 #if OMPT_SUPPORT
4851                     ompt_data_t ompt_parallel_data,
4852 #endif
4853 #if OMP_40_ENABLED
4854                     kmp_proc_bind_t new_proc_bind,
4855 #endif
4856                     kmp_internal_control_t *new_icvs,
4857                     int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4858   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4859   int f;
4860   kmp_team_t *team;
4861   int use_hot_team = !root->r.r_active;
4862   int level = 0;
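  // A "hot" team is the persistent team kept for a root's outermost parallel
  // region; by default it is only reused when the root is not already inside
  // an active parallel region (r_active == 0). The nested-hot-teams code below
  // may override this decision.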
4863 
4864   KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4865   KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4866   KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4867   KMP_MB();
4868 
4869 #if KMP_NESTED_HOT_TEAMS
4870   kmp_hot_team_ptr_t *hot_teams;
4871   if (master) {
4872     team = master->th.th_team;
4873     level = team->t.t_active_level;
4874     if (master->th.th_teams_microtask) { // in teams construct?
4875       if (master->th.th_teams_size.nteams > 1 &&
4876           ( // #teams > 1
4877               team->t.t_pkfn ==
4878                   (microtask_t)__kmp_teams_master || // inner fork of the teams
4879               master->th.th_teams_level <
4880                   team->t.t_level)) { // or nested parallel inside the teams
4881         ++level; // don't increment if #teams==1 or for the outer fork of
4882         // the teams; increment otherwise
4883       }
4884     }
4885     hot_teams = master->th.th_hot_teams;
4886     if (level < __kmp_hot_teams_max_level && hot_teams &&
4887         hot_teams[level]
4888             .hot_team) { // hot team has already been allocated for given level
4889       use_hot_team = 1;
4890     } else {
4891       use_hot_team = 0;
4892     }
4893   }
4894 #endif
4895   // Optimization to use a "hot" team
4896   if (use_hot_team && new_nproc > 1) {
4897     KMP_DEBUG_ASSERT(new_nproc == max_nproc);
4898 #if KMP_NESTED_HOT_TEAMS
4899     team = hot_teams[level].hot_team;
4900 #else
4901     team = root->r.r_hot_team;
4902 #endif
4903 #if KMP_DEBUG
4904     if (__kmp_tasking_mode != tskm_immediate_exec) {
4905       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
4906                     "task_team[1] = %p before reinit\n",
4907                     team->t.t_task_team[0], team->t.t_task_team[1]));
4908     }
4909 #endif
4910 
4911     // Has the number of threads changed?
4912     /* Let's assume the most common case is that the number of threads is
4913        unchanged, and put that case first. */
4914     if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
4915       KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
4916       // This case can mean that omp_set_num_threads() was called and the hot
4917       // team size was already reduced, so we check the special flag
4918       if (team->t.t_size_changed == -1) {
4919         team->t.t_size_changed = 1;
4920       } else {
4921         KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
4922       }
4923 
4924       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4925       kmp_r_sched_t new_sched = new_icvs->sched;
4926       // set master's schedule as new run-time schedule
4927       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
4928 
4929       __kmp_reinitialize_team(team, new_icvs,
4930                               root->r.r_uber_thread->th.th_ident);
4931 
4932       KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
4933                     team->t.t_threads[0], team));
4934       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
4935 
4936 #if OMP_40_ENABLED
4937 #if KMP_AFFINITY_SUPPORTED
4938       if ((team->t.t_size_changed == 0) &&
4939           (team->t.t_proc_bind == new_proc_bind)) {
4940         if (new_proc_bind == proc_bind_spread) {
4941           __kmp_partition_places(
4942               team, 1); // add flag to update only master for spread
4943         }
4944         KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
4945                        "proc_bind = %d, partition = [%d,%d]\n",
4946                        team->t.t_id, new_proc_bind, team->t.t_first_place,
4947                        team->t.t_last_place));
4948       } else {
4949         KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4950         __kmp_partition_places(team);
4951       }
4952 #else
4953       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4954 #endif /* KMP_AFFINITY_SUPPORTED */
4955 #endif /* OMP_40_ENABLED */
4956     } else if (team->t.t_nproc > new_nproc) {
4957       KA_TRACE(20,
4958                ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
4959                 new_nproc));
4960 
4961       team->t.t_size_changed = 1;
4962 #if KMP_NESTED_HOT_TEAMS
4963       if (__kmp_hot_teams_mode == 0) {
4964         // AC: saved number of threads should match the team's value in this
4965         // mode; it can be bigger in mode 1, when the hot team has reserve threads
4966         KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
4967         hot_teams[level].hot_team_nth = new_nproc;
4968 #endif // KMP_NESTED_HOT_TEAMS
4969         /* release the extra threads we don't need any more */
4970         for (f = new_nproc; f < team->t.t_nproc; f++) {
4971           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
4972           if (__kmp_tasking_mode != tskm_immediate_exec) {
4973             // When decreasing team size, threads no longer in the team should
4974             // unref task team.
4975             team->t.t_threads[f]->th.th_task_team = NULL;
4976           }
4977           __kmp_free_thread(team->t.t_threads[f]);
4978           team->t.t_threads[f] = NULL;
4979         }
4980 #if KMP_NESTED_HOT_TEAMS
4981       } // (__kmp_hot_teams_mode == 0)
4982       else {
4983         // When keeping extra threads in team, switch threads to wait on own
4984         // b_go flag
4985         for (f = new_nproc; f < team->t.t_nproc; ++f) {
4986           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
4987           kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
4988           for (int b = 0; b < bs_last_barrier; ++b) {
4989             if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
4990               balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
4991             }
4992             KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
4993           }
4994         }
4995       }
4996 #endif // KMP_NESTED_HOT_TEAMS
4997       team->t.t_nproc = new_nproc;
4998       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4999       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5000       __kmp_reinitialize_team(team, new_icvs,
5001                               root->r.r_uber_thread->th.th_ident);
5002 
5003       /* update the remaining threads */
5004       for (f = 0; f < new_nproc; ++f) {
5005         team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5006       }
5007       // restore the current task state of the master thread: should be the
5008       // implicit task
5009       KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5010                     team->t.t_threads[0], team));
5011 
5012       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5013 
5014 #ifdef KMP_DEBUG
5015       for (f = 0; f < team->t.t_nproc; f++) {
5016         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5017                          team->t.t_threads[f]->th.th_team_nproc ==
5018                              team->t.t_nproc);
5019       }
5020 #endif
5021 
5022 #if OMP_40_ENABLED
5023       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5024 #if KMP_AFFINITY_SUPPORTED
5025       __kmp_partition_places(team);
5026 #endif
5027 #endif
5028     } else { // team->t.t_nproc < new_nproc
5029 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5030       kmp_affin_mask_t *old_mask;
5031       if (KMP_AFFINITY_CAPABLE()) {
5032         KMP_CPU_ALLOC(old_mask);
5033       }
5034 #endif
5035 
5036       KA_TRACE(20,
5037                ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5038                 new_nproc));
5039 
5040       team->t.t_size_changed = 1;
5041 
5042 #if KMP_NESTED_HOT_TEAMS
5043       int avail_threads = hot_teams[level].hot_team_nth;
5044       if (new_nproc < avail_threads)
5045         avail_threads = new_nproc;
5046       kmp_info_t **other_threads = team->t.t_threads;
5047       for (f = team->t.t_nproc; f < avail_threads; ++f) {
5048         // Adjust barrier data of reserved threads (if any) of the team
5049         // Other data will be set in __kmp_initialize_info() below.
5050         int b;
5051         kmp_balign_t *balign = other_threads[f]->th.th_bar;
5052         for (b = 0; b < bs_last_barrier; ++b) {
5053           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5054           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5055 #if USE_DEBUGGER
5056           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5057 #endif
5058         }
5059       }
5060       if (hot_teams[level].hot_team_nth >= new_nproc) {
5061         // we have all needed threads in reserve, no need to allocate any
5062         // this is only possible in mode 1 (mode 0 cannot have reserved threads)
5063         KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5064         team->t.t_nproc = new_nproc; // just get reserved threads involved
5065       } else {
5066         // we may have some threads in reserve, but not enough
5067         team->t.t_nproc =
5068             hot_teams[level]
5069                 .hot_team_nth; // get reserved threads involved if any
5070         hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5071 #endif // KMP_NESTED_HOT_TEAMS
5072         if (team->t.t_max_nproc < new_nproc) {
5073           /* reallocate larger arrays */
5074           __kmp_reallocate_team_arrays(team, new_nproc);
5075           __kmp_reinitialize_team(team, new_icvs, NULL);
5076         }
5077 
5078 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5079         /* Temporarily set full mask for the master thread before creation of
5080            workers. The reason is that workers inherit the affinity from the
5081            master, so if a lot of workers are created on a single core quickly,
5082            they don't get a chance to set their own affinity for a long time. */
5083         __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5084 #endif
5085 
5086         /* allocate new threads for the hot team */
5087         for (f = team->t.t_nproc; f < new_nproc; f++) {
5088           kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5089           KMP_DEBUG_ASSERT(new_worker);
5090           team->t.t_threads[f] = new_worker;
5091 
5092           KA_TRACE(20,
5093                    ("__kmp_allocate_team: team %d init T#%d(%d:%d) arrived: "
5094                     "join=%llu, plain=%llu\n",
5095                     team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5096                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5097                     team->t.t_bar[bs_plain_barrier].b_arrived));
5098 
5099           { // Initialize barrier data for new threads.
5100             int b;
5101             kmp_balign_t *balign = new_worker->th.th_bar;
5102             for (b = 0; b < bs_last_barrier; ++b) {
5103               balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5104               KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5105                                KMP_BARRIER_PARENT_FLAG);
5106 #if USE_DEBUGGER
5107               balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5108 #endif
5109             }
5110           }
5111         }
5112 
5113 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5114         if (KMP_AFFINITY_CAPABLE()) {
5115           /* Restore initial master thread's affinity mask */
5116           __kmp_set_system_affinity(old_mask, TRUE);
5117           KMP_CPU_FREE(old_mask);
5118         }
5119 #endif
5120 #if KMP_NESTED_HOT_TEAMS
5121       } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5122 #endif // KMP_NESTED_HOT_TEAMS
5123       /* make sure everyone is synchronized */
5124       int old_nproc = team->t.t_nproc; // save old value and use to update only
5125       // new threads below
5126       __kmp_initialize_team(team, new_nproc, new_icvs,
5127                             root->r.r_uber_thread->th.th_ident);
5128 
5129       /* reinitialize the threads */
5130       KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5131       for (f = 0; f < team->t.t_nproc; ++f)
5132         __kmp_initialize_info(team->t.t_threads[f], team, f,
5133                               __kmp_gtid_from_tid(f, team));
5134       if (level) { // set th_task_state for new threads in nested hot team
5135         // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5136         // only need to set the th_task_state for the new threads. th_task_state
5137         // for master thread will not be accurate until after this in
5138         // __kmp_fork_call(), so we look to the master's memo_stack to get the
5139         // correct value.
5140         for (f = old_nproc; f < team->t.t_nproc; ++f)
5141           team->t.t_threads[f]->th.th_task_state =
5142               team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5143       } else { // set th_task_state for new threads in non-nested hot team
5144         int old_state =
5145             team->t.t_threads[0]->th.th_task_state; // copy master's state
5146         for (f = old_nproc; f < team->t.t_nproc; ++f)
5147           team->t.t_threads[f]->th.th_task_state = old_state;
5148       }
5149 
5150 #ifdef KMP_DEBUG
5151       for (f = 0; f < team->t.t_nproc; ++f) {
5152         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5153                          team->t.t_threads[f]->th.th_team_nproc ==
5154                              team->t.t_nproc);
5155       }
5156 #endif
5157 
5158 #if OMP_40_ENABLED
5159       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5160 #if KMP_AFFINITY_SUPPORTED
5161       __kmp_partition_places(team);
5162 #endif
5163 #endif
5164     } // Check changes in number of threads
5165 
5166 #if OMP_40_ENABLED
5167     kmp_info_t *master = team->t.t_threads[0];
5168     if (master->th.th_teams_microtask) {
5169       for (f = 1; f < new_nproc; ++f) {
5170         // propagate teams construct specific info to workers
5171         kmp_info_t *thr = team->t.t_threads[f];
5172         thr->th.th_teams_microtask = master->th.th_teams_microtask;
5173         thr->th.th_teams_level = master->th.th_teams_level;
5174         thr->th.th_teams_size = master->th.th_teams_size;
5175       }
5176     }
5177 #endif /* OMP_40_ENABLED */
5178 #if KMP_NESTED_HOT_TEAMS
5179     if (level) {
5180       // Sync barrier state for nested hot teams, not needed for outermost hot
5181       // team.
5182       for (f = 1; f < new_nproc; ++f) {
5183         kmp_info_t *thr = team->t.t_threads[f];
5184         int b;
5185         kmp_balign_t *balign = thr->th.th_bar;
5186         for (b = 0; b < bs_last_barrier; ++b) {
5187           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5188           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5189 #if USE_DEBUGGER
5190           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5191 #endif
5192         }
5193       }
5194     }
5195 #endif // KMP_NESTED_HOT_TEAMS
5196 
5197     /* reallocate space for arguments if necessary */
5198     __kmp_alloc_argv_entries(argc, team, TRUE);
5199     KMP_CHECK_UPDATE(team->t.t_argc, argc);
5200     // The hot team re-uses the previous task team,
5201     // if untouched during the previous release->gather phase.
5202 
5203     KF_TRACE(10, (" hot_team = %p\n", team));
5204 
5205 #if KMP_DEBUG
5206     if (__kmp_tasking_mode != tskm_immediate_exec) {
5207       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5208                     "task_team[1] = %p after reinit\n",
5209                     team->t.t_task_team[0], team->t.t_task_team[1]));
5210     }
5211 #endif
5212 
5213 #if OMPT_SUPPORT
5214     __ompt_team_assign_id(team, ompt_parallel_data);
5215 #endif
5216 
5217     KMP_MB();
5218 
5219     return team;
5220   }
5221 
5222   /* next, let's try to take one from the team pool */
5223   KMP_MB();
5224   for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5225     /* TODO: consider resizing undersized teams instead of reaping them, now
5226        that we have a resizing mechanism */
5227     if (team->t.t_max_nproc >= max_nproc) {
5228       /* take this team from the team pool */
5229       __kmp_team_pool = team->t.t_next_pool;
5230 
5231       /* setup the team for fresh use */
5232       __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5233 
5234       KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5235                     "task_team[1] %p to NULL\n",
5236                     &team->t.t_task_team[0], &team->t.t_task_team[1]));
5237       team->t.t_task_team[0] = NULL;
5238       team->t.t_task_team[1] = NULL;
5239 
5240       /* reallocate space for arguments if necessary */
5241       __kmp_alloc_argv_entries(argc, team, TRUE);
5242       KMP_CHECK_UPDATE(team->t.t_argc, argc);
5243 
5244       KA_TRACE(
5245           20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5246                team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5247       { // Initialize barrier data.
5248         int b;
5249         for (b = 0; b < bs_last_barrier; ++b) {
5250           team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5251 #if USE_DEBUGGER
5252           team->t.t_bar[b].b_master_arrived = 0;
5253           team->t.t_bar[b].b_team_arrived = 0;
5254 #endif
5255         }
5256       }
5257 
5258 #if OMP_40_ENABLED
5259       team->t.t_proc_bind = new_proc_bind;
5260 #endif
5261 
5262       KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5263                     team->t.t_id));
5264 
5265 #if OMPT_SUPPORT
5266       __ompt_team_assign_id(team, ompt_parallel_data);
5267 #endif
5268 
5269       KMP_MB();
5270 
5271       return team;
5272     }
5273 
5274     /* reap team if it is too small, then loop back and check the next one */
5275     // not sure if this is wise, but it will be redone during the hot-teams
5276     // rewrite.
5277     /* TODO: Use technique to find the right size hot-team, don't reap them */
5278     team = __kmp_reap_team(team);
5279     __kmp_team_pool = team;
5280   }
5281 
5282   /* nothing available in the pool, no matter, make a new team! */
5283   KMP_MB();
5284   team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5285 
5286   /* and set it up */
5287   team->t.t_max_nproc = max_nproc;
5288   /* NOTE well, for some reason allocating one big buffer and dividing it up
5289      seems to really hurt performance a lot on the P4, so, let's not use this */
5290   __kmp_allocate_team_arrays(team, max_nproc);
5291 
5292   KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5293   __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5294 
5295   KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5296                 "%p to NULL\n",
5297                 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5298   team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5299   // memory, no need to duplicate
5300   team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5301   // memory, no need to duplicate
5302 
5303   if (__kmp_storage_map) {
5304     __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5305   }
5306 
5307   /* allocate space for arguments */
5308   __kmp_alloc_argv_entries(argc, team, FALSE);
5309   team->t.t_argc = argc;
5310 
5311   KA_TRACE(20,
5312            ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5313             team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5314   { // Initialize barrier data.
5315     int b;
5316     for (b = 0; b < bs_last_barrier; ++b) {
5317       team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5318 #if USE_DEBUGGER
5319       team->t.t_bar[b].b_master_arrived = 0;
5320       team->t.t_bar[b].b_team_arrived = 0;
5321 #endif
5322     }
5323   }
5324 
5325 #if OMP_40_ENABLED
5326   team->t.t_proc_bind = new_proc_bind;
5327 #endif
5328 
5329 #if OMPT_SUPPORT
5330   __ompt_team_assign_id(team, ompt_parallel_data);
5331   team->t.ompt_serialized_team_info = NULL;
5332 #endif
5333 
5334   KMP_MB();
5335 
5336   KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5337                 team->t.t_id));
5338 
5339   return team;
5340 }
5341 
5342 /* TODO implement hot-teams at all levels */
5343 /* TODO implement lazy thread release on demand (disband request) */
5344 
5345 /* free the team.  return it to the team pool.  release all the threads
5346  * associated with it */
5347 void __kmp_free_team(kmp_root_t *root,
5348                      kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5349   int f;
5350   KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5351                 team->t.t_id));
5352 
5353   /* verify state */
5354   KMP_DEBUG_ASSERT(root);
5355   KMP_DEBUG_ASSERT(team);
5356   KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5357   KMP_DEBUG_ASSERT(team->t.t_threads);
5358 
5359   int use_hot_team = team == root->r.r_hot_team;
5360 #if KMP_NESTED_HOT_TEAMS
5361   int level;
5362   kmp_hot_team_ptr_t *hot_teams;
5363   if (master) {
5364     level = team->t.t_active_level - 1;
5365     if (master->th.th_teams_microtask) { // in teams construct?
5366       if (master->th.th_teams_size.nteams > 1) {
5367         ++level; // level was not increased in teams construct for
5368         // team_of_masters
5369       }
5370       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5371           master->th.th_teams_level == team->t.t_level) {
5372         ++level; // level was not increased in teams construct for
5373         // team_of_workers before the parallel
5374       } // team->t.t_level will be increased inside parallel
5375     }
5376     hot_teams = master->th.th_hot_teams;
5377     if (level < __kmp_hot_teams_max_level) {
5378       KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5379       use_hot_team = 1;
5380     }
5381   }
5382 #endif // KMP_NESTED_HOT_TEAMS
5383 
5384   /* team is done working */
5385   TCW_SYNC_PTR(team->t.t_pkfn,
5386                NULL); // Important for Debugging Support Library.
5387   team->t.t_copyin_counter = 0; // init counter for possible reuse
5388   // Do not reset pointer to parent team to NULL for hot teams.
5389 
5390   /* if we are a non-hot team, release our threads */
5391   if (!use_hot_team) {
5392     if (__kmp_tasking_mode != tskm_immediate_exec) {
5393       // Wait for threads to reach reapable state
5394       for (f = 1; f < team->t.t_nproc; ++f) {
5395         KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5396         kmp_info_t *th = team->t.t_threads[f];
5397         volatile kmp_uint32 *state = &th->th.th_reap_state;
5398         while (*state != KMP_SAFE_TO_REAP) {
5399 #if KMP_OS_WINDOWS
5400           // On Windows a thread can be killed at any time, check this
5401           DWORD ecode;
5402           if (!__kmp_is_thread_alive(th, &ecode)) {
5403             *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5404             break;
5405           }
5406 #endif
5407           // first check if thread is sleeping
5408           kmp_flag_64 fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5409           if (fl.is_sleeping())
5410             fl.resume(__kmp_gtid_from_thread(th));
5411           KMP_CPU_PAUSE();
5412         }
5413       }
5414 
5415       // Delete task teams
5416       int tt_idx;
5417       for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5418         kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5419         if (task_team != NULL) {
5420           for (f = 0; f < team->t.t_nproc;
5421                ++f) { // Have all threads unref task teams
5422             team->t.t_threads[f]->th.th_task_team = NULL;
5423           }
5424           KA_TRACE(
5425               20,
5426               ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5427                __kmp_get_gtid(), task_team, team->t.t_id));
5428 #if KMP_NESTED_HOT_TEAMS
5429           __kmp_free_task_team(master, task_team);
5430 #endif
5431           team->t.t_task_team[tt_idx] = NULL;
5432         }
5433       }
5434     }
5435 
5436     // Reset pointer to parent team only for non-hot teams.
5437     team->t.t_parent = NULL;
5438     team->t.t_level = 0;
5439     team->t.t_active_level = 0;
5440 
5441     /* free the worker threads */
5442     for (f = 1; f < team->t.t_nproc; ++f) {
5443       KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5444       __kmp_free_thread(team->t.t_threads[f]);
5445       team->t.t_threads[f] = NULL;
5446     }
5447 
5448     /* put the team back in the team pool */
5449     /* TODO limit size of team pool, call reap_team if pool too large */
5450     team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5451     __kmp_team_pool = (volatile kmp_team_t *)team;
5452   }
5453 
5454   KMP_MB();
5455 }
5456 
5457 /* reap the team.  destroy it, reclaim all its resources and free its memory */
5458 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5459   kmp_team_t *next_pool = team->t.t_next_pool;
5460 
5461   KMP_DEBUG_ASSERT(team);
5462   KMP_DEBUG_ASSERT(team->t.t_dispatch);
5463   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5464   KMP_DEBUG_ASSERT(team->t.t_threads);
5465   KMP_DEBUG_ASSERT(team->t.t_argv);
5466 
5467   /* TODO clean the threads that are a part of this? */
5468 
5469   /* free stuff */
5470   __kmp_free_team_arrays(team);
5471   if (team->t.t_argv != &team->t.t_inline_argv[0])
5472     __kmp_free((void *)team->t.t_argv);
5473   __kmp_free(team);
5474 
5475   KMP_MB();
5476   return next_pool;
5477 }
5478 
5479 // Free the thread.  Don't reap it, just place it on the pool of available
5480 // threads.
5481 //
5482 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5483 // binding for the affinity mechanism to be useful.
5484 //
5485 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5486 // However, we want to avoid a potential performance problem by always
5487 // scanning through the list to find the correct point at which to insert
5488 // the thread (potential N**2 behavior).  To do this we keep track of the
5489 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5490 // With single-level parallelism, threads will always be added to the tail
5491 // of the list, kept track of by __kmp_thread_pool_insert_pt.  With nested
5492 // parallelism, all bets are off and we may need to scan through the entire
5493 // free list.
5494 //
5495 // This change also has a potentially large performance benefit, for some
5496 // applications.  Previously, as threads were freed from the hot team, they
5497 // would be placed back on the free list in inverse order.  If the hot team
5498 // grew back to its original size, then the freed threads would be placed
5499 // back on the hot team in reverse order.  This could cause bad cache
5500 // locality problems on programs where the size of the hot team regularly
5501 // grew and shrunk.
5502 //
5503 // Now, for single-level parallelism, the OMP tid is always == gtid.
5504 void __kmp_free_thread(kmp_info_t *this_th) {
5505   int gtid;
5506   kmp_info_t **scan;
5507   kmp_root_t *root = this_th->th.th_root;
5508 
5509   KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5510                 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5511 
5512   KMP_DEBUG_ASSERT(this_th);
5513 
5514   // When moving thread to pool, switch thread to wait on own b_go flag, and
5515   // uninitialized (NULL team).
5516   int b;
5517   kmp_balign_t *balign = this_th->th.th_bar;
5518   for (b = 0; b < bs_last_barrier; ++b) {
5519     if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5520       balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5521     balign[b].bb.team = NULL;
5522     balign[b].bb.leaf_kids = 0;
5523   }
5524   this_th->th.th_task_state = 0;
5525 
5526   /* put thread back on the free pool */
5527   TCW_PTR(this_th->th.th_team, NULL);
5528   TCW_PTR(this_th->th.th_root, NULL);
5529   TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5530 
5531   // If the __kmp_thread_pool_insert_pt is already past the new insert
5532   // point, then we need to re-scan the entire list.
5533   gtid = this_th->th.th_info.ds.ds_gtid;
5534   if (__kmp_thread_pool_insert_pt != NULL) {
5535     KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5536     if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5537       __kmp_thread_pool_insert_pt = NULL;
5538     }
5539   }
5540 
5541   // Scan down the list to find the place to insert the thread.
5542   // scan is the address of a link in the list, possibly the address of
5543   // __kmp_thread_pool itself.
5544   //
  // In the absence of nested parallelism, the for loop will have 0 iterations.
5546   if (__kmp_thread_pool_insert_pt != NULL) {
5547     scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5548   } else {
5549     scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5550   }
5551   for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5552        scan = &((*scan)->th.th_next_pool))
5553     ;
5554 
5555   // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5556   // to its address.
5557   TCW_PTR(this_th->th.th_next_pool, *scan);
5558   __kmp_thread_pool_insert_pt = *scan = this_th;
5559   KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5560                    (this_th->th.th_info.ds.ds_gtid <
5561                     this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5562   TCW_4(this_th->th.th_in_pool, TRUE);
5563   __kmp_thread_pool_nth++;
5564 
5565   TCW_4(__kmp_nth, __kmp_nth - 1);
5566   root->r.r_cg_nthreads--;
5567 
5568 #ifdef KMP_ADJUST_BLOCKTIME
5569   /* Adjust blocktime back to user setting or default if necessary */
5570   /* Middle initialization might never have occurred                */
5571   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5572     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5573     if (__kmp_nth <= __kmp_avail_proc) {
5574       __kmp_zero_bt = FALSE;
5575     }
5576   }
5577 #endif /* KMP_ADJUST_BLOCKTIME */
5578 
5579   KMP_MB();
5580 }
5581 
5582 /* ------------------------------------------------------------------------ */
5583 
5584 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5585   int gtid = this_thr->th.th_info.ds.ds_gtid;
5586   /*    void                 *stack_data;*/
5587   kmp_team_t *(*volatile pteam);
5588 
5589   KMP_MB();
5590   KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5591 
5592   if (__kmp_env_consistency_check) {
5593     this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5594   }
5595 
5596 #if OMPT_SUPPORT
5597   ompt_data_t *thread_data;
5598   if (ompt_enabled.enabled) {
5599     thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5600     thread_data->ptr = NULL;
5601 
5602     this_thr->th.ompt_thread_info.state = omp_state_overhead;
5603     this_thr->th.ompt_thread_info.wait_id = 0;
5604     this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5605     if (ompt_enabled.ompt_callback_thread_begin) {
5606       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5607           ompt_thread_worker, thread_data);
5608     }
5609   }
5610 #endif
5611 
5612 #if OMPT_SUPPORT
5613   if (ompt_enabled.enabled) {
5614     this_thr->th.ompt_thread_info.state = omp_state_idle;
5615   }
5616 #endif
5617   /* This is the place where threads wait for work */
5618   while (!TCR_4(__kmp_global.g.g_done)) {
5619     KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5620     KMP_MB();
5621 
5622     /* wait for work to do */
5623     KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5624 
5625     /* No tid yet since not part of a team */
5626     __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5627 
5628 #if OMPT_SUPPORT
5629     if (ompt_enabled.enabled) {
5630       this_thr->th.ompt_thread_info.state = omp_state_overhead;
5631     }
5632 #endif
5633 
5634     pteam = (kmp_team_t * (*))(&this_thr->th.th_team);
5635 
5636     /* have we been allocated? */
5637     if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5638       /* we were just woken up, so run our new task */
5639       if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5640         int rc;
5641         KA_TRACE(20,
5642                  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5643                   gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5644                   (*pteam)->t.t_pkfn));
5645 
5646         updateHWFPControl(*pteam);
5647 
5648 #if OMPT_SUPPORT
5649         if (ompt_enabled.enabled) {
5650           this_thr->th.ompt_thread_info.state = omp_state_work_parallel;
5651         }
5652 #endif
5653 
5654         {
5655           KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
5656           KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
5657           rc = (*pteam)->t.t_invoke(gtid);
5658         }
5659         KMP_ASSERT(rc);
5660 
5661         KMP_MB();
5662         KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5663                       gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5664                       (*pteam)->t.t_pkfn));
5665       }
5666 #if OMPT_SUPPORT
5667       if (ompt_enabled.enabled) {
5668         /* no frame set while outside task */
5669         __ompt_get_task_info_object(0)->frame.exit_frame = NULL;
5670 
5671         this_thr->th.ompt_thread_info.state = omp_state_overhead;
5672         this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
5673       }
5674 #endif
5675       /* join barrier after parallel region */
5676       __kmp_join_barrier(gtid);
5677     }
5678   }
5679   TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5680 
5681 #if OMPT_SUPPORT
5682   if (ompt_enabled.ompt_callback_thread_end) {
5683     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5684   }
5685 #endif
5686 
5687   this_thr->th.th_task_team = NULL;
5688   /* run the destructors for the threadprivate data for this thread */
5689   __kmp_common_destroy_gtid(gtid);
5690 
5691   KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5692   KMP_MB();
5693   return this_thr;
5694 }
5695 
5696 /* ------------------------------------------------------------------------ */
5697 
5698 void __kmp_internal_end_dest(void *specific_gtid) {
5699 #if KMP_COMPILER_ICC
5700 #pragma warning(push)
5701 #pragma warning(disable : 810) // conversion from "void *" to "int" may lose
5702 // significant bits
5703 #endif
5704   // Make sure no significant bits are lost
5705   int gtid = (kmp_intptr_t)specific_gtid - 1;
5706 #if KMP_COMPILER_ICC
5707 #pragma warning(pop)
5708 #endif
5709 
5710   KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
  /* NOTE: the gtid is stored as gtid+1 in thread-local storage because 0 is
   * reserved for the nothing-stored case */
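  /* For illustration (hypothetical values): gtid 0 is stored as (void *)1 and
     gtid 5 as (void *)6, while a NULL slot means "nothing stored"; the
     subtraction above recovers the original gtid. */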
5713 
5714   /* josh: One reason for setting the gtid specific data even when it is being
5715      destroyed by pthread is to allow gtid lookup through thread specific data
5716      (__kmp_gtid_get_specific).  Some of the code, especially stat code,
5717      that gets executed in the call to __kmp_internal_end_thread, actually
5718      gets the gtid through the thread specific data.  Setting it here seems
5719      rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread
5720      to run smoothly.
5721      todo: get rid of this after we remove the dependence on
5722      __kmp_gtid_get_specific  */
5723   if (gtid >= 0 && KMP_UBER_GTID(gtid))
5724     __kmp_gtid_set_specific(gtid);
5725 #ifdef KMP_TDATA_GTID
5726   __kmp_gtid = gtid;
5727 #endif
5728   __kmp_internal_end_thread(gtid);
5729 }
5730 
5731 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5732 
// 2009-09-08 (lev): It looks like the destructor does not work. In simple test
// cases destructors work perfectly, but in real libomp.so I have no evidence
// it is ever called. However, the -fini linker option in makefile.mk works
// fine.
5736 
5737 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5738   __kmp_internal_end_atexit();
5739 }
5740 
5741 void __kmp_internal_end_fini(void) { __kmp_internal_end_atexit(); }
5742 
5743 #endif
5744 
5745 /* [Windows] josh: when the atexit handler is called, there may still be more
5746    than one thread alive */
5747 void __kmp_internal_end_atexit(void) {
5748   KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5749   /* [Windows]
5750      josh: ideally, we want to completely shutdown the library in this atexit
5751      handler, but stat code that depends on thread specific data for gtid fails
5752      because that data becomes unavailable at some point during the shutdown, so
5753      we call __kmp_internal_end_thread instead. We should eventually remove the
5754      dependency on __kmp_get_specific_gtid in the stat code and use
5755      __kmp_internal_end_library to cleanly shutdown the library.
5756 
5757      // TODO: Can some of this comment about GVS be removed?
5758      I suspect that the offending stat code is executed when the calling thread
5759      tries to clean up a dead root thread's data structures, resulting in GVS
5760      code trying to close the GVS structures for that thread, but since the stat
5761      code uses __kmp_get_specific_gtid to get the gtid with the assumption that
     the calling thread is cleaning up itself instead of another thread, it
     gets confused. This happens because allowing a thread to unregister and
     clean up another thread is a recent modification for addressing an issue.
5765      Based on the current design (20050722), a thread may end up
5766      trying to unregister another thread only if thread death does not trigger
5767      the calling of __kmp_internal_end_thread.  For Linux* OS, there is the
5768      thread specific data destructor function to detect thread death. For
5769      Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
5770      is nothing.  Thus, the workaround is applicable only for Windows static
5771      stat library. */
5772   __kmp_internal_end_library(-1);
5773 #if KMP_OS_WINDOWS
5774   __kmp_close_console();
5775 #endif
5776 }
5777 
5778 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5779   // It is assumed __kmp_forkjoin_lock is acquired.
5780 
5781   int gtid;
5782 
5783   KMP_DEBUG_ASSERT(thread != NULL);
5784 
5785   gtid = thread->th.th_info.ds.ds_gtid;
5786 
5787   if (!is_root) {
5788 
5789     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5790       /* Assume the threads are at the fork barrier here */
5791       KA_TRACE(
5792           20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5793                gtid));
5794       /* Need release fence here to prevent seg faults for tree forkjoin barrier
5795        * (GEH) */
5796       ANNOTATE_HAPPENS_BEFORE(thread);
5797       kmp_flag_64 flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
5798       __kmp_release_64(&flag);
5799     }
5800 
5801     // Terminate OS thread.
5802     __kmp_reap_worker(thread);
5803 
5804     // The thread was killed asynchronously.  If it was actively
5805     // spinning in the thread pool, decrement the global count.
5806     //
5807     // There is a small timing hole here - if the worker thread was just waking
    // up after sleeping in the pool, had reset its th_active_in_pool flag but
5809     // not decremented the global counter __kmp_thread_pool_active_nth yet, then
5810     // the global counter might not get updated.
5811     //
5812     // Currently, this can only happen as the library is unloaded,
5813     // so there are no harmful side effects.
5814     if (thread->th.th_active_in_pool) {
5815       thread->th.th_active_in_pool = FALSE;
5816       KMP_TEST_THEN_DEC32(&__kmp_thread_pool_active_nth);
5817       KMP_DEBUG_ASSERT(TCR_4(__kmp_thread_pool_active_nth) >= 0);
5818     }
5819 
5820     // Decrement # of [worker] threads in the pool.
5821     KMP_DEBUG_ASSERT(__kmp_thread_pool_nth > 0);
5822     --__kmp_thread_pool_nth;
5823   }
5824 
5825   __kmp_free_implicit_task(thread);
5826 
5827 // Free the fast memory for tasking
5828 #if USE_FAST_MEMORY
5829   __kmp_free_fast_memory(thread);
5830 #endif /* USE_FAST_MEMORY */
5831 
5832   __kmp_suspend_uninitialize_thread(thread);
5833 
5834   KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
5835   TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5836 
5837   --__kmp_all_nth;
// __kmp_nth was decremented when the thread was added to the pool.
5839 
5840 #ifdef KMP_ADJUST_BLOCKTIME
5841   /* Adjust blocktime back to user setting or default if necessary */
5842   /* Middle initialization might never have occurred                */
5843   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5844     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5845     if (__kmp_nth <= __kmp_avail_proc) {
5846       __kmp_zero_bt = FALSE;
5847     }
5848   }
5849 #endif /* KMP_ADJUST_BLOCKTIME */
5850 
5851   /* free the memory being used */
5852   if (__kmp_env_consistency_check) {
5853     if (thread->th.th_cons) {
5854       __kmp_free_cons_stack(thread->th.th_cons);
5855       thread->th.th_cons = NULL;
5856     }
5857   }
5858 
5859   if (thread->th.th_pri_common != NULL) {
5860     __kmp_free(thread->th.th_pri_common);
5861     thread->th.th_pri_common = NULL;
5862   }
5863 
5864   if (thread->th.th_task_state_memo_stack != NULL) {
5865     __kmp_free(thread->th.th_task_state_memo_stack);
5866     thread->th.th_task_state_memo_stack = NULL;
5867   }
5868 
5869 #if KMP_USE_BGET
5870   if (thread->th.th_local.bget_data != NULL) {
5871     __kmp_finalize_bget(thread);
5872   }
5873 #endif
5874 
5875 #if KMP_AFFINITY_SUPPORTED
5876   if (thread->th.th_affin_mask != NULL) {
5877     KMP_CPU_FREE(thread->th.th_affin_mask);
5878     thread->th.th_affin_mask = NULL;
5879   }
5880 #endif /* KMP_AFFINITY_SUPPORTED */
5881 
5882   __kmp_reap_team(thread->th.th_serial_team);
5883   thread->th.th_serial_team = NULL;
5884   __kmp_free(thread);
5885 
5886   KMP_MB();
5887 
5888 } // __kmp_reap_thread
5889 
5890 static void __kmp_internal_end(void) {
5891   int i;
5892 
5893   /* First, unregister the library */
5894   __kmp_unregister_library();
5895 
5896 #if KMP_OS_WINDOWS
5897   /* In Win static library, we can't tell when a root actually dies, so we
5898      reclaim the data structures for any root threads that have died but not
5899      unregistered themselves, in order to shut down cleanly.
5900      In Win dynamic library we also can't tell when a thread dies.  */
5901   __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
5902 // dead roots
5903 #endif
5904 
5905   for (i = 0; i < __kmp_threads_capacity; i++)
5906     if (__kmp_root[i])
5907       if (__kmp_root[i]->r.r_active)
5908         break;
5909   KMP_MB(); /* Flush all pending memory write invalidates.  */
5910   TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
5911 
5912   if (i < __kmp_threads_capacity) {
5913 #if KMP_USE_MONITOR
5914     // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
5915     KMP_MB(); /* Flush all pending memory write invalidates.  */
5916 
5917     // Need to check that monitor was initialized before reaping it. If we are
    // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
5919     // __kmp_monitor will appear to contain valid data, but it is only valid in
5920     // the parent process, not the child.
5921     // New behavior (201008): instead of keying off of the flag
5922     // __kmp_init_parallel, the monitor thread creation is keyed off
5923     // of the new flag __kmp_init_monitor.
5924     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
5925     if (TCR_4(__kmp_init_monitor)) {
5926       __kmp_reap_monitor(&__kmp_monitor);
5927       TCW_4(__kmp_init_monitor, 0);
5928     }
5929     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
5930     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
5931 #endif // KMP_USE_MONITOR
5932   } else {
5933 /* TODO move this to cleanup code */
5934 #ifdef KMP_DEBUG
5935     /* make sure that everything has properly ended */
5936     for (i = 0; i < __kmp_threads_capacity; i++) {
5937       if (__kmp_root[i]) {
5938         //                    KMP_ASSERT( ! KMP_UBER_GTID( i ) );         // AC:
5939         //                    there can be uber threads alive here
5940         KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
5941       }
5942     }
5943 #endif
5944 
5945     KMP_MB();
5946 
5947     // Reap the worker threads.
5948     // This is valid for now, but be careful if threads are reaped sooner.
    while (__kmp_thread_pool != NULL) { // Loop over all threads in the pool.
5950       // Get the next thread from the pool.
5951       kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
5952       __kmp_thread_pool = thread->th.th_next_pool;
5953       // Reap it.
5954       KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
5955       thread->th.th_next_pool = NULL;
5956       thread->th.th_in_pool = FALSE;
5957       __kmp_reap_thread(thread, 0);
5958     }
5959     __kmp_thread_pool_insert_pt = NULL;
5960 
5961     // Reap teams.
    while (__kmp_team_pool != NULL) { // Loop over all teams in the pool.
5963       // Get the next team from the pool.
5964       kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
5965       __kmp_team_pool = team->t.t_next_pool;
5966       // Reap it.
5967       team->t.t_next_pool = NULL;
5968       __kmp_reap_team(team);
5969     }
5970 
5971     __kmp_reap_task_teams();
5972 
5973     for (i = 0; i < __kmp_threads_capacity; ++i) {
5974       // TBD: Add some checking...
5975       // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
5976     }
5977 
5978     /* Make sure all threadprivate destructors get run by joining with all
5979        worker threads before resetting this flag */
5980     TCW_SYNC_4(__kmp_init_common, FALSE);
5981 
5982     KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
5983     KMP_MB();
5984 
5985 #if KMP_USE_MONITOR
5986     // See note above: One of the possible fixes for CQ138434 / CQ140126
5987     //
5988     // FIXME: push both code fragments down and CSE them?
5989     // push them into __kmp_cleanup() ?
5990     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
5991     if (TCR_4(__kmp_init_monitor)) {
5992       __kmp_reap_monitor(&__kmp_monitor);
5993       TCW_4(__kmp_init_monitor, 0);
5994     }
5995     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
5996     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
5997 #endif
5998   } /* else !__kmp_global.t_active */
5999   TCW_4(__kmp_init_gtid, FALSE);
6000   KMP_MB(); /* Flush all pending memory write invalidates.  */
6001 
6002   __kmp_cleanup();
6003 #if OMPT_SUPPORT
6004   ompt_fini();
6005 #endif
6006 }
6007 
6008 void __kmp_internal_end_library(int gtid_req) {
6009   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6010   /* this shouldn't be a race condition because __kmp_internal_end() is the
6011      only place to clear __kmp_serial_init */
6012   /* we'll check this later too, after we get the lock */
6013   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
  // redundant, because the next check will work in any case.
6015   if (__kmp_global.g.g_abort) {
6016     KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6017     /* TODO abort? */
6018     return;
6019   }
6020   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6021     KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6022     return;
6023   }
6024 
6025   KMP_MB(); /* Flush all pending memory write invalidates.  */
6026 
6027   /* find out who we are and what we should do */
6028   {
6029     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6030     KA_TRACE(
6031         10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req));
6032     if (gtid == KMP_GTID_SHUTDOWN) {
6033       KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6034                     "already shutdown\n"));
6035       return;
6036     } else if (gtid == KMP_GTID_MONITOR) {
6037       KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6038                     "registered, or system shutdown\n"));
6039       return;
6040     } else if (gtid == KMP_GTID_DNE) {
6041       KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6042                     "shutdown\n"));
6043       /* we don't know who we are, but we may still shutdown the library */
6044     } else if (KMP_UBER_GTID(gtid)) {
6045       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6046       if (__kmp_root[gtid]->r.r_active) {
6047         __kmp_global.g.g_abort = -1;
6048         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6049         KA_TRACE(10,
6050                  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6051                   gtid));
6052         return;
6053       } else {
6054         KA_TRACE(
6055             10,
6056             ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6057         __kmp_unregister_root_current_thread(gtid);
6058       }
6059     } else {
6060 /* worker threads may call this function through the atexit handler, if they
6061  * call exit() */
6062 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6063    TODO: do a thorough shutdown instead */
6064 #ifdef DUMP_DEBUG_ON_EXIT
6065       if (__kmp_debug_buf)
6066         __kmp_dump_debug_buffer();
6067 #endif
6068       return;
6069     }
6070   }
6071   /* synchronize the termination process */
6072   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6073 
6074   /* have we already finished */
6075   if (__kmp_global.g.g_abort) {
6076     KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6077     /* TODO abort? */
6078     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6079     return;
6080   }
6081   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6082     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6083     return;
6084   }
6085 
6086   /* We need this lock to enforce mutex between this reading of
6087      __kmp_threads_capacity and the writing by __kmp_register_root.
6088      Alternatively, we can use a counter of roots that is atomically updated by
6089      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6090      __kmp_internal_end_*.  */
6091   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6092 
6093   /* now we can safely conduct the actual termination */
6094   __kmp_internal_end();
6095 
6096   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6097   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6098 
6099   KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6100 
6101 #ifdef DUMP_DEBUG_ON_EXIT
6102   if (__kmp_debug_buf)
6103     __kmp_dump_debug_buffer();
6104 #endif
6105 
6106 #if KMP_OS_WINDOWS
6107   __kmp_close_console();
6108 #endif
6109 
6110   __kmp_fini_allocator();
6111 
6112 } // __kmp_internal_end_library
6113 
6114 void __kmp_internal_end_thread(int gtid_req) {
6115   int i;
6116 
6117   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6118   /* this shouldn't be a race condition because __kmp_internal_end() is the
6119    * only place to clear __kmp_serial_init */
6120   /* we'll check this later too, after we get the lock */
6121   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6122   // redundant, because the next check will work in any case.
6123   if (__kmp_global.g.g_abort) {
6124     KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6125     /* TODO abort? */
6126     return;
6127   }
6128   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6129     KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6130     return;
6131   }
6132 
6133   KMP_MB(); /* Flush all pending memory write invalidates.  */
6134 
6135   /* find out who we are and what we should do */
6136   {
6137     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6138     KA_TRACE(10,
6139              ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req));
6140     if (gtid == KMP_GTID_SHUTDOWN) {
6141       KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6142                     "already shutdown\n"));
6143       return;
6144     } else if (gtid == KMP_GTID_MONITOR) {
6145       KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6146                     "registered, or system shutdown\n"));
6147       return;
6148     } else if (gtid == KMP_GTID_DNE) {
6149       KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6150                     "shutdown\n"));
6151       return;
6152       /* we don't know who we are */
6153     } else if (KMP_UBER_GTID(gtid)) {
6154       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6155       if (__kmp_root[gtid]->r.r_active) {
6156         __kmp_global.g.g_abort = -1;
6157         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6158         KA_TRACE(10,
6159                  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6160                   gtid));
6161         return;
6162       } else {
6163         KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6164                       gtid));
6165         __kmp_unregister_root_current_thread(gtid);
6166       }
6167     } else {
6168       /* just a worker thread, let's leave */
6169       KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6170 
6171       if (gtid >= 0) {
6172         __kmp_threads[gtid]->th.th_task_team = NULL;
6173       }
6174 
6175       KA_TRACE(10,
6176                ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6177                 gtid));
6178       return;
6179     }
6180   }
6181 #if defined KMP_DYNAMIC_LIB
  // AC: let's not shut down the Linux* OS dynamic library at the exit of an
  // uber thread, because we will shut down later in the library destructor.
  // The reason for this change is a performance problem that occurs when a
  // non-OpenMP thread repeatedly forks and joins many OpenMP threads in a
  // loop. We can save a lot of time by keeping the worker threads alive until
  // program shutdown.
  // OM: Removed the Linux* OS restriction to fix the crash on OS X*
  // (DPD200239966) and Windows (DPD200287443) that occurs when using critical
  // sections from foreign threads.
6190   KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6191   return;
6192 #endif
6193   /* synchronize the termination process */
6194   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6195 
6196   /* have we already finished */
6197   if (__kmp_global.g.g_abort) {
6198     KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6199     /* TODO abort? */
6200     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6201     return;
6202   }
6203   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6204     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6205     return;
6206   }
6207 
6208   /* We need this lock to enforce mutex between this reading of
6209      __kmp_threads_capacity and the writing by __kmp_register_root.
6210      Alternatively, we can use a counter of roots that is atomically updated by
6211      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6212      __kmp_internal_end_*.  */
6213 
6214   /* should we finish the run-time?  are all siblings done? */
6215   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6216 
6217   for (i = 0; i < __kmp_threads_capacity; ++i) {
6218     if (KMP_UBER_GTID(i)) {
6219       KA_TRACE(
6220           10,
6221           ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6222       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6223       __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6224       return;
6225     }
6226   }
6227 
6228   /* now we can safely conduct the actual termination */
6229 
6230   __kmp_internal_end();
6231 
6232   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6233   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6234 
6235   KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6236 
6237 #ifdef DUMP_DEBUG_ON_EXIT
6238   if (__kmp_debug_buf)
6239     __kmp_dump_debug_buffer();
6240 #endif
6241 } // __kmp_internal_end_thread
6242 
6243 // -----------------------------------------------------------------------------
6244 // Library registration stuff.
6245 
6246 static long __kmp_registration_flag = 0;
6247 // Random value used to indicate library initialization.
6248 static char *__kmp_registration_str = NULL;
6249 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6250 
6251 static inline char *__kmp_reg_status_name() {
  /* On RHEL 3u5, if linked statically, getpid() returns different values in
     each thread. If registration and unregistration happen in different
     threads (omp_misc_other_root_exit.cpp test case), the registration env
     var cannot be found, because its name will contain a different pid. */
6256   return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
} // __kmp_reg_status_name
6258 
6259 void __kmp_register_library_startup(void) {
6260 
6261   char *name = __kmp_reg_status_name(); // Name of the environment variable.
6262   int done = 0;
6263   union {
6264     double dtime;
6265     long ltime;
6266   } time;
6267 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6268   __kmp_initialize_system_tick();
6269 #endif
6270   __kmp_read_system_time(&time.dtime);
6271   __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6272   __kmp_registration_str =
6273       __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6274                        __kmp_registration_flag, KMP_LIBRARY_FILE);
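  // For illustration only (hypothetical values): the resulting value looks
  // like
  //   __KMP_REGISTERED_LIB_12345=0x7f2a1c0e4040-cafe01ab-libomp.so
  // i.e. "<address of __kmp_registration_flag>-<flag value in hex>-<library
  // file name>". The parsing code below splits this on '-' to decide whether a
  // previously registered copy of the runtime is still alive.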
6275 
6276   KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6277                 __kmp_registration_str));
6278 
6279   while (!done) {
6280 
6281     char *value = NULL; // Actual value of the environment variable.
6282 
    // Set the environment variable, but do not overwrite it if it already
    // exists.
    __kmp_env_set(name, __kmp_registration_str, 0);
    // Check that the variable was actually written.
6286     value = __kmp_env_get(name);
6287     if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6288 
6289       done = 1; // Ok, environment variable set successfully, exit the loop.
6290 
6291     } else {
6292 
      // Oops. The write failed. Another copy of the OpenMP RTL is in memory.
      // Check whether it is alive or dead.
6295       int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6296       char *tail = value;
6297       char *flag_addr_str = NULL;
6298       char *flag_val_str = NULL;
6299       char const *file_name = NULL;
6300       __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6301       __kmp_str_split(tail, '-', &flag_val_str, &tail);
6302       file_name = tail;
6303       if (tail != NULL) {
6304         long *flag_addr = 0;
6305         long flag_val = 0;
6306         KMP_SSCANF(flag_addr_str, "%p", &flag_addr);
6307         KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6308         if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6309           // First, check whether environment-encoded address is mapped into
6310           // addr space.
6311           // If so, dereference it to see if it still has the right value.
6312           if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6313             neighbor = 1;
6314           } else {
6315             // If not, then we know the other copy of the library is no longer
6316             // running.
6317             neighbor = 2;
6318           }
6319         }
6320       }
6321       switch (neighbor) {
6322       case 0: // Cannot parse environment variable -- neighbor status unknown.
        // Assume it is the incompatible format of a future version of the
        // library, and assume the other library is alive.
        // WARN( ... ); // TODO: Issue a warning.
        file_name = "unknown library";
      // Attention! Falling through to the next case is intentional.
6328       case 1: { // Neighbor is alive.
        // Check whether a duplicate library is allowed.
6330         char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6331         if (!__kmp_str_match_true(duplicate_ok)) {
6332           // That's not allowed. Issue fatal error.
6333           __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6334                       KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6335         }
6336         KMP_INTERNAL_FREE(duplicate_ok);
6337         __kmp_duplicate_library_ok = 1;
6338         done = 1; // Exit the loop.
6339       } break;
6340       case 2: { // Neighbor is dead.
6341         // Clear the variable and try to register library again.
6342         __kmp_env_unset(name);
6343       } break;
6344       default: { KMP_DEBUG_ASSERT(0); } break;
6345       }
6346     }
6347     KMP_INTERNAL_FREE((void *)value);
6348   }
6349   KMP_INTERNAL_FREE((void *)name);
6350 
6351 } // func __kmp_register_library_startup
6352 
6353 void __kmp_unregister_library(void) {
6354 
6355   char *name = __kmp_reg_status_name();
6356   char *value = __kmp_env_get(name);
6357 
6358   KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6359   KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6360   if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6361     // Ok, this is our variable. Delete it.
6362     __kmp_env_unset(name);
6363   }
6364 
6365   KMP_INTERNAL_FREE(__kmp_registration_str);
6366   KMP_INTERNAL_FREE(value);
6367   KMP_INTERNAL_FREE(name);
6368 
6369   __kmp_registration_flag = 0;
6370   __kmp_registration_str = NULL;
6371 
6372 } // __kmp_unregister_library
6373 
6374 // End of Library registration stuff.
6375 // -----------------------------------------------------------------------------
6376 
6377 #if KMP_MIC_SUPPORTED
6378 
6379 static void __kmp_check_mic_type() {
6380   kmp_cpuid_t cpuid_state = {0};
6381   kmp_cpuid_t *cs_p = &cpuid_state;
6382   __kmp_x86_cpuid(1, 0, cs_p);
6383   // We don't support mic1 at the moment
6384   if ((cs_p->eax & 0xff0) == 0xB10) {
6385     __kmp_mic_type = mic2;
6386   } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6387     __kmp_mic_type = mic3;
6388   } else {
6389     __kmp_mic_type = non_mic;
6390   }
6391 }
6392 
6393 #endif /* KMP_MIC_SUPPORTED */
6394 
6395 static void __kmp_do_serial_initialize(void) {
6396   int i, gtid;
6397   int size;
6398 
6399   KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6400 
6401   KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6402   KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6403   KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6404   KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6405   KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6406 
6407 #if OMPT_SUPPORT
6408   ompt_pre_init();
6409 #endif
6410 
6411   __kmp_validate_locks();
6412 
6413   /* Initialize internal memory allocator */
6414   __kmp_init_allocator();
6415 
6416   /* Register the library startup via an environment variable and check to see
6417      whether another copy of the library is already registered. */
6418 
6419   __kmp_register_library_startup();
6420 
6421   /* TODO reinitialization of library */
6422   if (TCR_4(__kmp_global.g.g_done)) {
6423     KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6424   }
6425 
6426   __kmp_global.g.g_abort = 0;
6427   TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6428 
6429 /* initialize the locks */
6430 #if KMP_USE_ADAPTIVE_LOCKS
6431 #if KMP_DEBUG_ADAPTIVE_LOCKS
6432   __kmp_init_speculative_stats();
6433 #endif
6434 #endif
6435 #if KMP_STATS_ENABLED
6436   __kmp_stats_init();
6437 #endif
6438   __kmp_init_lock(&__kmp_global_lock);
6439   __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6440   __kmp_init_lock(&__kmp_debug_lock);
6441   __kmp_init_atomic_lock(&__kmp_atomic_lock);
6442   __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6443   __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6444   __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6445   __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6446   __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6447   __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6448   __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6449   __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6450   __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6451   __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6452   __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6453   __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6454   __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6455   __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6456 #if KMP_USE_MONITOR
6457   __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6458 #endif
6459   __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6460 
6461   /* conduct initialization and initial setup of configuration */
6462 
6463   __kmp_runtime_initialize();
6464 
6465 #if KMP_MIC_SUPPORTED
6466   __kmp_check_mic_type();
6467 #endif
6468 
6469 // Some global variable initialization moved here from kmp_env_initialize()
6470 #ifdef KMP_DEBUG
6471   kmp_diag = 0;
6472 #endif
6473   __kmp_abort_delay = 0;
6474 
6475   // From __kmp_init_dflt_team_nth()
6476   /* assume the entire machine will be used */
6477   __kmp_dflt_team_nth_ub = __kmp_xproc;
6478   if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6479     __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6480   }
6481   if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6482     __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6483   }
6484   __kmp_max_nth = __kmp_sys_max_nth;
6485   __kmp_cg_max_nth = __kmp_sys_max_nth;
6486   __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6487   if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6488     __kmp_teams_max_nth = __kmp_sys_max_nth;
6489   }
6490 
6491   // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6492   // part
6493   __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6494 #if KMP_USE_MONITOR
6495   __kmp_monitor_wakeups =
6496       KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6497   __kmp_bt_intervals =
6498       KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6499 #endif
6500   // From "KMP_LIBRARY" part of __kmp_env_initialize()
6501   __kmp_library = library_throughput;
6502   // From KMP_SCHEDULE initialization
6503   __kmp_static = kmp_sch_static_balanced;
// AC: do not use analytical here, because it is non-monotonic
6505 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6506 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6507 // need to repeat assignment
6508 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6509 // bit control and barrier method control parts
6510 #if KMP_FAST_REDUCTION_BARRIER
6511 #define kmp_reduction_barrier_gather_bb ((int)1)
6512 #define kmp_reduction_barrier_release_bb ((int)1)
6513 #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6514 #define kmp_reduction_barrier_release_pat bp_hyper_bar
6515 #endif // KMP_FAST_REDUCTION_BARRIER
6516   for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6517     __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6518     __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6519     __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6520     __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6521 #if KMP_FAST_REDUCTION_BARRIER
6522     if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
6523       // lin_64 ): hyper,1
6524       __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6525       __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6526       __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6527       __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6528     }
6529 #endif // KMP_FAST_REDUCTION_BARRIER
6530   }
6531 #if KMP_FAST_REDUCTION_BARRIER
6532 #undef kmp_reduction_barrier_release_pat
6533 #undef kmp_reduction_barrier_gather_pat
6534 #undef kmp_reduction_barrier_release_bb
6535 #undef kmp_reduction_barrier_gather_bb
6536 #endif // KMP_FAST_REDUCTION_BARRIER
6537 #if KMP_MIC_SUPPORTED
6538   if (__kmp_mic_type == mic2) { // KNC
6539     // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6540     __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6541     __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6542         1; // forkjoin release
6543     __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6544     __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6545   }
6546 #if KMP_FAST_REDUCTION_BARRIER
6547   if (__kmp_mic_type == mic2) { // KNC
6548     __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6549     __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6550   }
6551 #endif // KMP_FAST_REDUCTION_BARRIER
6552 #endif // KMP_MIC_SUPPORTED
6553 
6554 // From KMP_CHECKS initialization
6555 #ifdef KMP_DEBUG
6556   __kmp_env_checks = TRUE; /* development versions have the extra checks */
6557 #else
6558   __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6559 #endif
6560 
6561   // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6562   __kmp_foreign_tp = TRUE;
6563 
6564   __kmp_global.g.g_dynamic = FALSE;
6565   __kmp_global.g.g_dynamic_mode = dynamic_default;
6566 
6567   __kmp_env_initialize(NULL);
6568 
6569 // Print all messages in message catalog for testing purposes.
6570 #ifdef KMP_DEBUG
6571   char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6572   if (__kmp_str_match_true(val)) {
6573     kmp_str_buf_t buffer;
6574     __kmp_str_buf_init(&buffer);
6575     __kmp_i18n_dump_catalog(&buffer);
6576     __kmp_printf("%s", buffer.str);
6577     __kmp_str_buf_free(&buffer);
6578   }
6579   __kmp_env_free(&val);
6580 #endif
6581 
6582   __kmp_threads_capacity =
6583       __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6584   // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6585   __kmp_tp_capacity = __kmp_default_tp_capacity(
6586       __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6587 
6588   // If the library is shut down properly, both pools must be NULL. Just in
6589   // case, set them to NULL -- some memory may leak, but subsequent code will
6590   // work even if pools are not freed.
6591   KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6592   KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6593   KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6594   __kmp_thread_pool = NULL;
6595   __kmp_thread_pool_insert_pt = NULL;
6596   __kmp_team_pool = NULL;
6597 
6598   /* Allocate all of the variable sized records */
6599   /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6600    * expandable */
6601   /* Since allocation is cache-aligned, just add extra padding at the end */
6602   size =
6603       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6604       CACHE_LINE;
6605   __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6606   __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6607                                sizeof(kmp_info_t *) * __kmp_threads_capacity);
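  // Illustrative layout of the single cache-aligned allocation above (not to
  // scale):
  //
  //   [ kmp_info_t *[capacity] | kmp_root_t *[capacity] | CACHE_LINE padding ]
  //   ^ __kmp_threads            ^ __kmp_root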
6608 
6609   /* init thread counts */
6610   KMP_DEBUG_ASSERT(__kmp_all_nth ==
6611                    0); // Asserts fail if the library is reinitializing and
6612   KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
6613   __kmp_all_nth = 0;
6614   __kmp_nth = 0;
6615 
6616   /* setup the uber master thread and hierarchy */
6617   gtid = __kmp_register_root(TRUE);
6618   KA_TRACE(10, ("__kmp_do_serial_initialize  T#%d\n", gtid));
6619   KMP_ASSERT(KMP_UBER_GTID(gtid));
6620   KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6621 
6622   KMP_MB(); /* Flush all pending memory write invalidates.  */
6623 
6624   __kmp_common_initialize();
6625 
6626 #if KMP_OS_UNIX
6627   /* invoke the child fork handler */
6628   __kmp_register_atfork();
6629 #endif
6630 
6631 #if !defined KMP_DYNAMIC_LIB
6632   {
6633     /* Invoke the exit handler when the program finishes, only for static
6634        library. For dynamic library, we already have _fini and DllMain. */
6635     int rc = atexit(__kmp_internal_end_atexit);
6636     if (rc != 0) {
6637       __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6638                   __kmp_msg_null);
6639     }
6640   }
6641 #endif
6642 
6643 #if KMP_HANDLE_SIGNALS
6644 #if KMP_OS_UNIX
6645   /* NOTE: make sure that this is called before the user installs their own
6646      signal handlers so that the user handlers are called first. this way they
6647      can return false, not call our handler, avoid terminating the library, and
6648      continue execution where they left off. */
6649   __kmp_install_signals(FALSE);
6650 #endif /* KMP_OS_UNIX */
6651 #if KMP_OS_WINDOWS
6652   __kmp_install_signals(TRUE);
6653 #endif /* KMP_OS_WINDOWS */
6654 #endif
6655 
6656   /* we have finished the serial initialization */
6657   __kmp_init_counter++;
6658 
6659   __kmp_init_serial = TRUE;
6660 
6661   if (__kmp_settings) {
6662     __kmp_env_print();
6663   }
6664 
6665 #if OMP_40_ENABLED
6666   if (__kmp_display_env || __kmp_display_env_verbose) {
6667     __kmp_env_print_2();
6668   }
6669 #endif // OMP_40_ENABLED
6670 
6671 #if OMPT_SUPPORT
6672   ompt_post_init();
6673 #endif
6674 
6675   KMP_MB();
6676 
6677   KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
6678 }
6679 
6680 void __kmp_serial_initialize(void) {
6681   if (__kmp_init_serial) {
6682     return;
6683   }
6684   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6685   if (__kmp_init_serial) {
6686     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6687     return;
6688   }
6689   __kmp_do_serial_initialize();
6690   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6691 }
6692 
6693 static void __kmp_do_middle_initialize(void) {
6694   int i, j;
6695   int prev_dflt_team_nth;
6696 
6697   if (!__kmp_init_serial) {
6698     __kmp_do_serial_initialize();
6699   }
6700 
6701   KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
6702 
6703   // Save the previous value for the __kmp_dflt_team_nth so that
6704   // we can avoid some reinitialization if it hasn't changed.
6705   prev_dflt_team_nth = __kmp_dflt_team_nth;
6706 
6707 #if KMP_AFFINITY_SUPPORTED
6708   // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6709   // number of cores on the machine.
6710   __kmp_affinity_initialize();
6711 
6712   // Run through the __kmp_threads array and set the affinity mask
6713   // for each root thread that is currently registered with the RTL.
6714   for (i = 0; i < __kmp_threads_capacity; i++) {
6715     if (TCR_PTR(__kmp_threads[i]) != NULL) {
6716       __kmp_affinity_set_init_mask(i, TRUE);
6717     }
6718   }
6719 #endif /* KMP_AFFINITY_SUPPORTED */
6720 
6721   KMP_ASSERT(__kmp_xproc > 0);
6722   if (__kmp_avail_proc == 0) {
6723     __kmp_avail_proc = __kmp_xproc;
6724   }
6725 
6726   // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
6727   // correct them now
6728   j = 0;
6729   while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
6730     __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
6731         __kmp_avail_proc;
6732     j++;
6733   }
6734 
6735   if (__kmp_dflt_team_nth == 0) {
6736 #ifdef KMP_DFLT_NTH_CORES
6737     // Default #threads = #cores
6738     __kmp_dflt_team_nth = __kmp_ncores;
6739     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6740                   "__kmp_ncores (%d)\n",
6741                   __kmp_dflt_team_nth));
6742 #else
6743     // Default #threads = #available OS procs
6744     __kmp_dflt_team_nth = __kmp_avail_proc;
6745     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6746                   "__kmp_avail_proc(%d)\n",
6747                   __kmp_dflt_team_nth));
6748 #endif /* KMP_DFLT_NTH_CORES */
6749   }
6750 
6751   if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
6752     __kmp_dflt_team_nth = KMP_MIN_NTH;
6753   }
6754   if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
6755     __kmp_dflt_team_nth = __kmp_sys_max_nth;
6756   }
6757 
6758   // There's no harm in continuing if the following check fails,
6759   // but it indicates an error in the previous logic.
6760   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
6761 
6762   if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
6763     // Run through the __kmp_threads array and set the num threads icv for each
6764     // root thread that is currently registered with the RTL (which has not
6765     // already explicitly set its nthreads-var with a call to
6766     // omp_set_num_threads()).
6767     for (i = 0; i < __kmp_threads_capacity; i++) {
6768       kmp_info_t *thread = __kmp_threads[i];
6769       if (thread == NULL)
6770         continue;
6771       if (thread->th.th_current_task->td_icvs.nproc != 0)
6772         continue;
6773 
6774       set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
6775     }
6776   }
6777   KA_TRACE(
6778       20,
6779       ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
6780        __kmp_dflt_team_nth));
6781 
6782 #ifdef KMP_ADJUST_BLOCKTIME
6783   /* Adjust blocktime to zero if necessary  now that __kmp_avail_proc is set */
6784   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6785     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6786     if (__kmp_nth > __kmp_avail_proc) {
6787       __kmp_zero_bt = TRUE;
6788     }
6789   }
6790 #endif /* KMP_ADJUST_BLOCKTIME */
6791 
6792   /* we have finished middle initialization */
6793   TCW_SYNC_4(__kmp_init_middle, TRUE);
6794 
6795   KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
6796 }
6797 
6798 void __kmp_middle_initialize(void) {
6799   if (__kmp_init_middle) {
6800     return;
6801   }
6802   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6803   if (__kmp_init_middle) {
6804     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6805     return;
6806   }
6807   __kmp_do_middle_initialize();
6808   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6809 }
6810 
6811 void __kmp_parallel_initialize(void) {
6812   int gtid = __kmp_entry_gtid(); // this might be a new root
6813 
6814   /* synchronize parallel initialization (for sibling) */
6815   if (TCR_4(__kmp_init_parallel))
6816     return;
6817   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6818   if (TCR_4(__kmp_init_parallel)) {
6819     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6820     return;
6821   }
6822 
6823   /* TODO reinitialization after we have already shut down */
6824   if (TCR_4(__kmp_global.g.g_done)) {
6825     KA_TRACE(
6826         10,
6827         ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
6828     __kmp_infinite_loop();
6829   }
6830 
6831   /* jc: The lock __kmp_initz_lock is already held, so calling
6832      __kmp_serial_initialize would cause a deadlock.  So we call
6833      __kmp_do_serial_initialize directly. */
6834   if (!__kmp_init_middle) {
6835     __kmp_do_middle_initialize();
6836   }
6837 
6838   /* begin initialization */
6839   KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
6840   KMP_ASSERT(KMP_UBER_GTID(gtid));
6841 
6842 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6843   // Save the FP control regs.
6844   // Worker threads will set theirs to these values at thread startup.
6845   __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
6846   __kmp_store_mxcsr(&__kmp_init_mxcsr);
6847   __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
6848 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
6849 
6850 #if KMP_OS_UNIX
6851 #if KMP_HANDLE_SIGNALS
6852   /*  must be after __kmp_serial_initialize  */
6853   __kmp_install_signals(TRUE);
6854 #endif
6855 #endif
6856 
6857   __kmp_suspend_initialize();
6858 
6859 #if defined(USE_LOAD_BALANCE)
6860   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6861     __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
6862   }
6863 #else
6864   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6865     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
6866   }
6867 #endif
6868 
6869   if (__kmp_version) {
6870     __kmp_print_version_2();
6871   }
6872 
6873   /* we have finished parallel initialization */
6874   TCW_SYNC_4(__kmp_init_parallel, TRUE);
6875 
6876   KMP_MB();
6877   KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
6878 
6879   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6880 }
6881 
6882 /* ------------------------------------------------------------------------ */
6883 
6884 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
6885                                    kmp_team_t *team) {
6886   kmp_disp_t *dispatch;
6887 
6888   KMP_MB();
6889 
6890   /* none of the threads have encountered any constructs, yet. */
6891   this_thr->th.th_local.this_construct = 0;
6892 #if KMP_CACHE_MANAGE
6893   KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
6894 #endif /* KMP_CACHE_MANAGE */
6895   dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
6896   KMP_DEBUG_ASSERT(dispatch);
6897   KMP_DEBUG_ASSERT(team->t.t_dispatch);
6898   // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
6899   // this_thr->th.th_info.ds.ds_tid ] );
6900 
6901   dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
6902 #if OMP_45_ENABLED
6903   dispatch->th_doacross_buf_idx =
6904       0; /* reset the doacross dispatch buffer counter */
6905 #endif
6906   if (__kmp_env_consistency_check)
6907     __kmp_push_parallel(gtid, team->t.t_ident);
6908 
6909   KMP_MB(); /* Flush all pending memory write invalidates.  */
6910 }
6911 
6912 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
6913                                   kmp_team_t *team) {
6914   if (__kmp_env_consistency_check)
6915     __kmp_pop_parallel(gtid, team->t.t_ident);
6916 
6917   __kmp_finish_implicit_task(this_thr);
6918 }
6919 
6920 int __kmp_invoke_task_func(int gtid) {
6921   int rc;
6922   int tid = __kmp_tid_from_gtid(gtid);
6923   kmp_info_t *this_thr = __kmp_threads[gtid];
6924   kmp_team_t *team = this_thr->th.th_team;
6925 
6926   __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
6927 #if USE_ITT_BUILD
6928   if (__itt_stack_caller_create_ptr) {
6929     __kmp_itt_stack_callee_enter(
6930         (__itt_caller)
6931             team->t.t_stack_id); // inform ittnotify about entering user's code
6932   }
6933 #endif /* USE_ITT_BUILD */
6934 #if INCLUDE_SSC_MARKS
6935   SSC_MARK_INVOKING();
6936 #endif
6937 
6938 #if OMPT_SUPPORT
6939   void *dummy;
6940   void **exit_runtime_p;
6941   ompt_data_t *my_task_data;
6942   ompt_data_t *my_parallel_data;
6943   int ompt_team_size;
6944 
6945   if (ompt_enabled.enabled) {
6946     exit_runtime_p = &(
6947         team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_frame);
6948   } else {
6949     exit_runtime_p = &dummy;
6950   }
6951 
6952   my_task_data =
6953       &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
6954   my_parallel_data = &(team->t.ompt_team_info.parallel_data);
6955   if (ompt_enabled.ompt_callback_implicit_task) {
6956     ompt_team_size = team->t.t_nproc;
6957     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
6958         ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
6959         __kmp_tid_from_gtid(gtid));
6960   }
6961 #endif
6962 
6963   {
6964     KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
6965     KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
6966     rc =
6967         __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
6968                                tid, (int)team->t.t_argc, (void **)team->t.t_argv
6969 #if OMPT_SUPPORT
6970                                ,
6971                                exit_runtime_p
6972 #endif
6973                                );
6974 #if OMPT_SUPPORT
6975     *exit_runtime_p = NULL;
6976 #endif
6977   }
6978 
6979 #if USE_ITT_BUILD
6980   if (__itt_stack_caller_create_ptr) {
6981     __kmp_itt_stack_callee_leave(
6982         (__itt_caller)
6983             team->t.t_stack_id); // inform ittnotify about leaving user's code
6984   }
6985 #endif /* USE_ITT_BUILD */
6986   __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
6987 
6988   return rc;
6989 }
6990 
6991 #if OMP_40_ENABLED
6992 void __kmp_teams_master(int gtid) {
6993   // This routine is called by all master threads in teams construct
6994   kmp_info_t *thr = __kmp_threads[gtid];
6995   kmp_team_t *team = thr->th.th_team;
6996   ident_t *loc = team->t.t_ident;
6997   thr->th.th_set_nproc = thr->th.th_teams_size.nth;
6998   KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
6999   KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7000   KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7001                 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7002 // Launch the league of teams now, but do not let the workers execute
7003 // (they hang on the fork barrier until the next parallel region)
7004 #if INCLUDE_SSC_MARKS
7005   SSC_MARK_FORKING();
7006 #endif
7007   __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7008                   (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7009                   VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7010 #if INCLUDE_SSC_MARKS
7011   SSC_MARK_JOINING();
7012 #endif
7013 
7014   // AC: last parameter "1" eliminates the join barrier, which won't work
7015   // because worker threads are in a fork barrier waiting for more parallel regions
7016   __kmp_join_call(loc, gtid
7017 #if OMPT_SUPPORT
7018                   ,
7019                   fork_context_intel
7020 #endif
7021                   ,
7022                   1);
7023 }
7024 
7025 int __kmp_invoke_teams_master(int gtid) {
7026   kmp_info_t *this_thr = __kmp_threads[gtid];
7027   kmp_team_t *team = this_thr->th.th_team;
7028 #if KMP_DEBUG
7029   if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7030     KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7031                      (void *)__kmp_teams_master);
7032 #endif
7033   __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7034   __kmp_teams_master(gtid);
7035   __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7036   return 1;
7037 }
7038 #endif /* OMP_40_ENABLED */
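// Illustrative sketch (not part of the runtime, added for exposition): a user
// teams region such as
//
//   #pragma omp teams num_teams(2) thread_limit(4)
//   #pragma omp parallel
//   { /* ... */ }
//
// is, roughly speaking, compiled so that the initial thread of each team runs
// __kmp_invoke_teams_master() / __kmp_teams_master() above as the outer
// microtask, which then forks the inner parallel region using the sizes
// recorded in th_teams_size.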
7039 
7040 /* This sets the requested number of threads for the next parallel region
7041    encountered by this team. Since this should be enclosed in the forkjoin
7042    critical section, it avoids race conditions with asymmetric nested
7043    parallelism. */
7044 
7045 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7046   kmp_info_t *thr = __kmp_threads[gtid];
7047 
7048   if (num_threads > 0)
7049     thr->th.th_set_nproc = num_threads;
7050 }
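// Illustrative sketch (not part of the runtime): a num_threads clause is what
// typically feeds this request, via compiler-generated code equivalent to
//
//   __kmpc_push_num_threads(&loc, gtid, 4);  // #pragma omp parallel num_threads(4)
//
// so that the next fork started by this thread uses th_set_nproc == 4.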
7051 
7052 #if OMP_40_ENABLED
7053 
7054 /* This sets the requested number of teams for the teams region and/or
7055    the number of threads for the next parallel region encountered. */
7056 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7057                           int num_threads) {
7058   kmp_info_t *thr = __kmp_threads[gtid];
7059   KMP_DEBUG_ASSERT(num_teams >= 0);
7060   KMP_DEBUG_ASSERT(num_threads >= 0);
7061 
7062   if (num_teams == 0)
7063     num_teams = 1; // default number of teams is 1.
7064   if (num_teams > __kmp_teams_max_nth) { // too many teams requested?
7065     if (!__kmp_reserve_warn) {
7066       __kmp_reserve_warn = 1;
7067       __kmp_msg(kmp_ms_warning,
7068                 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7069                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7070     }
7071     num_teams = __kmp_teams_max_nth;
7072   }
7073   // Set number of teams (number of threads in the outer "parallel" of the
7074   // teams)
7075   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7076 
7077   // Remember the number of threads for inner parallel regions
7078   if (num_threads == 0) {
7079     if (!TCR_4(__kmp_init_middle))
7080       __kmp_middle_initialize(); // get __kmp_avail_proc calculated
7081     num_threads = __kmp_avail_proc / num_teams;
7082     if (num_teams * num_threads > __kmp_teams_max_nth) {
7083       // adjust num_threads without a warning since it is not a user setting
7084       num_threads = __kmp_teams_max_nth / num_teams;
7085     }
7086   } else {
7087     if (num_teams * num_threads > __kmp_teams_max_nth) {
7088       int new_threads = __kmp_teams_max_nth / num_teams;
7089       if (!__kmp_reserve_warn) { // user asked for too many threads
7090         __kmp_reserve_warn = 1; // that conflicts with KMP_TEAMS_THREAD_LIMIT
7091         __kmp_msg(kmp_ms_warning,
7092                   KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7093                   KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7094       }
7095       num_threads = new_threads;
7096     }
7097   }
7098   thr->th.th_teams_size.nth = num_threads;
7099 }
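// Worked example of the sizing logic above (illustrative numbers only):
// with num_teams = 4, num_threads = 0, __kmp_avail_proc = 16 and
// __kmp_teams_max_nth = 32, the default per-team size is 16 / 4 = 4 threads;
// 4 * 4 <= 32, so no further clamping is needed and th_teams_size ends up as
// { nteams = 4, nth = 4 }.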
7100 
7101 // Set the proc_bind var to use in the following parallel region.
7102 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7103   kmp_info_t *thr = __kmp_threads[gtid];
7104   thr->th.th_set_proc_bind = proc_bind;
7105 }
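// Illustrative sketch (not part of the runtime): a proc_bind clause on the
// next parallel region, e.g.
//
//   #pragma omp parallel proc_bind(close)
//
// typically reaches this routine via the compiler-generated call
// __kmpc_push_proc_bind(&loc, gtid, proc_bind_close).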
7106 
7107 #endif /* OMP_40_ENABLED */
7108 
7109 /* Launch the worker threads into the microtask. */
7110 
7111 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7112   kmp_info_t *this_thr = __kmp_threads[gtid];
7113 
7114 #ifdef KMP_DEBUG
7115   int f;
7116 #endif /* KMP_DEBUG */
7117 
7118   KMP_DEBUG_ASSERT(team);
7119   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7120   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7121   KMP_MB(); /* Flush all pending memory write invalidates.  */
7122 
7123   team->t.t_construct = 0; /* no single directives seen yet */
7124   team->t.t_ordered.dt.t_value =
7125       0; /* thread 0 enters the ordered section first */
7126 
7127   /* Reset the identifiers on the dispatch buffer */
7128   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7129   if (team->t.t_max_nproc > 1) {
7130     int i;
7131     for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7132       team->t.t_disp_buffer[i].buffer_index = i;
7133 #if OMP_45_ENABLED
7134       team->t.t_disp_buffer[i].doacross_buf_idx = i;
7135 #endif
7136     }
7137   } else {
7138     team->t.t_disp_buffer[0].buffer_index = 0;
7139 #if OMP_45_ENABLED
7140     team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7141 #endif
7142   }
7143 
7144   KMP_MB(); /* Flush all pending memory write invalidates.  */
7145   KMP_ASSERT(this_thr->th.th_team == team);
7146 
7147 #ifdef KMP_DEBUG
7148   for (f = 0; f < team->t.t_nproc; f++) {
7149     KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7150                      team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7151   }
7152 #endif /* KMP_DEBUG */
7153 
7154   /* release the worker threads so they may begin working */
7155   __kmp_fork_barrier(gtid, 0);
7156 }
7157 
7158 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7159   kmp_info_t *this_thr = __kmp_threads[gtid];
7160 
7161   KMP_DEBUG_ASSERT(team);
7162   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7163   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7164   KMP_MB(); /* Flush all pending memory write invalidates.  */
7165 
7166 /* Join barrier after fork */
7167 
7168 #ifdef KMP_DEBUG
7169   if (__kmp_threads[gtid] &&
7170       __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7171     __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7172                  __kmp_threads[gtid]);
7173     __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7174                  "team->t.t_nproc=%d\n",
7175                  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7176                  team->t.t_nproc);
7177     __kmp_print_structure();
7178   }
7179   KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7180                    __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7181 #endif /* KMP_DEBUG */
7182 
7183   __kmp_join_barrier(gtid); /* wait for everyone */
7184 #if OMPT_SUPPORT
7185   int ds_tid = this_thr->th.th_info.ds.ds_tid;
7186   if (this_thr->th.ompt_thread_info.state == omp_state_wait_barrier_implicit) {
7187     ompt_data_t *tId = OMPT_CUR_TASK_DATA(this_thr);
7188     ompt_data_t *pId = OMPT_CUR_TEAM_DATA(this_thr);
7189     this_thr->th.ompt_thread_info.state = omp_state_overhead;
7190 #if OMPT_OPTIONAL
7191     void *codeptr = NULL;
7192     if (KMP_MASTER_TID(ds_tid) &&
7193         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7194          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7195       codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7196 
7197     if (ompt_enabled.ompt_callback_sync_region_wait) {
7198       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7199           ompt_sync_region_barrier, ompt_scope_end, pId, tId, codeptr);
7200     }
7201     if (ompt_enabled.ompt_callback_sync_region) {
7202       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7203           ompt_sync_region_barrier, ompt_scope_end, pId, tId, codeptr);
7204     }
7205 #endif
7206     if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7207       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7208           ompt_scope_end, NULL, tId, 0, ds_tid);
7209     }
7210     // return to idle state
7211     this_thr->th.ompt_thread_info.state = omp_state_overhead;
7212   }
7213 #endif
7214 
7215   KMP_MB(); /* Flush all pending memory write invalidates.  */
7216   KMP_ASSERT(this_thr->th.th_team == team);
7217 }
7218 
7219 /* ------------------------------------------------------------------------ */
7220 
7221 #ifdef USE_LOAD_BALANCE
7222 
7223 // Return the number of worker threads actively spinning in the hot team,
7224 // if we are at the outermost level of parallelism.  Otherwise, return 0.
7225 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7226   int i;
7227   int retval;
7228   kmp_team_t *hot_team;
7229 
7230   if (root->r.r_active) {
7231     return 0;
7232   }
7233   hot_team = root->r.r_hot_team;
7234   if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7235     return hot_team->t.t_nproc - 1; // Don't count master thread
7236   }
7237 
7238   // Skip the master thread - it is accounted for elsewhere.
7239   retval = 0;
7240   for (i = 1; i < hot_team->t.t_nproc; i++) {
7241     if (hot_team->t.t_threads[i]->th.th_active) {
7242       retval++;
7243     }
7244   }
7245   return retval;
7246 }
7247 
7248 // Perform an automatic adjustment to the number of
7249 // threads used by the next parallel region.
7250 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7251   int retval;
7252   int pool_active;
7253   int hot_team_active;
7254   int team_curr_active;
7255   int system_active;
7256 
7257   KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7258                 set_nproc));
7259   KMP_DEBUG_ASSERT(root);
7260   KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7261                        ->th.th_current_task->td_icvs.dynamic == TRUE);
7262   KMP_DEBUG_ASSERT(set_nproc > 1);
7263 
7264   if (set_nproc == 1) {
7265     KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7266     return 1;
7267   }
7268 
7269   // Threads that are active in the thread pool, active in the hot team for this
7270   // particular root (if we are at the outer par level), and the currently
7271   // executing thread (to become the master) are available to add to the new
7272   // team, but are currently contributing to the system load, and must be
7273   // accounted for.
7274   pool_active = TCR_4(__kmp_thread_pool_active_nth);
7275   hot_team_active = __kmp_active_hot_team_nproc(root);
7276   team_curr_active = pool_active + hot_team_active + 1;
7277 
7278   // Check the system load.
7279   system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7280   KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7281                 "hot team active = %d\n",
7282                 system_active, pool_active, hot_team_active));
7283 
7284   if (system_active < 0) {
7285     // There was an error reading the necessary info from /proc, so use the
7286     // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7287     // = dynamic_thread_limit, we shouldn't wind up getting back here.
7288     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7289     KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7290 
7291     // Make this call behave like the thread limit algorithm.
7292     retval = __kmp_avail_proc - __kmp_nth +
7293              (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7294     if (retval > set_nproc) {
7295       retval = set_nproc;
7296     }
7297     if (retval < KMP_MIN_NTH) {
7298       retval = KMP_MIN_NTH;
7299     }
7300 
7301     KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7302                   retval));
7303     return retval;
7304   }
7305 
7306   // There is a slight delay in the load balance algorithm in detecting newly
7307   // running procs. The real system load at this instant should be at least as
7308   // large as the number of active OMP threads available to add to the team.
7309   if (system_active < team_curr_active) {
7310     system_active = team_curr_active;
7311   }
7312   retval = __kmp_avail_proc - system_active + team_curr_active;
7313   if (retval > set_nproc) {
7314     retval = set_nproc;
7315   }
7316   if (retval < KMP_MIN_NTH) {
7317     retval = KMP_MIN_NTH;
7318   }
7319 
7320   KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7321   return retval;
7322 } // __kmp_load_balance_nproc()
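// Worked example (illustrative numbers only): with __kmp_avail_proc = 8,
// pool_active = 2, hot_team_active = 1 (so team_curr_active = 4) and a
// measured system_active of 10, the estimate above is 8 - 10 + 4 = 2 threads,
// which is then clamped to the range [KMP_MIN_NTH, set_nproc].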
7323 
7324 #endif /* USE_LOAD_BALANCE */
7325 
7326 /* ------------------------------------------------------------------------ */
7327 
7328 /* NOTE: this is called with the __kmp_init_lock held */
7329 void __kmp_cleanup(void) {
7330   int f;
7331 
7332   KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7333 
7334   if (TCR_4(__kmp_init_parallel)) {
7335 #if KMP_HANDLE_SIGNALS
7336     __kmp_remove_signals();
7337 #endif
7338     TCW_4(__kmp_init_parallel, FALSE);
7339   }
7340 
7341   if (TCR_4(__kmp_init_middle)) {
7342 #if KMP_AFFINITY_SUPPORTED
7343     __kmp_affinity_uninitialize();
7344 #endif /* KMP_AFFINITY_SUPPORTED */
7345     __kmp_cleanup_hierarchy();
7346     TCW_4(__kmp_init_middle, FALSE);
7347   }
7348 
7349   KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7350 
7351   if (__kmp_init_serial) {
7352     __kmp_runtime_destroy();
7353     __kmp_init_serial = FALSE;
7354   }
7355 
7356   for (f = 0; f < __kmp_threads_capacity; f++) {
7357     if (__kmp_root[f] != NULL) {
7358       __kmp_free(__kmp_root[f]);
7359       __kmp_root[f] = NULL;
7360     }
7361   }
7362   __kmp_free(__kmp_threads);
7363   // __kmp_threads and __kmp_root were allocated as a single block, so there
7364   // is no need to free __kmp_root separately.
7365   __kmp_threads = NULL;
7366   __kmp_root = NULL;
7367   __kmp_threads_capacity = 0;
7368 
7369 #if KMP_USE_DYNAMIC_LOCK
7370   __kmp_cleanup_indirect_user_locks();
7371 #else
7372   __kmp_cleanup_user_locks();
7373 #endif
7374 
7375 #if KMP_AFFINITY_SUPPORTED
7376   KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7377   __kmp_cpuinfo_file = NULL;
7378 #endif /* KMP_AFFINITY_SUPPORTED */
7379 
7380 #if KMP_USE_ADAPTIVE_LOCKS
7381 #if KMP_DEBUG_ADAPTIVE_LOCKS
7382   __kmp_print_speculative_stats();
7383 #endif
7384 #endif
7385   KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7386   __kmp_nested_nth.nth = NULL;
7387   __kmp_nested_nth.size = 0;
7388   __kmp_nested_nth.used = 0;
7389   KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7390   __kmp_nested_proc_bind.bind_types = NULL;
7391   __kmp_nested_proc_bind.size = 0;
7392   __kmp_nested_proc_bind.used = 0;
7393 
7394   __kmp_i18n_catclose();
7395 
7396 #if KMP_STATS_ENABLED
7397   __kmp_stats_fini();
7398 #endif
7399 
7400   KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7401 }
7402 
7403 /* ------------------------------------------------------------------------ */
7404 
7405 int __kmp_ignore_mppbeg(void) {
7406   char *env;
7407 
7408   if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7409     if (__kmp_str_match_false(env))
7410       return FALSE;
7411   }
7412   // By default __kmpc_begin() is a no-op.
7413   return TRUE;
7414 }
7415 
7416 int __kmp_ignore_mppend(void) {
7417   char *env;
7418 
7419   if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7420     if (__kmp_str_match_false(env))
7421       return FALSE;
7422   }
7423   // By default __kmpc_end() is a no-op.
7424   return TRUE;
7425 }
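// Illustrative note: by default both helpers above return TRUE, so
// __kmpc_begin()/__kmpc_end() are no-ops; setting KMP_IGNORE_MPPBEG or
// KMP_IGNORE_MPPEND to a "false" value in the environment makes the
// corresponding call take effect instead.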
7426 
7427 void __kmp_internal_begin(void) {
7428   int gtid;
7429   kmp_root_t *root;
7430 
7431   /* this is a very important step as it will register new sibling threads
7432      and assign these new uber threads a new gtid */
7433   gtid = __kmp_entry_gtid();
7434   root = __kmp_threads[gtid]->th.th_root;
7435   KMP_ASSERT(KMP_UBER_GTID(gtid));
7436 
7437   if (root->r.r_begin)
7438     return;
7439   __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7440   if (root->r.r_begin) {
7441     __kmp_release_lock(&root->r.r_begin_lock, gtid);
7442     return;
7443   }
7444 
7445   root->r.r_begin = TRUE;
7446 
7447   __kmp_release_lock(&root->r.r_begin_lock, gtid);
7448 }
7449 
7450 /* ------------------------------------------------------------------------ */
7451 
7452 void __kmp_user_set_library(enum library_type arg) {
7453   int gtid;
7454   kmp_root_t *root;
7455   kmp_info_t *thread;
7456 
7457   /* first, make sure we are initialized so we can get our gtid */
7458 
7459   gtid = __kmp_entry_gtid();
7460   thread = __kmp_threads[gtid];
7461 
7462   root = thread->th.th_root;
7463 
7464   KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7465                 library_serial));
7466   if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7467                                   thread */
7468     KMP_WARNING(SetLibraryIncorrectCall);
7469     return;
7470   }
7471 
7472   switch (arg) {
7473   case library_serial:
7474     thread->th.th_set_nproc = 0;
7475     set__nproc(thread, 1);
7476     break;
7477   case library_turnaround:
7478     thread->th.th_set_nproc = 0;
7479     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7480                                            : __kmp_dflt_team_nth_ub);
7481     break;
7482   case library_throughput:
7483     thread->th.th_set_nproc = 0;
7484     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7485                                            : __kmp_dflt_team_nth_ub);
7486     break;
7487   default:
7488     KMP_FATAL(UnknownLibraryType, arg);
7489   }
7490 
7491   __kmp_aux_set_library(arg);
7492 }
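// Illustrative sketch (not part of the runtime): this routine typically backs
// the KMP library-mode API entry points, e.g.
//
//   kmp_set_library_throughput();
//
// which select how aggressively idle worker threads yield the processor.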
7493 
7494 void __kmp_aux_set_stacksize(size_t arg) {
7495   if (!__kmp_init_serial)
7496     __kmp_serial_initialize();
7497 
7498 #if KMP_OS_DARWIN
7499   if (arg & (0x1000 - 1)) {
7500     arg &= ~(0x1000 - 1);
7501     if (arg + 0x1000) /* check for overflow if we round up */
7502       arg += 0x1000;
7503   }
7504 #endif
7505   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7506 
7507   /* only change the default stacksize before the first parallel region */
7508   if (!TCR_4(__kmp_init_parallel)) {
7509     size_t value = arg; /* argument is in bytes */
7510 
7511     if (value < __kmp_sys_min_stksize)
7512       value = __kmp_sys_min_stksize;
7513     else if (value > KMP_MAX_STKSIZE)
7514       value = KMP_MAX_STKSIZE;
7515 
7516     __kmp_stksize = value;
7517 
7518     __kmp_env_stksize = TRUE; /* treat as if KMP_STACKSIZE had been specified */
7519   }
7520 
7521   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7522 }
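// Illustrative sketch (not part of the runtime): user code typically reaches
// this routine through the KMP stack-size API, e.g.
//
//   kmp_set_stacksize_s(4 * 1024 * 1024);  // request 4 MB worker stacks
//
// Note that, per the check above, the request only takes effect if it is made
// before the first parallel region.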
7523 
7524 /* set the behaviour of the runtime library */
7525 /* TODO this can cause some odd behaviour with sibling parallelism... */
7526 void __kmp_aux_set_library(enum library_type arg) {
7527   __kmp_library = arg;
7528 
7529   switch (__kmp_library) {
7530   case library_serial: {
7531     KMP_INFORM(LibraryIsSerial);
7532     (void)__kmp_change_library(TRUE);
7533   } break;
7534   case library_turnaround:
7535     (void)__kmp_change_library(TRUE);
7536     break;
7537   case library_throughput:
7538     (void)__kmp_change_library(FALSE);
7539     break;
7540   default:
7541     KMP_FATAL(UnknownLibraryType, arg);
7542   }
7543 }
7544 
7545 /* ------------------------------------------------------------------------ */
7546 
7547 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
7548   int blocktime = arg; /* argument is in milliseconds */
7549 #if KMP_USE_MONITOR
7550   int bt_intervals;
7551 #endif
7552   int bt_set;
7553 
7554   __kmp_save_internal_controls(thread);
7555 
7556   /* Normalize and set blocktime for the teams */
7557   if (blocktime < KMP_MIN_BLOCKTIME)
7558     blocktime = KMP_MIN_BLOCKTIME;
7559   else if (blocktime > KMP_MAX_BLOCKTIME)
7560     blocktime = KMP_MAX_BLOCKTIME;
7561 
7562   set__blocktime_team(thread->th.th_team, tid, blocktime);
7563   set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
7564 
7565 #if KMP_USE_MONITOR
7566   /* Calculate and set blocktime intervals for the teams */
7567   bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
7568 
7569   set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
7570   set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
7571 #endif
7572 
7573   /* Record that the blocktime has been explicitly set (bt_set = TRUE) */
7574   bt_set = TRUE;
7575 
7576   set__bt_set_team(thread->th.th_team, tid, bt_set);
7577   set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
7578 #if KMP_USE_MONITOR
7579   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
7580                 "bt_intervals=%d, monitor_updates=%d\n",
7581                 __kmp_gtid_from_tid(tid, thread->th.th_team),
7582                 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
7583                 __kmp_monitor_wakeups));
7584 #else
7585   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
7586                 __kmp_gtid_from_tid(tid, thread->th.th_team),
7587                 thread->th.th_team->t.t_id, tid, blocktime));
7588 #endif
7589 }
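// Illustrative sketch (not part of the runtime): this routine typically backs
// the blocktime API, e.g.
//
//   kmp_set_blocktime(200);  // let workers spin ~200 ms before sleeping
//
// with out-of-range values clamped to [KMP_MIN_BLOCKTIME, KMP_MAX_BLOCKTIME]
// as shown above.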
7590 
7591 void __kmp_aux_set_defaults(char const *str, int len) {
7592   if (!__kmp_init_serial) {
7593     __kmp_serial_initialize();
7594   }
7595   __kmp_env_initialize(str);
7596 
7597   if (__kmp_settings
7598 #if OMP_40_ENABLED
7599       || __kmp_display_env || __kmp_display_env_verbose
7600 #endif // OMP_40_ENABLED
7601       ) {
7602     __kmp_env_print();
7603   }
7604 } // __kmp_aux_set_defaults
7605 
7606 /* ------------------------------------------------------------------------ */
7607 /* internal fast reduction routines */
7608 
7609 PACKED_REDUCTION_METHOD_T
7610 __kmp_determine_reduction_method(
7611     ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
7612     void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
7613     kmp_critical_name *lck) {
7614 
7615   // Default reduction method: the critical construct ( lck != NULL, as in the
7616   // current PAROPT ).
7617   // If ( reduce_data != NULL && reduce_func != NULL ): the tree-reduction
7618   // method can be selected by the RTL.
7619   // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
7620   // can be selected by the RTL.
7621   // Finally, it is up to the OpenMP RTL to decide which method to select
7622   // among those generated by PAROPT.
7623 
7624   PACKED_REDUCTION_METHOD_T retval;
7625 
7626   int team_size;
7627 
7628   KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
7629   KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
7630 
7631 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
7632   ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
7633 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
7634 
7635   retval = critical_reduce_block;
7636 
7637   // another way of getting the team size (with 1 dynamic dereference) is slower
7638   team_size = __kmp_get_team_num_threads(global_tid);
7639   if (team_size == 1) {
7640 
7641     retval = empty_reduce_block;
7642 
7643   } else {
7644 
7645     int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
7646     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
7647 
7648 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64
7649 
7650 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_WINDOWS ||       \
7651     KMP_OS_DARWIN
7652 
7653     int teamsize_cutoff = 4;
7654 
7655 #if KMP_MIC_SUPPORTED
7656     if (__kmp_mic_type != non_mic) {
7657       teamsize_cutoff = 8;
7658     }
7659 #endif
7660     if (tree_available) {
7661       if (team_size <= teamsize_cutoff) {
7662         if (atomic_available) {
7663           retval = atomic_reduce_block;
7664         }
7665       } else {
7666         retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
7667       }
7668     } else if (atomic_available) {
7669       retval = atomic_reduce_block;
7670     }
7671 #else
7672 #error "Unknown or unsupported OS"
7673 #endif // KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_WINDOWS ||
7674 // KMP_OS_DARWIN
7675 
7676 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
7677 
7678 #if KMP_OS_LINUX || KMP_OS_WINDOWS
7679 
7680     // basic tuning
7681 
7682     if (atomic_available) {
7683       if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
7684         retval = atomic_reduce_block;
7685       }
7686     } // otherwise: use critical section
7687 
7688 #elif KMP_OS_DARWIN
7689 
7690     if (atomic_available && (num_vars <= 3)) {
7691       retval = atomic_reduce_block;
7692     } else if (tree_available) {
7693       if ((reduce_size > (9 * sizeof(kmp_real64))) &&
7694           (reduce_size < (2000 * sizeof(kmp_real64)))) {
7695         retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
7696       }
7697     } // otherwise: use critical section
7698 
7699 #else
7700 #error "Unknown or unsupported OS"
7701 #endif
7702 
7703 #else
7704 #error "Unknown or unsupported architecture"
7705 #endif
7706   }
7707 
7708   // KMP_FORCE_REDUCTION
7709 
7710   // If the team is serialized (team_size == 1), ignore the forced reduction
7711   // method and stay with the unsynchronized method (empty_reduce_block)
7712   if (__kmp_force_reduction_method != reduction_method_not_defined &&
7713       team_size != 1) {
7714 
7715     PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
7716 
7717     int atomic_available, tree_available;
7718 
7719     switch ((forced_retval = __kmp_force_reduction_method)) {
7720     case critical_reduce_block:
7721       KMP_ASSERT(lck); // lck should be != 0
7722       break;
7723 
7724     case atomic_reduce_block:
7725       atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
7726       if (!atomic_available) {
7727         KMP_WARNING(RedMethodNotSupported, "atomic");
7728         forced_retval = critical_reduce_block;
7729       }
7730       break;
7731 
7732     case tree_reduce_block:
7733       tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
7734       if (!tree_available) {
7735         KMP_WARNING(RedMethodNotSupported, "tree");
7736         forced_retval = critical_reduce_block;
7737       } else {
7738 #if KMP_FAST_REDUCTION_BARRIER
7739         forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
7740 #endif
7741       }
7742       break;
7743 
7744     default:
7745       KMP_ASSERT(0); // "unsupported method specified"
7746     }
7747 
7748     retval = forced_retval;
7749   }
7750 
7751   KA_TRACE(10, ("reduction method selected=%08x\n", retval));
7752 
7753 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
7754 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
7755 
7756   return (retval);
7757 }
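// Worked example of the selection above (illustrative; x86_64 Linux, no forced
// method): with tree reduction available, a team of 16 threads exceeds the
// teamsize_cutoff of 4 (8 on MIC), so TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER
// is chosen; a team of 4 with an atomic reducer generated would get
// atomic_reduce_block instead, and a serialized team always gets
// empty_reduce_block.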
7758 
7759 // This function is for testing the set/get/determine reduce method machinery
7760 kmp_int32 __kmp_get_reduce_method(void) {
7761   return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
7762 }
7763