1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 //                     The LLVM Compiler Infrastructure
8 //
9 // This file is dual licensed under the MIT and the University of Illinois Open
10 // Source Licenses. See LICENSE.txt for details.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "kmp.h"
15 #include "kmp_affinity.h"
16 #include "kmp_atomic.h"
17 #include "kmp_environment.h"
18 #include "kmp_error.h"
19 #include "kmp_i18n.h"
20 #include "kmp_io.h"
21 #include "kmp_itt.h"
22 #include "kmp_settings.h"
23 #include "kmp_stats.h"
24 #include "kmp_str.h"
25 #include "kmp_wait_release.h"
26 #include "kmp_wrapper_getpid.h"
27 
28 #if OMPT_SUPPORT
29 #include "ompt-specific.h"
30 #endif
31 
32 /* these are temporary issues to be dealt with */
33 #define KMP_USE_PRCTL 0
34 
35 #if KMP_OS_WINDOWS
36 #include <process.h>
37 #endif
38 
39 #include "tsan_annotations.h"
40 
41 #if defined(KMP_GOMP_COMPAT)
42 char const __kmp_version_alt_comp[] =
43     KMP_VERSION_PREFIX "alternative compiler support: yes";
44 #endif /* defined(KMP_GOMP_COMPAT) */
45 
46 char const __kmp_version_omp_api[] = KMP_VERSION_PREFIX "API version: "
47 #if OMP_50_ENABLED
48                                                         "5.0 (201611)";
49 #elif OMP_45_ENABLED
50                                                         "4.5 (201511)";
51 #elif OMP_40_ENABLED
52                                                         "4.0 (201307)";
53 #else
54                                                         "3.1 (201107)";
55 #endif
56 
57 #ifdef KMP_DEBUG
58 char const __kmp_version_lock[] =
59     KMP_VERSION_PREFIX "lock type: run time selectable";
60 #endif /* KMP_DEBUG */
61 
62 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
63 
64 /* ------------------------------------------------------------------------ */
65 
66 kmp_info_t __kmp_monitor;
67 
68 /* Forward declarations */
69 
70 void __kmp_cleanup(void);
71 
72 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
73                                   int gtid);
74 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
75                                   kmp_internal_control_t *new_icvs,
76                                   ident_t *loc);
77 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
78 static void __kmp_partition_places(kmp_team_t *team,
79                                    int update_master_only = 0);
80 #endif
81 static void __kmp_do_serial_initialize(void);
82 void __kmp_fork_barrier(int gtid, int tid);
83 void __kmp_join_barrier(int gtid);
84 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
85                           kmp_internal_control_t *new_icvs, ident_t *loc);
86 
87 #ifdef USE_LOAD_BALANCE
88 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
89 #endif
90 
91 static int __kmp_expand_threads(int nNeed);
92 #if KMP_OS_WINDOWS
93 static int __kmp_unregister_root_other_thread(int gtid);
94 #endif
95 static void __kmp_unregister_library(void); // called by __kmp_internal_end()
96 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
97 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
98 
99 /* Calculate the identifier of the current thread */
/* A fast (and somewhat portable) way to get a unique identifier for the
   executing thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
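/* Three lookup strategies are used, selected by __kmp_gtid_mode:
   >= 3: read the gtid from the __kmp_gtid thread-local (TDATA) variable;
   >= 2: read it from OS keyed TLS via __kmp_gtid_get_specific();
   otherwise: scan __kmp_threads[] for the thread whose registered stack
   range contains the address of a local variable of this function. */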
102 int __kmp_get_global_thread_id() {
103   int i;
104   kmp_info_t **other_threads;
105   size_t stack_data;
106   char *stack_addr;
107   size_t stack_size;
108   char *stack_base;
109 
110   KA_TRACE(
111       1000,
112       ("*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
113        __kmp_nth, __kmp_all_nth));
114 
  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior
     to a parallel region, we made this return KMP_GTID_DNE to force
     serial_initialize by the caller. We had to handle KMP_GTID_DNE at all
     call-sites, or else guarantee __kmp_init_gtid for this to work. */
119 
120   if (!TCR_4(__kmp_init_gtid))
121     return KMP_GTID_DNE;
122 
123 #ifdef KMP_TDATA_GTID
124   if (TCR_4(__kmp_gtid_mode) >= 3) {
125     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
126     return __kmp_gtid;
127   }
128 #endif
129   if (TCR_4(__kmp_gtid_mode) >= 2) {
130     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
131     return __kmp_gtid_get_specific();
132   }
133   KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
134 
135   stack_addr = (char *)&stack_data;
136   other_threads = __kmp_threads;
137 
138   /* ATT: The code below is a source of potential bugs due to unsynchronized
139      access to __kmp_threads array. For example:
140      1. Current thread loads other_threads[i] to thr and checks it, it is
141         non-NULL.
142      2. Current thread is suspended by OS.
143      3. Another thread unregisters and finishes (debug versions of free()
144         may fill memory with something like 0xEF).
145      4. Current thread is resumed.
146      5. Current thread reads junk from *thr.
147      TODO: Fix it.  --ln  */
148 
149   for (i = 0; i < __kmp_threads_capacity; i++) {
150 
151     kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
152     if (!thr)
153       continue;
154 
155     stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
156     stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
157 
158     /* stack grows down -- search through all of the active threads */
159 
160     if (stack_addr <= stack_base) {
161       size_t stack_diff = stack_base - stack_addr;
162 
163       if (stack_diff <= stack_size) {
164         /* The only way we can be closer than the allocated */
165         /* stack size is if we are running on this thread. */
166         KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
167         return i;
168       }
169     }
170   }
171 
  /* fall back to the keyed-TLS value to try to determine our gtid */
173   KA_TRACE(1000,
174            ("*** __kmp_get_global_thread_id: internal alg. failed to find "
175             "thread, using TLS\n"));
176   i = __kmp_gtid_get_specific();
177 
178   /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */
179 
  /* if we haven't been assigned a gtid, return the error code */
181   if (i < 0)
182     return i;
183 
184   /* dynamically updated stack window for uber threads to avoid get_specific
185      call */
186   if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
187     KMP_FATAL(StackOverflow, i);
188   }
189 
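  /* Refine the recorded stack window so that it contains stack_addr: if
     stack_addr lies above the recorded base, move the base up to it and grow
     the size by the same amount; otherwise grow the size so the window
     reaches down to stack_addr. */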
190   stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
191   if (stack_addr > stack_base) {
192     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
193     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
194             other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
195                 stack_base);
196   } else {
197     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
198             stack_base - stack_addr);
199   }
200 
201   /* Reprint stack bounds for ubermaster since they have been refined */
202   if (__kmp_storage_map) {
203     char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
204     char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
205     __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
206                                  other_threads[i]->th.th_info.ds.ds_stacksize,
207                                  "th_%d stack (refinement)", i);
208   }
209   return i;
210 }
211 
212 int __kmp_get_global_thread_id_reg() {
213   int gtid;
214 
215   if (!__kmp_init_serial) {
216     gtid = KMP_GTID_DNE;
217   } else
218 #ifdef KMP_TDATA_GTID
219       if (TCR_4(__kmp_gtid_mode) >= 3) {
220     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
221     gtid = __kmp_gtid;
222   } else
223 #endif
224       if (TCR_4(__kmp_gtid_mode) >= 2) {
225     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
226     gtid = __kmp_gtid_get_specific();
227   } else {
228     KA_TRACE(1000,
229              ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
230     gtid = __kmp_get_global_thread_id();
231   }
232 
233   /* we must be a new uber master sibling thread */
234   if (gtid == KMP_GTID_DNE) {
235     KA_TRACE(10,
236              ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
237               "Registering a new gtid.\n"));
238     __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
239     if (!__kmp_init_serial) {
240       __kmp_do_serial_initialize();
241       gtid = __kmp_gtid_get_specific();
242     } else {
243       gtid = __kmp_register_root(FALSE);
244     }
245     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
246     /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
247   }
248 
249   KMP_DEBUG_ASSERT(gtid >= 0);
250 
251   return gtid;
252 }
253 
254 /* caller must hold forkjoin_lock */
255 void __kmp_check_stack_overlap(kmp_info_t *th) {
256   int f;
257   char *stack_beg = NULL;
258   char *stack_end = NULL;
259   int gtid;
260 
261   KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
262   if (__kmp_storage_map) {
263     stack_end = (char *)th->th.th_info.ds.ds_stackbase;
264     stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
265 
266     gtid = __kmp_gtid_from_thread(th);
267 
268     if (gtid == KMP_GTID_MONITOR) {
269       __kmp_print_storage_map_gtid(
270           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
271           "th_%s stack (%s)", "mon",
272           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
273     } else {
274       __kmp_print_storage_map_gtid(
275           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
276           "th_%d stack (%s)", gtid,
277           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
278     }
279   }
280 
281   /* No point in checking ubermaster threads since they use refinement and
282    * cannot overlap */
283   gtid = __kmp_gtid_from_thread(th);
284   if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
285     KA_TRACE(10,
286              ("__kmp_check_stack_overlap: performing extensive checking\n"));
287     if (stack_beg == NULL) {
288       stack_end = (char *)th->th.th_info.ds.ds_stackbase;
289       stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
290     }
291 
292     for (f = 0; f < __kmp_threads_capacity; f++) {
293       kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
294 
295       if (f_th && f_th != th) {
296         char *other_stack_end =
297             (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
298         char *other_stack_beg =
299             other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
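        /* The stacks overlap if either end of this thread's stack falls
           strictly inside the other thread's [beg, end) range, which is what
           the test below checks. */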
300         if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
301             (stack_end > other_stack_beg && stack_end < other_stack_end)) {
302 
303           /* Print the other stack values before the abort */
304           if (__kmp_storage_map)
305             __kmp_print_storage_map_gtid(
306                 -1, other_stack_beg, other_stack_end,
307                 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
308                 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
309 
310           __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
311                       __kmp_msg_null);
312         }
313       }
314     }
315   }
316   KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
317 }
318 
319 /* ------------------------------------------------------------------------ */
320 
321 void __kmp_infinite_loop(void) {
322   static int done = FALSE;
323 
324   while (!done) {
325     KMP_YIELD(1);
326   }
327 }
328 
329 #define MAX_MESSAGE 512
330 
331 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
332                                   char const *format, ...) {
333   char buffer[MAX_MESSAGE];
334   va_list ap;
335 
336   va_start(ap, format);
337   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
338                p2, (unsigned long)size, format);
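  /* The caller's format string is embedded into the fixed prefix via %s, so
     the composite buffer is then expanded against the caller's variadic
     arguments in the single __kmp_vprintf() call below. */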
339   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
340   __kmp_vprintf(kmp_err, buffer, ap);
341 #if KMP_PRINT_DATA_PLACEMENT
342   int node;
343   if (gtid >= 0) {
344     if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
345       if (__kmp_storage_map_verbose) {
346         node = __kmp_get_host_node(p1);
347         if (node < 0) /* doesn't work, so don't try this next time */
348           __kmp_storage_map_verbose = FALSE;
349         else {
350           char *last;
351           int lastNode;
352           int localProc = __kmp_get_cpu_from_gtid(gtid);
353 
354           const int page_size = KMP_GET_PAGE_SIZE();
355 
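          // Round p1 down to the start of its page and p2 down to the start
          // of the last page in the range, so the node lookups below operate
          // on whole pages.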
356           p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
357           p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
358           if (localProc >= 0)
359             __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
360                                  localProc >> 1);
361           else
362             __kmp_printf_no_lock("  GTID %d\n", gtid);
363 #if KMP_USE_PRCTL
364           /* The more elaborate format is disabled for now because of the prctl
365            * hanging bug. */
366           do {
367             last = p1;
368             lastNode = node;
369             /* This loop collates adjacent pages with the same host node. */
370             do {
              p1 = (char *)p1 + page_size;
372             } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
373             __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
374                                  lastNode);
375           } while (p1 <= p2);
376 #else
377           __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
378                                (char *)p1 + (page_size - 1),
379                                __kmp_get_host_node(p1));
380           if (p1 < p2) {
381             __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
382                                  (char *)p2 + (page_size - 1),
383                                  __kmp_get_host_node(p2));
384           }
385 #endif
386         }
387       }
388     } else
389       __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
390   }
391 #endif /* KMP_PRINT_DATA_PLACEMENT */
392   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
393 }
394 
395 void __kmp_warn(char const *format, ...) {
396   char buffer[MAX_MESSAGE];
397   va_list ap;
398 
399   if (__kmp_generate_warnings == kmp_warnings_off) {
400     return;
401   }
402 
403   va_start(ap, format);
404 
405   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
406   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
407   __kmp_vprintf(kmp_err, buffer, ap);
408   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
409 
410   va_end(ap);
411 }
412 
413 void __kmp_abort_process() {
414   // Later threads may stall here, but that's ok because abort() will kill them.
415   __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
416 
417   if (__kmp_debug_buf) {
418     __kmp_dump_debug_buffer();
419   }
420 
421   if (KMP_OS_WINDOWS) {
422     // Let other threads know of abnormal termination and prevent deadlock
423     // if abort happened during library initialization or shutdown
424     __kmp_global.g.g_abort = SIGABRT;
425 
426     /* On Windows* OS by default abort() causes pop-up error box, which stalls
427        nightly testing. Unfortunately, we cannot reliably suppress pop-up error
428        boxes. _set_abort_behavior() works well, but this function is not
       available in VS7 (this is not a problem for the DLL, but it is a problem
       for the static OpenMP RTL). SetErrorMode (and hence the timelimit
       utility) does not help, at least in some versions of the MS C RTL.
432 
       It seems the following sequence is the only way to simulate abort() and
434        avoid pop-up error box. */
435     raise(SIGABRT);
436     _exit(3); // Just in case, if signal ignored, exit anyway.
437   } else {
438     abort();
439   }
440 
441   __kmp_infinite_loop();
442   __kmp_release_bootstrap_lock(&__kmp_exit_lock);
443 
444 } // __kmp_abort_process
445 
446 void __kmp_abort_thread(void) {
447   // TODO: Eliminate g_abort global variable and this function.
448   // In case of abort just call abort(), it will kill all the threads.
449   __kmp_infinite_loop();
450 } // __kmp_abort_thread
451 
452 /* Print out the storage map for the major kmp_info_t thread data structures
453    that are allocated together. */
454 
455 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
456   __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
457                                gtid);
458 
459   __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
460                                sizeof(kmp_desc_t), "th_%d.th_info", gtid);
461 
462   __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
463                                sizeof(kmp_local_t), "th_%d.th_local", gtid);
464 
465   __kmp_print_storage_map_gtid(
466       gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
467       sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
468 
469   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
470                                &thr->th.th_bar[bs_plain_barrier + 1],
471                                sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
472                                gtid);
473 
474   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
475                                &thr->th.th_bar[bs_forkjoin_barrier + 1],
476                                sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
477                                gtid);
478 
479 #if KMP_FAST_REDUCTION_BARRIER
480   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
481                                &thr->th.th_bar[bs_reduction_barrier + 1],
482                                sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
483                                gtid);
484 #endif // KMP_FAST_REDUCTION_BARRIER
485 }
486 
487 /* Print out the storage map for the major kmp_team_t team data structures
488    that are allocated together. */
489 
490 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
491                                          int team_id, int num_thr) {
492   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
493   __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
494                                header, team_id);
495 
496   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
497                                &team->t.t_bar[bs_last_barrier],
498                                sizeof(kmp_balign_team_t) * bs_last_barrier,
499                                "%s_%d.t_bar", header, team_id);
500 
501   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
502                                &team->t.t_bar[bs_plain_barrier + 1],
503                                sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
504                                header, team_id);
505 
506   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
507                                &team->t.t_bar[bs_forkjoin_barrier + 1],
508                                sizeof(kmp_balign_team_t),
509                                "%s_%d.t_bar[forkjoin]", header, team_id);
510 
511 #if KMP_FAST_REDUCTION_BARRIER
512   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
513                                &team->t.t_bar[bs_reduction_barrier + 1],
514                                sizeof(kmp_balign_team_t),
515                                "%s_%d.t_bar[reduction]", header, team_id);
516 #endif // KMP_FAST_REDUCTION_BARRIER
517 
518   __kmp_print_storage_map_gtid(
519       -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
520       sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
521 
522   __kmp_print_storage_map_gtid(
523       -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
524       sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
525 
526   __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
527                                &team->t.t_disp_buffer[num_disp_buff],
528                                sizeof(dispatch_shared_info_t) * num_disp_buff,
529                                "%s_%d.t_disp_buffer", header, team_id);
530 
531   __kmp_print_storage_map_gtid(-1, &team->t.t_taskq, &team->t.t_copypriv_data,
532                                sizeof(kmp_taskq_t), "%s_%d.t_taskq", header,
533                                team_id);
534 }
535 
536 static void __kmp_init_allocator() {}
537 static void __kmp_fini_allocator() {}
538 
539 /* ------------------------------------------------------------------------ */
540 
541 #ifdef KMP_DYNAMIC_LIB
542 #if KMP_OS_WINDOWS
543 
544 static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
545   // TODO: Change to __kmp_break_bootstrap_lock().
  __kmp_init_bootstrap_lock(lck); // re-initialize the lock so it is released
547 }
548 
549 static void __kmp_reset_locks_on_process_detach(int gtid_req) {
550   int i;
551   int thread_count;
552 
  // PROCESS_DETACH is expected to be called by a thread that executes
  // ProcessExit() or FreeLibrary(). The OS terminates other threads (except the
  // one calling ProcessExit or FreeLibrary), so it might be safe to access
  // __kmp_threads[] without taking the forkjoin_lock. However, in practice some
  // threads can still be alive here, although they are about to be terminated.
  // The threads in the array with ds_thread==0 are the most suspicious, so it
  // may not actually be safe to access __kmp_threads[].
560 
561   // TODO: does it make sense to check __kmp_roots[] ?
562 
563   // Let's check that there are no other alive threads registered with the OMP
564   // lib.
565   while (1) {
566     thread_count = 0;
567     for (i = 0; i < __kmp_threads_capacity; ++i) {
568       if (!__kmp_threads)
569         continue;
570       kmp_info_t *th = __kmp_threads[i];
571       if (th == NULL)
572         continue;
573       int gtid = th->th.th_info.ds.ds_gtid;
574       if (gtid == gtid_req)
575         continue;
576       if (gtid < 0)
577         continue;
578       DWORD exit_val;
579       int alive = __kmp_is_thread_alive(th, &exit_val);
580       if (alive) {
581         ++thread_count;
582       }
583     }
584     if (thread_count == 0)
585       break; // success
586   }
587 
588   // Assume that I'm alone. Now it might be safe to check and reset locks.
589   // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
590   __kmp_reset_lock(&__kmp_forkjoin_lock);
591 #ifdef KMP_DEBUG
592   __kmp_reset_lock(&__kmp_stdio_lock);
593 #endif // KMP_DEBUG
594 }
595 
596 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
597   //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
598 
599   switch (fdwReason) {
600 
601   case DLL_PROCESS_ATTACH:
602     KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
603 
604     return TRUE;
605 
606   case DLL_PROCESS_DETACH:
607     KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
608 
609     if (lpReserved != NULL) {
610       // lpReserved is used for telling the difference:
611       //   lpReserved == NULL when FreeLibrary() was called,
612       //   lpReserved != NULL when the process terminates.
613       // When FreeLibrary() is called, worker threads remain alive. So they will
614       // release the forkjoin lock by themselves. When the process terminates,
615       // worker threads disappear triggering the problem of unreleased forkjoin
616       // lock as described below.
617 
      // A worker thread can take the forkjoin lock. The problem comes up if
      // that worker thread dies before it releases the forkjoin lock. The
      // forkjoin lock then remains taken, while the thread executing
      // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below tries
      // to take the forkjoin lock and always fails, so the application never
      // finishes [normally]. This scenario is possible if __kmpc_end() has not
      // been executed. This is not just a corner case; it happens in common
      // situations:
626       // - the main function was compiled by an alternative compiler;
627       // - the main function was compiled by icl but without /Qopenmp
628       //   (application with plugins);
629       // - application terminates by calling C exit(), Fortran CALL EXIT() or
630       //   Fortran STOP.
631       // - alive foreign thread prevented __kmpc_end from doing cleanup.
632       //
633       // This is a hack to work around the problem.
634       // TODO: !!! figure out something better.
635       __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
636     }
637 
638     __kmp_internal_end_library(__kmp_gtid_get_specific());
639 
640     return TRUE;
641 
642   case DLL_THREAD_ATTACH:
643     KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
644 
645     /* if we want to register new siblings all the time here call
646      * __kmp_get_gtid(); */
647     return TRUE;
648 
649   case DLL_THREAD_DETACH:
650     KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
651 
652     __kmp_internal_end_thread(__kmp_gtid_get_specific());
653     return TRUE;
654   }
655 
656   return TRUE;
657 }
658 
659 #endif /* KMP_OS_WINDOWS */
660 #endif /* KMP_DYNAMIC_LIB */
661 
662 /* Change the library type to "status" and return the old type */
663 /* called from within initialization routines where __kmp_initz_lock is held */
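/* The mode is encoded in the low bit of __kmp_yield_init: an even initial
   yield count means KMP_LIBRARY=throughput, an odd count means turnaround.
   Only that bit is changed here. */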
664 int __kmp_change_library(int status) {
665   int old_status;
666 
667   old_status = __kmp_yield_init &
668                1; // check whether KMP_LIBRARY=throughput (even init count)
669 
670   if (status) {
671     __kmp_yield_init |= 1; // throughput => turnaround (odd init count)
672   } else {
673     __kmp_yield_init &= ~1; // turnaround => throughput (even init count)
674   }
675 
676   return old_status; // return previous setting of whether
677   // KMP_LIBRARY=throughput
678 }
679 
680 /* __kmp_parallel_deo -- Wait until it's our turn. */
681 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
682   int gtid = *gtid_ref;
683 #ifdef BUILD_PARALLEL_ORDERED
684   kmp_team_t *team = __kmp_team_from_gtid(gtid);
685 #endif /* BUILD_PARALLEL_ORDERED */
686 
687   if (__kmp_env_consistency_check) {
688     if (__kmp_threads[gtid]->th.th_root->r.r_active)
689 #if KMP_USE_DYNAMIC_LOCK
690       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
691 #else
692       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
693 #endif
694   }
695 #ifdef BUILD_PARALLEL_ORDERED
696   if (!team->t.t_serialized) {
697     KMP_MB();
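    /* t_ordered.dt.t_value holds the tid whose turn it is; wait (yielding)
       until it equals this thread's tid. __kmp_parallel_dxo() below advances
       it to (tid + 1) % nproc when the current owner is done. */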
698     KMP_WAIT_YIELD(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid),
699                    KMP_EQ, NULL);
700     KMP_MB();
701   }
702 #endif /* BUILD_PARALLEL_ORDERED */
703 }
704 
705 /* __kmp_parallel_dxo -- Signal the next task. */
706 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
707   int gtid = *gtid_ref;
708 #ifdef BUILD_PARALLEL_ORDERED
709   int tid = __kmp_tid_from_gtid(gtid);
710   kmp_team_t *team = __kmp_team_from_gtid(gtid);
711 #endif /* BUILD_PARALLEL_ORDERED */
712 
713   if (__kmp_env_consistency_check) {
714     if (__kmp_threads[gtid]->th.th_root->r.r_active)
715       __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
716   }
717 #ifdef BUILD_PARALLEL_ORDERED
718   if (!team->t.t_serialized) {
719     KMP_MB(); /* Flush all pending memory write invalidates.  */
720 
721     /* use the tid of the next thread in this team */
722     /* TODO replace with general release procedure */
723     team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
724 
725     KMP_MB(); /* Flush all pending memory write invalidates.  */
726   }
727 #endif /* BUILD_PARALLEL_ORDERED */
728 }
729 
730 /* ------------------------------------------------------------------------ */
731 /* The BARRIER for a SINGLE process section is always explicit   */
732 
733 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
734   int status;
735   kmp_info_t *th;
736   kmp_team_t *team;
737 
738   if (!TCR_4(__kmp_init_parallel))
739     __kmp_parallel_initialize();
740 
741   th = __kmp_threads[gtid];
742   team = th->th.th_team;
743   status = 0;
744 
745   th->th.th_ident = id_ref;
746 
747   if (team->t.t_serialized) {
748     status = 1;
749   } else {
750     kmp_int32 old_this = th->th.th_local.this_construct;
751 
752     ++th->th.th_local.this_construct;
753     /* try to set team count to thread count--success means thread got the
754        single block */
755     /* TODO: Should this be acquire or release? */
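    /* Each thread bumps its private this_construct counter once per construct;
       the team-wide t_construct tracks the winners. The first thread whose
       compare-and-swap from old_this succeeds executes the single block, and
       the rest see the advanced t_construct and skip it. */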
756     if (team->t.t_construct == old_this) {
757       status = KMP_COMPARE_AND_STORE_ACQ32(&team->t.t_construct, old_this,
758                                            th->th.th_local.this_construct);
759     }
760 #if USE_ITT_BUILD
761     if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
762         KMP_MASTER_GTID(gtid) &&
763 #if OMP_40_ENABLED
764         th->th.th_teams_microtask == NULL &&
765 #endif
766         team->t.t_active_level ==
767             1) { // Only report metadata by master of active team at level 1
768       __kmp_itt_metadata_single(id_ref);
769     }
770 #endif /* USE_ITT_BUILD */
771   }
772 
773   if (__kmp_env_consistency_check) {
774     if (status && push_ws) {
775       __kmp_push_workshare(gtid, ct_psingle, id_ref);
776     } else {
777       __kmp_check_workshare(gtid, ct_psingle, id_ref);
778     }
779   }
780 #if USE_ITT_BUILD
781   if (status) {
782     __kmp_itt_single_start(gtid);
783   }
784 #endif /* USE_ITT_BUILD */
785   return status;
786 }
787 
788 void __kmp_exit_single(int gtid) {
789 #if USE_ITT_BUILD
790   __kmp_itt_single_end(gtid);
791 #endif /* USE_ITT_BUILD */
792   if (__kmp_env_consistency_check)
793     __kmp_pop_workshare(gtid, ct_psingle, NULL);
794 }
795 
/* Determine whether we can go parallel or must use a serialized parallel
 * region, and how many threads we can use.
 * set_nthreads is the number of threads requested for the team.
 * Returns 1 if we should serialize or use only one thread,
 * otherwise the number of threads to use.
 * The forkjoin lock is held by the caller. */
802 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
803                                  int master_tid, int set_nthreads
804 #if OMP_40_ENABLED
805                                  ,
806                                  int enter_teams
807 #endif /* OMP_40_ENABLED */
808                                  ) {
809   int capacity;
810   int new_nthreads;
811   KMP_DEBUG_ASSERT(__kmp_init_serial);
812   KMP_DEBUG_ASSERT(root && parent_team);
813 
814   // If dyn-var is set, dynamically adjust the number of desired threads,
815   // according to the method specified by dynamic_mode.
816   new_nthreads = set_nthreads;
817   if (!get__dynamic_2(parent_team, master_tid)) {
818     ;
819   }
820 #ifdef USE_LOAD_BALANCE
821   else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
822     new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
823     if (new_nthreads == 1) {
824       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
825                     "reservation to 1 thread\n",
826                     master_tid));
827       return 1;
828     }
829     if (new_nthreads < set_nthreads) {
830       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
831                     "reservation to %d threads\n",
832                     master_tid, new_nthreads));
833     }
834   }
835 #endif /* USE_LOAD_BALANCE */
836   else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
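    // Roughly: processors not yet occupied (__kmp_avail_proc - __kmp_nth),
    // plus the threads of this root that will be re-used for the new team
    // (just the master when the root is active, the whole hot team otherwise).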
837     new_nthreads = __kmp_avail_proc - __kmp_nth +
838                    (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
839     if (new_nthreads <= 1) {
840       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
841                     "reservation to 1 thread\n",
842                     master_tid));
843       return 1;
844     }
845     if (new_nthreads < set_nthreads) {
846       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
847                     "reservation to %d threads\n",
848                     master_tid, new_nthreads));
849     } else {
850       new_nthreads = set_nthreads;
851     }
852   } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
853     if (set_nthreads > 2) {
854       new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
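      // Map the random value into the range [1, set_nthreads].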
855       new_nthreads = (new_nthreads % set_nthreads) + 1;
856       if (new_nthreads == 1) {
857         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
858                       "reservation to 1 thread\n",
859                       master_tid));
860         return 1;
861       }
862       if (new_nthreads < set_nthreads) {
863         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
864                       "reservation to %d threads\n",
865                       master_tid, new_nthreads));
866       }
867     }
868   } else {
869     KMP_ASSERT(0);
870   }
871 
872   // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
873   if (__kmp_nth + new_nthreads -
874           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
875       __kmp_max_nth) {
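    // tl_nthreads is how many threads we may still use under the device-wide
    // limit: __kmp_max_nth minus the threads already registered, again
    // crediting back this root's threads that will be re-used.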
876     int tl_nthreads = __kmp_max_nth - __kmp_nth +
877                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
878     if (tl_nthreads <= 0) {
879       tl_nthreads = 1;
880     }
881 
882     // If dyn-var is false, emit a 1-time warning.
883     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
884       __kmp_reserve_warn = 1;
885       __kmp_msg(kmp_ms_warning,
886                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
887                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
888     }
889     if (tl_nthreads == 1) {
890       KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
891                     "reduced reservation to 1 thread\n",
892                     master_tid));
893       return 1;
894     }
895     KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
896                   "reservation to %d threads\n",
897                   master_tid, tl_nthreads));
898     new_nthreads = tl_nthreads;
899   }
900 
901   // Respect OMP_THREAD_LIMIT
902   if (root->r.r_cg_nthreads + new_nthreads -
903           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
904       __kmp_cg_max_nth) {
905     int tl_nthreads = __kmp_cg_max_nth - root->r.r_cg_nthreads +
906                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
907     if (tl_nthreads <= 0) {
908       tl_nthreads = 1;
909     }
910 
911     // If dyn-var is false, emit a 1-time warning.
912     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
913       __kmp_reserve_warn = 1;
914       __kmp_msg(kmp_ms_warning,
915                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
916                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
917     }
918     if (tl_nthreads == 1) {
919       KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
920                     "reduced reservation to 1 thread\n",
921                     master_tid));
922       return 1;
923     }
924     KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
925                   "reservation to %d threads\n",
926                   master_tid, tl_nthreads));
927     new_nthreads = tl_nthreads;
928   }
929 
930   // Check if the threads array is large enough, or needs expanding.
931   // See comment in __kmp_register_root() about the adjustment if
932   // __kmp_threads[0] == NULL.
933   capacity = __kmp_threads_capacity;
934   if (TCR_PTR(__kmp_threads[0]) == NULL) {
935     --capacity;
936   }
937   if (__kmp_nth + new_nthreads -
938           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
939       capacity) {
940     // Expand the threads array.
941     int slotsRequired = __kmp_nth + new_nthreads -
942                         (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
943                         capacity;
944     int slotsAdded = __kmp_expand_threads(slotsRequired);
945     if (slotsAdded < slotsRequired) {
946       // The threads array was not expanded enough.
947       new_nthreads -= (slotsRequired - slotsAdded);
948       KMP_ASSERT(new_nthreads >= 1);
949 
950       // If dyn-var is false, emit a 1-time warning.
951       if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
952         __kmp_reserve_warn = 1;
953         if (__kmp_tp_cached) {
954           __kmp_msg(kmp_ms_warning,
955                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
956                     KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
957                     KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
958         } else {
959           __kmp_msg(kmp_ms_warning,
960                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
961                     KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
962         }
963       }
964     }
965   }
966 
967 #ifdef KMP_DEBUG
968   if (new_nthreads == 1) {
969     KC_TRACE(10,
970              ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
971               "dead roots and rechecking; requested %d threads\n",
972               __kmp_get_gtid(), set_nthreads));
973   } else {
974     KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
975                   " %d threads\n",
976                   __kmp_get_gtid(), new_nthreads, set_nthreads));
977   }
978 #endif // KMP_DEBUG
979   return new_nthreads;
980 }
981 
/* Allocate threads from the thread pool and assign them to the new team. We are
   assured that there are enough threads available, because we checked this
   earlier while holding the forkjoin lock. */
985 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
986                                     kmp_info_t *master_th, int master_gtid) {
987   int i;
988   int use_hot_team;
989 
990   KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
991   KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
992   KMP_MB();
993 
994   /* first, let's setup the master thread */
995   master_th->th.th_info.ds.ds_tid = 0;
996   master_th->th.th_team = team;
997   master_th->th.th_team_nproc = team->t.t_nproc;
998   master_th->th.th_team_master = master_th;
999   master_th->th.th_team_serialized = FALSE;
1000   master_th->th.th_dispatch = &team->t.t_dispatch[0];
1001 
1002 /* make sure we are not the optimized hot team */
1003 #if KMP_NESTED_HOT_TEAMS
1004   use_hot_team = 0;
1005   kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
1006   if (hot_teams) { // hot teams array is not allocated if
1007     // KMP_HOT_TEAMS_MAX_LEVEL=0
1008     int level = team->t.t_active_level - 1; // index in array of hot teams
1009     if (master_th->th.th_teams_microtask) { // are we inside the teams?
1010       if (master_th->th.th_teams_size.nteams > 1) {
1011         ++level; // level was not increased in teams construct for
1012         // team_of_masters
1013       }
1014       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1015           master_th->th.th_teams_level == team->t.t_level) {
1016         ++level; // level was not increased in teams construct for
1017         // team_of_workers before the parallel
1018       } // team->t.t_level will be increased inside parallel
1019     }
1020     if (level < __kmp_hot_teams_max_level) {
1021       if (hot_teams[level].hot_team) {
1022         // hot team has already been allocated for given level
1023         KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
1024         use_hot_team = 1; // the team is ready to use
1025       } else {
1026         use_hot_team = 0; // AC: threads are not allocated yet
1027         hot_teams[level].hot_team = team; // remember new hot team
1028         hot_teams[level].hot_team_nth = team->t.t_nproc;
1029       }
1030     } else {
1031       use_hot_team = 0;
1032     }
1033   }
1034 #else
1035   use_hot_team = team == root->r.r_hot_team;
1036 #endif
1037   if (!use_hot_team) {
1038 
1039     /* install the master thread */
1040     team->t.t_threads[0] = master_th;
1041     __kmp_initialize_info(master_th, team, 0, master_gtid);
1042 
1043     /* now, install the worker threads */
1044     for (i = 1; i < team->t.t_nproc; i++) {
1045 
1046       /* fork or reallocate a new thread and install it in team */
1047       kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1048       team->t.t_threads[i] = thr;
1049       KMP_DEBUG_ASSERT(thr);
1050       KMP_DEBUG_ASSERT(thr->th.th_team == team);
1051       /* align team and thread arrived states */
1052       KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1053                     "T#%d(%d:%d) join =%llu, plain=%llu\n",
1054                     __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1055                     __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1056                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1057                     team->t.t_bar[bs_plain_barrier].b_arrived));
1058 #if OMP_40_ENABLED
1059       thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1060       thr->th.th_teams_level = master_th->th.th_teams_level;
1061       thr->th.th_teams_size = master_th->th.th_teams_size;
1062 #endif
1063       { // Initialize threads' barrier data.
1064         int b;
1065         kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1066         for (b = 0; b < bs_last_barrier; ++b) {
1067           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1068           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1069 #if USE_DEBUGGER
1070           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1071 #endif
1072         }
1073       }
1074     }
1075 
1076 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1077     __kmp_partition_places(team);
1078 #endif
1079   }
1080 
1081   KMP_MB();
1082 }
1083 
1084 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1085 // Propagate any changes to the floating point control registers out to the team
1086 // We try to avoid unnecessary writes to the relevant cache line in the team
1087 // structure, so we don't make changes unless they are needed.
1088 inline static void propagateFPControl(kmp_team_t *team) {
1089   if (__kmp_inherit_fp_control) {
1090     kmp_int16 x87_fpu_control_word;
1091     kmp_uint32 mxcsr;
1092 
1093     // Get master values of FPU control flags (both X87 and vector)
1094     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1095     __kmp_store_mxcsr(&mxcsr);
1096     mxcsr &= KMP_X86_MXCSR_MASK;
1097 
1098     // There is no point looking at t_fp_control_saved here.
1099     // If it is TRUE, we still have to update the values if they are different
1100     // from those we now have. If it is FALSE we didn't save anything yet, but
1101     // our objective is the same. We have to ensure that the values in the team
1102     // are the same as those we have.
1103     // So, this code achieves what we need whether or not t_fp_control_saved is
1104     // true. By checking whether the value needs updating we avoid unnecessary
1105     // writes that would put the cache-line into a written state, causing all
1106     // threads in the team to have to read it again.
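    // In effect, KMP_CHECK_UPDATE(dst, src) only performs the store when the
    // two values differ, so the team cache line is written only on a real
    // change.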
1107     KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1108     KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1109     // Although we don't use this value, other code in the runtime wants to know
1110     // whether it should restore them. So we must ensure it is correct.
1111     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1112   } else {
1113     // Similarly here. Don't write to this cache-line in the team structure
1114     // unless we have to.
1115     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1116   }
1117 }
1118 
1119 // Do the opposite, setting the hardware registers to the updated values from
1120 // the team.
1121 inline static void updateHWFPControl(kmp_team_t *team) {
1122   if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
    // Only reset the FP control regs if they have been changed in the team
    // structure by the parallel region that we are exiting.
1125     kmp_int16 x87_fpu_control_word;
1126     kmp_uint32 mxcsr;
1127     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1128     __kmp_store_mxcsr(&mxcsr);
1129     mxcsr &= KMP_X86_MXCSR_MASK;
1130 
1131     if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1132       __kmp_clear_x87_fpu_status_word();
1133       __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1134     }
1135 
1136     if (team->t.t_mxcsr != mxcsr) {
1137       __kmp_load_mxcsr(&team->t.t_mxcsr);
1138     }
1139   }
1140 }
1141 #else
1142 #define propagateFPControl(x) ((void)0)
1143 #define updateHWFPControl(x) ((void)0)
1144 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1145 
1146 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1147                                      int realloc); // forward declaration
1148 
/* Run a parallel region that has been serialized, so it runs in a team
   consisting only of the single master thread. */
1151 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1152   kmp_info_t *this_thr;
1153   kmp_team_t *serial_team;
1154 
1155   KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1156 
1157   /* Skip all this code for autopar serialized loops since it results in
1158      unacceptable overhead */
1159   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1160     return;
1161 
1162   if (!TCR_4(__kmp_init_parallel))
1163     __kmp_parallel_initialize();
1164 
1165   this_thr = __kmp_threads[global_tid];
1166   serial_team = this_thr->th.th_serial_team;
1167 
1168   /* utilize the serialized team held by this thread */
1169   KMP_DEBUG_ASSERT(serial_team);
1170   KMP_MB();
1171 
1172   if (__kmp_tasking_mode != tskm_immediate_exec) {
1173     KMP_DEBUG_ASSERT(
1174         this_thr->th.th_task_team ==
1175         this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1176     KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1177                      NULL);
1178     KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1179                   "team %p, new task_team = NULL\n",
1180                   global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1181     this_thr->th.th_task_team = NULL;
1182   }
1183 
1184 #if OMP_40_ENABLED
1185   kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1186   if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1187     proc_bind = proc_bind_false;
1188   } else if (proc_bind == proc_bind_default) {
1189     // No proc_bind clause was specified, so use the current value
1190     // of proc-bind-var for this parallel region.
1191     proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1192   }
1193   // Reset for next parallel region
1194   this_thr->th.th_set_proc_bind = proc_bind_default;
1195 #endif /* OMP_40_ENABLED */
1196 
1197 #if OMPT_SUPPORT
1198   ompt_data_t ompt_parallel_data;
1199   ompt_parallel_data.ptr = NULL;
1200   ompt_data_t *implicit_task_data;
1201   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1202   if (ompt_enabled.enabled &&
1203       this_thr->th.ompt_thread_info.state != omp_state_overhead) {
1204 
1205     ompt_task_info_t *parent_task_info;
1206     parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1207 
1208     parent_task_info->frame.enter_frame = OMPT_GET_FRAME_ADDRESS(1);
1209     if (ompt_enabled.ompt_callback_parallel_begin) {
1210       int team_size = 1;
1211 
1212       ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1213           &(parent_task_info->task_data), &(parent_task_info->frame),
1214           &ompt_parallel_data, team_size, ompt_invoker_program, codeptr);
1215     }
1216   }
1217 #endif // OMPT_SUPPORT
1218 
1219   if (this_thr->th.th_team != serial_team) {
1220     // Nested level will be an index in the nested nthreads array
1221     int level = this_thr->th.th_team->t.t_level;
1222 
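    // The thread's cached serial team can host only one serialization at a
    // time; if it is already marked serialized (e.g., in use by an enclosing
    // serialized region), allocate a fresh serial team and cache that instead.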
1223     if (serial_team->t.t_serialized) {
1224       /* this serial team was already used
         TODO: increase performance by making these locks more specific */
1226       kmp_team_t *new_team;
1227 
1228       __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1229 
1230       new_team = __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1231 #if OMPT_SUPPORT
1232                                      ompt_parallel_data,
1233 #endif
1234 #if OMP_40_ENABLED
1235                                      proc_bind,
1236 #endif
1237                                      &this_thr->th.th_current_task->td_icvs,
1238                                      0 USE_NESTED_HOT_ARG(NULL));
1239       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1240       KMP_ASSERT(new_team);
1241 
1242       /* setup new serialized team and install it */
1243       new_team->t.t_threads[0] = this_thr;
1244       new_team->t.t_parent = this_thr->th.th_team;
1245       serial_team = new_team;
1246       this_thr->th.th_serial_team = serial_team;
1247 
1248       KF_TRACE(
1249           10,
1250           ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1251            global_tid, serial_team));
1252 
1253       /* TODO the above breaks the requirement that if we run out of resources,
1254          then we can still guarantee that serialized teams are ok, since we may
1255          need to allocate a new one */
1256     } else {
1257       KF_TRACE(
1258           10,
1259           ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1260            global_tid, serial_team));
1261     }
1262 
1263     /* we have to initialize this serial team */
1264     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1265     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1266     KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1267     serial_team->t.t_ident = loc;
1268     serial_team->t.t_serialized = 1;
1269     serial_team->t.t_nproc = 1;
1270     serial_team->t.t_parent = this_thr->th.th_team;
1271     serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1272     this_thr->th.th_team = serial_team;
1273     serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1274 
    KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1276                   this_thr->th.th_current_task));
1277     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1278     this_thr->th.th_current_task->td_flags.executing = 0;
1279 
1280     __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1281 
1282     /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1283        implicit task for each serialized task represented by
1284        team->t.t_serialized? */
1285     copy_icvs(&this_thr->th.th_current_task->td_icvs,
1286               &this_thr->th.th_current_task->td_parent->td_icvs);
1287 
    // If a thread-count value exists in the nested nthreads array for the next
    // nesting level, use it
1290     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1291       this_thr->th.th_current_task->td_icvs.nproc =
1292           __kmp_nested_nth.nth[level + 1];
1293     }
1294 
1295 #if OMP_40_ENABLED
1296     if (__kmp_nested_proc_bind.used &&
1297         (level + 1 < __kmp_nested_proc_bind.used)) {
1298       this_thr->th.th_current_task->td_icvs.proc_bind =
1299           __kmp_nested_proc_bind.bind_types[level + 1];
1300     }
1301 #endif /* OMP_40_ENABLED */
1302 
1303 #if USE_DEBUGGER
1304     serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1305 #endif
1306     this_thr->th.th_info.ds.ds_tid = 0;
1307 
1308     /* set thread cache values */
1309     this_thr->th.th_team_nproc = 1;
1310     this_thr->th.th_team_master = this_thr;
1311     this_thr->th.th_team_serialized = 1;
1312 
1313     serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1314     serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1315 
1316     propagateFPControl(serial_team);
1317 
1318     /* check if we need to allocate dispatch buffers stack */
1319     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1320     if (!serial_team->t.t_dispatch->th_disp_buffer) {
1321       serial_team->t.t_dispatch->th_disp_buffer =
1322           (dispatch_private_info_t *)__kmp_allocate(
1323               sizeof(dispatch_private_info_t));
1324     }
1325     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1326 
1327     KMP_MB();
1328 
1329   } else {
1330     /* this serialized team is already being used,
1331      * that's fine, just add another nested level */
1332     KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1333     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1334     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1335     ++serial_team->t.t_serialized;
1336     this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1337 
1338     // Nested level will be an index in the nested nthreads array
1339     int level = this_thr->th.th_team->t.t_level;
    // If a thread-count value exists in the nested nthreads array for the next
    // nesting level, use it
1342     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1343       this_thr->th.th_current_task->td_icvs.nproc =
1344           __kmp_nested_nth.nth[level + 1];
1345     }
1346     serial_team->t.t_level++;
1347     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1348                   "of serial team %p to %d\n",
1349                   global_tid, serial_team, serial_team->t.t_level));
1350 
1351     /* allocate/push dispatch buffers stack */
1352     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1353     {
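      // Push a fresh dispatch buffer onto the serial team's singly-linked
      // list, one buffer per serialized nesting level.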
1354       dispatch_private_info_t *disp_buffer =
1355           (dispatch_private_info_t *)__kmp_allocate(
1356               sizeof(dispatch_private_info_t));
1357       disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1358       serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1359     }
1360     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1361 
1362     KMP_MB();
1363   }
1364 #if OMP_40_ENABLED
1365   KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1366 #endif
1367 
1368   if (__kmp_env_consistency_check)
1369     __kmp_push_parallel(global_tid, NULL);
1370 #if OMPT_SUPPORT
1371   serial_team->t.ompt_team_info.master_return_address = codeptr;
1372   if (ompt_enabled.enabled &&
1373       this_thr->th.ompt_thread_info.state != omp_state_overhead) {
1374     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = OMPT_GET_FRAME_ADDRESS(1);
1375 
1376     ompt_lw_taskteam_t lw_taskteam;
1377     __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1378                             &ompt_parallel_data, codeptr);
1379 
1380     __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
    // don't use lw_taskteam after linking; its content was swapped
1382 
1383     /* OMPT implicit task begin */
1384     implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
1385     if (ompt_enabled.ompt_callback_implicit_task) {
1386       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1387           ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1388           OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid));
1389     }
1390 
1391     /* OMPT state */
1392     this_thr->th.ompt_thread_info.state = omp_state_work_parallel;
1393     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = OMPT_GET_FRAME_ADDRESS(1);
1394   }
1395 #endif
1396 }
1397 
1398 /* most of the work for a fork */
1399 /* return true if we really went parallel, false if serialized */
1400 int __kmp_fork_call(ident_t *loc, int gtid,
1401                     enum fork_context_e call_context, // Intel, GNU, ...
1402                     kmp_int32 argc, microtask_t microtask, launch_t invoker,
1403 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1404 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1405                     va_list *ap
1406 #else
1407                     va_list ap
1408 #endif
1409                     ) {
1410   void **argv;
1411   int i;
1412   int master_tid;
1413   int master_this_cons;
1414   kmp_team_t *team;
1415   kmp_team_t *parent_team;
1416   kmp_info_t *master_th;
1417   kmp_root_t *root;
1418   int nthreads;
1419   int master_active;
1420   int master_set_numthreads;
1421   int level;
1422 #if OMP_40_ENABLED
1423   int active_level;
1424   int teams_level;
1425 #endif
1426 #if KMP_NESTED_HOT_TEAMS
1427   kmp_hot_team_ptr_t **p_hot_teams;
1428 #endif
1429   { // KMP_TIME_BLOCK
1430     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1431     KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1432 
1433     KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1434     if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1435       /* Some systems prefer the stack for the root thread(s) to start with */
1436       /* some gap from the parent stack to prevent false sharing. */
1437       void *dummy = KMP_ALLOCA(__kmp_stkpadding);
      /* These two lines keep the alloca above from being optimized out */
1439       if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1440         __kmp_stkpadding += (short)((kmp_int64)dummy);
1441     }
1442 
1443     /* initialize if needed */
1444     KMP_DEBUG_ASSERT(
1445         __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1446     if (!TCR_4(__kmp_init_parallel))
1447       __kmp_parallel_initialize();
1448 
1449     /* setup current data */
1450     master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1451     // shutdown
1452     parent_team = master_th->th.th_team;
1453     master_tid = master_th->th.th_info.ds.ds_tid;
1454     master_this_cons = master_th->th.th_local.this_construct;
1455     root = master_th->th.th_root;
1456     master_active = root->r.r_active;
1457     master_set_numthreads = master_th->th.th_set_nproc;
1458 
1459 #if OMPT_SUPPORT
1460     ompt_data_t ompt_parallel_data;
1461     ompt_parallel_data.ptr = NULL;
1462     ompt_data_t *parent_task_data;
1463     ompt_frame_t *ompt_frame;
1464     ompt_data_t *implicit_task_data;
1465     void *return_address = NULL;
1466 
1467     if (ompt_enabled.enabled) {
1468       __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1469                                     NULL, NULL);
1470       return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1471     }
1472 #endif
1473 
1474     // Nested level will be an index in the nested nthreads array
1475     level = parent_team->t.t_level;
1476     // used to launch non-serial teams even if nested is not allowed
1477     active_level = parent_team->t.t_active_level;
1478 #if OMP_40_ENABLED
1479     // needed to check nesting inside the teams
1480     teams_level = master_th->th.th_teams_level;
1481 #endif
1482 #if KMP_NESTED_HOT_TEAMS
1483     p_hot_teams = &master_th->th.th_hot_teams;
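    // Lazily allocate the hot teams array, one entry per nesting level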
1484     if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1485       *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1486           sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1487       (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
      // either the actual hot team or not needed (when active_level > 0)
1489       (*p_hot_teams)[0].hot_team_nth = 1;
1490     }
1491 #endif
1492 
1493 #if OMPT_SUPPORT
1494     if (ompt_enabled.enabled) {
1495       if (ompt_enabled.ompt_callback_parallel_begin) {
1496         int team_size = master_set_numthreads
1497                             ? master_set_numthreads
1498                             : get__nproc_2(parent_team, master_tid);
1499         ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1500             parent_task_data, ompt_frame, &ompt_parallel_data, team_size,
1501             OMPT_INVOKER(call_context), return_address);
1502       }
1503       master_th->th.ompt_thread_info.state = omp_state_overhead;
1504     }
1505 #endif
1506 
1507     master_th->th.th_ident = loc;
1508 
1509 #if OMP_40_ENABLED
1510     if (master_th->th.th_teams_microtask && ap &&
1511         microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
      // AC: This is the start of a parallel region nested inside a teams
      // construct. The team is actual (hot); all workers are ready at the
      // fork barrier. No lock is needed to initialize the team a bit, then
      // release the workers.
1515       parent_team->t.t_ident = loc;
1516       __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1517       parent_team->t.t_argc = argc;
1518       argv = (void **)parent_team->t.t_argv;
1519       for (i = argc - 1; i >= 0; --i)
1520 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1521 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1522         *argv++ = va_arg(*ap, void *);
1523 #else
1524         *argv++ = va_arg(ap, void *);
1525 #endif
      // Increment our nested depth level, but do not increase serialization
1527       if (parent_team == master_th->th.th_serial_team) {
1528         // AC: we are in serialized parallel
1529         __kmpc_serialized_parallel(loc, gtid);
1530         KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
        // AC: need this so that enquiry functions work
        // correctly; will restore at join time
1533         parent_team->t.t_serialized--;
1534 #if OMPT_SUPPORT
1535         void *dummy;
1536         void **exit_runtime_p;
1537 
1538         ompt_lw_taskteam_t lw_taskteam;
1539 
1540         if (ompt_enabled.enabled) {
1541           __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1542                                   &ompt_parallel_data, return_address);
1543           exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_frame);
1544 
1545           __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
          // do not use lw_taskteam after linking; its contents were swapped
1547 
1548           /* OMPT implicit task begin */
1549           implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1550           if (ompt_enabled.ompt_callback_implicit_task) {
1551             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1552                 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1553                 implicit_task_data, 1, __kmp_tid_from_gtid(gtid));
1554           }
1555 
1556           /* OMPT state */
1557           master_th->th.ompt_thread_info.state = omp_state_work_parallel;
1558         } else {
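          // OMPT is disabled at run time: point exit_runtime_p at a local
          // dummy so the unconditional store after the microtask is harmless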
1559           exit_runtime_p = &dummy;
1560         }
1561 #endif
1562 
1563         {
1564           KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1565           KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1566           __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1567 #if OMPT_SUPPORT
1568                                  ,
1569                                  exit_runtime_p
1570 #endif
1571                                  );
1572         }
1573 
1574 #if OMPT_SUPPORT
1575         *exit_runtime_p = NULL;
1576         if (ompt_enabled.enabled) {
1577           OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = NULL;
1578           if (ompt_enabled.ompt_callback_implicit_task) {
1579             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1580                 ompt_scope_end, NULL, implicit_task_data, 1,
1581                 __kmp_tid_from_gtid(gtid));
1582           }
1583           __ompt_lw_taskteam_unlink(master_th);
1584 
1585           if (ompt_enabled.ompt_callback_parallel_end) {
1586             ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1587                 OMPT_CUR_TEAM_DATA(master_th), OMPT_CUR_TASK_DATA(master_th),
1588                 OMPT_INVOKER(call_context), return_address);
1589           }
1590           master_th->th.ompt_thread_info.state = omp_state_overhead;
1591         }
1592 #endif
1593         return TRUE;
1594       }
1595 
1596       parent_team->t.t_pkfn = microtask;
1597       parent_team->t.t_invoke = invoker;
1598       KMP_TEST_THEN_INC32((kmp_int32 *)&root->r.r_in_parallel);
1599       parent_team->t.t_active_level++;
1600       parent_team->t.t_level++;
1601 
1602       /* Change number of threads in the team if requested */
1603       if (master_set_numthreads) { // The parallel has num_threads clause
1604         if (master_set_numthreads < master_th->th.th_teams_size.nth) {
          // AC: can only reduce the number of threads dynamically; cannot
          // increase it
1606           kmp_info_t **other_threads = parent_team->t.t_threads;
1607           parent_team->t.t_nproc = master_set_numthreads;
1608           for (i = 0; i < master_set_numthreads; ++i) {
1609             other_threads[i]->th.th_team_nproc = master_set_numthreads;
1610           }
1611           // Keep extra threads hot in the team for possible next parallels
1612         }
1613         master_th->th.th_set_nproc = 0;
1614       }
1615 
1616 #if USE_DEBUGGER
1617       if (__kmp_debugging) { // Let debugger override number of threads.
1618         int nth = __kmp_omp_num_threads(loc);
1619         if (nth > 0) { // 0 means debugger doesn't want to change num threads
1620           master_set_numthreads = nth;
1621         }
1622       }
1623 #endif
1624 
1625       KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1626                     "master_th=%p, gtid=%d\n",
1627                     root, parent_team, master_th, gtid));
1628       __kmp_internal_fork(loc, gtid, parent_team);
1629       KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1630                     "master_th=%p, gtid=%d\n",
1631                     root, parent_team, master_th, gtid));
1632 
1633       /* Invoke microtask for MASTER thread */
1634       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1635                     parent_team->t.t_id, parent_team->t.t_pkfn));
1636 
1637       {
1638         KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1639         KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1640         if (!parent_team->t.t_invoke(gtid)) {
1641           KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
1642         }
1643       }
1644       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1645                     parent_team->t.t_id, parent_team->t.t_pkfn));
1646       KMP_MB(); /* Flush all pending memory write invalidates.  */
1647 
1648       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1649 
1650       return TRUE;
1651     } // Parallel closely nested in teams construct
1652 #endif /* OMP_40_ENABLED */
1653 
1654 #if KMP_DEBUG
1655     if (__kmp_tasking_mode != tskm_immediate_exec) {
1656       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1657                        parent_team->t.t_task_team[master_th->th.th_task_state]);
1658     }
1659 #endif
1660 
1661     if (parent_team->t.t_active_level >=
1662         master_th->th.th_current_task->td_icvs.max_active_levels) {
1663       nthreads = 1;
1664     } else {
1665 #if OMP_40_ENABLED
1666       int enter_teams = ((ap == NULL && active_level == 0) ||
1667                          (ap && teams_level > 0 && teams_level == level));
1668 #endif
1669       nthreads =
1670           master_set_numthreads
1671               ? master_set_numthreads
1672               : get__nproc_2(
1673                     parent_team,
1674                     master_tid); // TODO: get nproc directly from current task
1675 
      // Check whether we need to take the forkjoin lock (no need for a
      // serialized parallel outside of a teams construct). This code was moved
      // here from __kmp_reserve_threads() to speed up nested serialized
      // parallel regions.
1679       if (nthreads > 1) {
1680         if ((!get__nested(master_th) && (root->r.r_in_parallel
1681 #if OMP_40_ENABLED
1682                                          && !enter_teams
1683 #endif /* OMP_40_ENABLED */
1684                                          )) ||
1685             (__kmp_library == library_serial)) {
1686           KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1687                         " threads\n",
1688                         gtid, nthreads));
1689           nthreads = 1;
1690         }
1691       }
1692       if (nthreads > 1) {
1693         /* determine how many new threads we can use */
1694         __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1695         nthreads = __kmp_reserve_threads(
1696             root, parent_team, master_tid, nthreads
1697 #if OMP_40_ENABLED
1698             /* AC: If we execute teams from parallel region (on host), then
1699                teams should be created but each can only have 1 thread if
1700                nesting is disabled. If teams called from serial region, then
1701                teams and their threads should be created regardless of the
1702                nesting setting. */
1703             ,
1704             enter_teams
1705 #endif /* OMP_40_ENABLED */
1706             );
1707         if (nthreads == 1) {
          // Release the lock for single-threaded execution here; for
          // multi-threaded execution it will be released later, after the
          // team of threads has been created and initialized
1711           __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1712         }
1713       }
1714     }
1715     KMP_DEBUG_ASSERT(nthreads > 0);
1716 
1717     // If we temporarily changed the set number of threads then restore it now
1718     master_th->th.th_set_nproc = 0;
1719 
1720     /* create a serialized parallel region? */
1721     if (nthreads == 1) {
1722 /* josh todo: hypothetical question: what do we do for OS X*? */
1723 #if KMP_OS_LINUX &&                                                            \
1724     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1725       void *args[argc];
1726 #else
1727       void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1728 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1729           KMP_ARCH_AARCH64) */
1730 
1731       KA_TRACE(20,
1732                ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1733 
1734       __kmpc_serialized_parallel(loc, gtid);
1735 
1736       if (call_context == fork_context_intel) {
1737         /* TODO this sucks, use the compiler itself to pass args! :) */
1738         master_th->th.th_serial_team->t.t_ident = loc;
1739 #if OMP_40_ENABLED
1740         if (!ap) {
1741           // revert change made in __kmpc_serialized_parallel()
1742           master_th->th.th_serial_team->t.t_level--;
1743 // Get args from parent team for teams construct
1744 
1745 #if OMPT_SUPPORT
1746           void *dummy;
1747           void **exit_runtime_p;
1748           ompt_task_info_t *task_info;
1749 
1750           ompt_lw_taskteam_t lw_taskteam;
1751 
1752           if (ompt_enabled.enabled) {
1753             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1754                                     &ompt_parallel_data, return_address);
1755 
1756             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
            // do not use lw_taskteam after linking; its contents were swapped
1758 
1759             task_info = OMPT_CUR_TASK_INFO(master_th);
1760             exit_runtime_p = &(task_info->frame.exit_frame);
1761             if (ompt_enabled.ompt_callback_implicit_task) {
1762               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1763                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1764                   &(task_info->task_data), 1, __kmp_tid_from_gtid(gtid));
1765             }
1766 
1767             /* OMPT state */
1768             master_th->th.ompt_thread_info.state = omp_state_work_parallel;
1769           } else {
1770             exit_runtime_p = &dummy;
1771           }
1772 #endif
1773 
1774           {
1775             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1776             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1777             __kmp_invoke_microtask(microtask, gtid, 0, argc,
1778                                    parent_team->t.t_argv
1779 #if OMPT_SUPPORT
1780                                    ,
1781                                    exit_runtime_p
1782 #endif
1783                                    );
1784           }
1785 
1786 #if OMPT_SUPPORT
1787           if (ompt_enabled.enabled) {
1788             exit_runtime_p = NULL;
1789             if (ompt_enabled.ompt_callback_implicit_task) {
1790               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1791                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1792                   __kmp_tid_from_gtid(gtid));
1793             }
1794 
1795             __ompt_lw_taskteam_unlink(master_th);
1796             if (ompt_enabled.ompt_callback_parallel_end) {
1797               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1798                   OMPT_CUR_TEAM_DATA(master_th), parent_task_data,
1799                   OMPT_INVOKER(call_context), return_address);
1800             }
1801             master_th->th.ompt_thread_info.state = omp_state_overhead;
1802           }
1803 #endif
1804         } else if (microtask == (microtask_t)__kmp_teams_master) {
1805           KMP_DEBUG_ASSERT(master_th->th.th_team ==
1806                            master_th->th.th_serial_team);
1807           team = master_th->th.th_team;
1808           // team->t.t_pkfn = microtask;
1809           team->t.t_invoke = invoker;
1810           __kmp_alloc_argv_entries(argc, team, TRUE);
1811           team->t.t_argc = argc;
1812           argv = (void **)team->t.t_argv;
1813           if (ap) {
1814             for (i = argc - 1; i >= 0; --i)
1815 // TODO: revert workaround for Intel(R) 64 tracker #96
1816 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1817               *argv++ = va_arg(*ap, void *);
1818 #else
1819               *argv++ = va_arg(ap, void *);
1820 #endif
1821           } else {
1822             for (i = 0; i < argc; ++i)
1823               // Get args from parent team for teams construct
1824               argv[i] = parent_team->t.t_argv[i];
1825           }
1826           // AC: revert change made in __kmpc_serialized_parallel()
1827           //     because initial code in teams should have level=0
1828           team->t.t_level--;
1829           // AC: call special invoker for outer "parallel" of teams construct
1830           {
1831             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1832             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1833             invoker(gtid);
1834           }
1835         } else {
1836 #endif /* OMP_40_ENABLED */
1837           argv = args;
1838           for (i = argc - 1; i >= 0; --i)
1839 // TODO: revert workaround for Intel(R) 64 tracker #96
1840 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1841             *argv++ = va_arg(*ap, void *);
1842 #else
1843           *argv++ = va_arg(ap, void *);
1844 #endif
1845           KMP_MB();
1846 
1847 #if OMPT_SUPPORT
1848           void *dummy;
1849           void **exit_runtime_p;
1850           ompt_task_info_t *task_info;
1851 
1852           ompt_lw_taskteam_t lw_taskteam;
1853 
1854           if (ompt_enabled.enabled) {
1855             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1856                                     &ompt_parallel_data, return_address);
1857             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
            // do not use lw_taskteam after linking; its contents were swapped
1859             task_info = OMPT_CUR_TASK_INFO(master_th);
1860             exit_runtime_p = &(task_info->frame.exit_frame);
1861 
1862             /* OMPT implicit task begin */
1863             implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1864             if (ompt_enabled.ompt_callback_implicit_task) {
1865               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1866                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1867                   implicit_task_data, 1, __kmp_tid_from_gtid(gtid));
1868             }
1869 
1870             /* OMPT state */
1871             master_th->th.ompt_thread_info.state = omp_state_work_parallel;
1872           } else {
1873             exit_runtime_p = &dummy;
1874           }
1875 #endif
1876 
1877           {
1878             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1879             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1880             __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1881 #if OMPT_SUPPORT
1882                                    ,
1883                                    exit_runtime_p
1884 #endif
1885                                    );
1886           }
1887 
1888 #if OMPT_SUPPORT
1889           if (ompt_enabled.enabled) {
1890             *exit_runtime_p = NULL;
1891             if (ompt_enabled.ompt_callback_implicit_task) {
1892               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1893                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1894                   __kmp_tid_from_gtid(gtid));
1895             }
1896 
1897             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1898             __ompt_lw_taskteam_unlink(master_th);
1899             if (ompt_enabled.ompt_callback_parallel_end) {
1900               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1901                   &ompt_parallel_data, parent_task_data,
1902                   OMPT_INVOKER(call_context), return_address);
1903             }
1904             master_th->th.ompt_thread_info.state = omp_state_overhead;
1905           }
1906 #endif
1907 #if OMP_40_ENABLED
1908         }
1909 #endif /* OMP_40_ENABLED */
1910       } else if (call_context == fork_context_gnu) {
1911 #if OMPT_SUPPORT
1912         ompt_lw_taskteam_t lwt;
1913         __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1914                                 return_address);
1915 
1916         lwt.ompt_task_info.frame.exit_frame = NULL;
1917         __ompt_lw_taskteam_link(&lwt, master_th, 1);
// do not use lw_taskteam after linking; its contents were swapped
1919 #endif
1920 
1921         // we were called from GNU native code
1922         KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1923         return FALSE;
1924       } else {
1925         KMP_ASSERT2(call_context < fork_context_last,
1926                     "__kmp_fork_call: unknown fork_context parameter");
1927       }
1928 
1929       KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1930       KMP_MB();
1931       return FALSE;
1932     }
1933 
    // GEH: only modify the executing flag in the case when not serialized;
    //      the serialized case is handled in __kmpc_serialized_parallel
1936     KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1937                   "curtask=%p, curtask_max_aclevel=%d\n",
1938                   parent_team->t.t_active_level, master_th,
1939                   master_th->th.th_current_task,
1940                   master_th->th.th_current_task->td_icvs.max_active_levels));
1941     // TODO: GEH - cannot do this assertion because root thread not set up as
1942     // executing
1943     // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1944     master_th->th.th_current_task->td_flags.executing = 0;
1945 
1946 #if OMP_40_ENABLED
1947     if (!master_th->th.th_teams_microtask || level > teams_level)
1948 #endif /* OMP_40_ENABLED */
1949     {
1950       /* Increment our nested depth level */
1951       KMP_TEST_THEN_INC32((kmp_int32 *)&root->r.r_in_parallel);
1952     }
1953 
1954     // See if we need to make a copy of the ICVs.
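    // A nonzero nthreads_icv below overrides nproc in the new team's ICVs;
    // zero means keep the parent's value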
1955     int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1956     if ((level + 1 < __kmp_nested_nth.used) &&
1957         (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1958       nthreads_icv = __kmp_nested_nth.nth[level + 1];
1959     } else {
1960       nthreads_icv = 0; // don't update
1961     }
1962 
1963 #if OMP_40_ENABLED
1964     // Figure out the proc_bind_policy for the new team.
1965     kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1966     kmp_proc_bind_t proc_bind_icv =
1967         proc_bind_default; // proc_bind_default means don't update
1968     if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1969       proc_bind = proc_bind_false;
1970     } else {
1971       if (proc_bind == proc_bind_default) {
1972         // No proc_bind clause specified; use current proc-bind-var for this
1973         // parallel region
1974         proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1975       }
1976       /* else: The proc_bind policy was specified explicitly on parallel clause.
1977          This overrides proc-bind-var for this parallel region, but does not
1978          change proc-bind-var. */
      // Figure out the value of proc-bind-var for the child threads.
1980       if ((level + 1 < __kmp_nested_proc_bind.used) &&
1981           (__kmp_nested_proc_bind.bind_types[level + 1] !=
1982            master_th->th.th_current_task->td_icvs.proc_bind)) {
1983         proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1984       }
1985     }
1986 
1987     // Reset for next parallel region
1988     master_th->th.th_set_proc_bind = proc_bind_default;
1989 #endif /* OMP_40_ENABLED */
1990 
1991     if ((nthreads_icv > 0)
1992 #if OMP_40_ENABLED
1993         || (proc_bind_icv != proc_bind_default)
1994 #endif /* OMP_40_ENABLED */
1995             ) {
1996       kmp_internal_control_t new_icvs;
1997       copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
1998       new_icvs.next = NULL;
1999       if (nthreads_icv > 0) {
2000         new_icvs.nproc = nthreads_icv;
2001       }
2002 
2003 #if OMP_40_ENABLED
2004       if (proc_bind_icv != proc_bind_default) {
2005         new_icvs.proc_bind = proc_bind_icv;
2006       }
2007 #endif /* OMP_40_ENABLED */
2008 
2009       /* allocate a new parallel team */
2010       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2011       team = __kmp_allocate_team(root, nthreads, nthreads,
2012 #if OMPT_SUPPORT
2013                                  ompt_parallel_data,
2014 #endif
2015 #if OMP_40_ENABLED
2016                                  proc_bind,
2017 #endif
2018                                  &new_icvs, argc USE_NESTED_HOT_ARG(master_th));
2019     } else {
2020       /* allocate a new parallel team */
2021       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2022       team = __kmp_allocate_team(root, nthreads, nthreads,
2023 #if OMPT_SUPPORT
2024                                  ompt_parallel_data,
2025 #endif
2026 #if OMP_40_ENABLED
2027                                  proc_bind,
2028 #endif
2029                                  &master_th->th.th_current_task->td_icvs,
2030                                  argc USE_NESTED_HOT_ARG(master_th));
2031     }
2032     KF_TRACE(
2033         10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2034 
2035     /* setup the new team */
2036     KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2037     KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2038     KMP_CHECK_UPDATE(team->t.t_ident, loc);
2039     KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2040     KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2041 #if OMPT_SUPPORT
2042     KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2043                           return_address);
2044 #endif
2045     KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2046 // TODO: parent_team->t.t_level == INT_MAX ???
2047 #if OMP_40_ENABLED
2048     if (!master_th->th.th_teams_microtask || level > teams_level) {
2049 #endif /* OMP_40_ENABLED */
2050       int new_level = parent_team->t.t_level + 1;
2051       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2052       new_level = parent_team->t.t_active_level + 1;
2053       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2054 #if OMP_40_ENABLED
2055     } else {
2056       // AC: Do not increase parallel level at start of the teams construct
2057       int new_level = parent_team->t.t_level;
2058       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2059       new_level = parent_team->t.t_active_level;
2060       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2061     }
2062 #endif /* OMP_40_ENABLED */
2063     kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2064     // set master's schedule as new run-time schedule
2065     KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2066 
2067 #if OMP_40_ENABLED
2068     KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2069 #endif
2070 
2071     // Update the floating point rounding in the team if required.
2072     propagateFPControl(team);
2073 
2074     if (__kmp_tasking_mode != tskm_immediate_exec) {
      // Set the master's task team to the team's task team. Unless this is a
      // hot team, it should be NULL.
2077       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2078                        parent_team->t.t_task_team[master_th->th.th_task_state]);
2079       KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
2080                     "%p, new task_team %p / team %p\n",
2081                     __kmp_gtid_from_thread(master_th),
2082                     master_th->th.th_task_team, parent_team,
2083                     team->t.t_task_team[master_th->th.th_task_state], team));
2084 
2085       if (active_level || master_th->th.th_task_team) {
2086         // Take a memo of master's task_state
2087         KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2088         if (master_th->th.th_task_state_top >=
2089             master_th->th.th_task_state_stack_sz) { // increase size
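          // Double the memo stack, copy the saved task states over, and
          // zero-initialize the new tail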
2090           kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2091           kmp_uint8 *old_stack, *new_stack;
2092           kmp_uint32 i;
2093           new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2094           for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2095             new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2096           }
2097           for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2098                ++i) { // zero-init rest of stack
2099             new_stack[i] = 0;
2100           }
2101           old_stack = master_th->th.th_task_state_memo_stack;
2102           master_th->th.th_task_state_memo_stack = new_stack;
2103           master_th->th.th_task_state_stack_sz = new_size;
2104           __kmp_free(old_stack);
2105         }
2106         // Store master's task_state on stack
2107         master_th->th
2108             .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2109             master_th->th.th_task_state;
2110         master_th->th.th_task_state_top++;
2111 #if KMP_NESTED_HOT_TEAMS
2112         if (team == master_th->th.th_hot_teams[active_level].hot_team) {
2113           // Restore master's nested state if nested hot team
2114           master_th->th.th_task_state =
2115               master_th->th
2116                   .th_task_state_memo_stack[master_th->th.th_task_state_top];
2117         } else {
2118 #endif
2119           master_th->th.th_task_state = 0;
2120 #if KMP_NESTED_HOT_TEAMS
2121         }
2122 #endif
2123       }
2124 #if !KMP_NESTED_HOT_TEAMS
2125       KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2126                        (team == root->r.r_hot_team));
2127 #endif
2128     }
2129 
2130     KA_TRACE(
2131         20,
2132         ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2133          gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2134          team->t.t_nproc));
2135     KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2136                      (team->t.t_master_tid == 0 &&
2137                       (team->t.t_parent == root->r.r_root_team ||
2138                        team->t.t_parent->t.t_serialized)));
2139     KMP_MB();
2140 
2141     /* now, setup the arguments */
2142     argv = (void **)team->t.t_argv;
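    // Copy the microtask arguments into the team's argv; KMP_CHECK_UPDATE
    // writes only when a value actually changes, to avoid needless cache
    // traffic on reused (hot) teams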
2143 #if OMP_40_ENABLED
2144     if (ap) {
2145 #endif /* OMP_40_ENABLED */
2146       for (i = argc - 1; i >= 0; --i) {
2147 // TODO: revert workaround for Intel(R) 64 tracker #96
2148 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
2149         void *new_argv = va_arg(*ap, void *);
2150 #else
2151       void *new_argv = va_arg(ap, void *);
2152 #endif
2153         KMP_CHECK_UPDATE(*argv, new_argv);
2154         argv++;
2155       }
2156 #if OMP_40_ENABLED
2157     } else {
2158       for (i = 0; i < argc; ++i) {
2159         // Get args from parent team for teams construct
2160         KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2161       }
2162     }
2163 #endif /* OMP_40_ENABLED */
2164 
2165     /* now actually fork the threads */
2166     KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2167     if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2168       root->r.r_active = TRUE;
2169 
2170     __kmp_fork_team_threads(root, team, master_th, gtid);
2171     __kmp_setup_icv_copy(team, nthreads,
2172                          &master_th->th.th_current_task->td_icvs, loc);
2173 
2174 #if OMPT_SUPPORT
2175     master_th->th.ompt_thread_info.state = omp_state_work_parallel;
2176 #endif
2177 
2178     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2179 
2180 #if USE_ITT_BUILD
2181     if (team->t.t_active_level == 1 // only report frames at level 1
2182 #if OMP_40_ENABLED
2183         && !master_th->th.th_teams_microtask // not in teams construct
2184 #endif /* OMP_40_ENABLED */
2185         ) {
2186 #if USE_ITT_NOTIFY
2187       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2188           (__kmp_forkjoin_frames_mode == 3 ||
2189            __kmp_forkjoin_frames_mode == 1)) {
2190         kmp_uint64 tmp_time = 0;
2191         if (__itt_get_timestamp_ptr)
2192           tmp_time = __itt_get_timestamp();
2193         // Internal fork - report frame begin
2194         master_th->th.th_frame_time = tmp_time;
2195         if (__kmp_forkjoin_frames_mode == 3)
2196           team->t.t_region_time = tmp_time;
2197       } else
2198 // only one notification scheme (either "submit" or "forking/joined", not both)
2199 #endif /* USE_ITT_NOTIFY */
2200           if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2201               __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2202         // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2203         __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2204       }
2205     }
2206 #endif /* USE_ITT_BUILD */
2207 
2208     /* now go on and do the work */
2209     KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2210     KMP_MB();
2211     KF_TRACE(10,
2212              ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2213               root, team, master_th, gtid));
2214 
2215 #if USE_ITT_BUILD
2216     if (__itt_stack_caller_create_ptr) {
2217       team->t.t_stack_id =
2218           __kmp_itt_stack_caller_create(); // create new stack stitching id
2219       // before entering fork barrier
2220     }
2221 #endif /* USE_ITT_BUILD */
2222 
2223 #if OMP_40_ENABLED
    // AC: skip __kmp_internal_fork at the teams construct; let only the
    // master threads execute
2226     if (ap)
2227 #endif /* OMP_40_ENABLED */
2228     {
2229       __kmp_internal_fork(loc, gtid, team);
2230       KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2231                     "master_th=%p, gtid=%d\n",
2232                     root, team, master_th, gtid));
2233     }
2234 
2235     if (call_context == fork_context_gnu) {
2236       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2237       return TRUE;
2238     }
2239 
2240     /* Invoke microtask for MASTER thread */
2241     KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2242                   team->t.t_id, team->t.t_pkfn));
2243   } // END of timer KMP_fork_call block
2244 
2245   {
2246     KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
2247     KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
2248     if (!team->t.t_invoke(gtid)) {
2249       KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
2250     }
2251   }
2252   KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2253                 team->t.t_id, team->t.t_pkfn));
2254   KMP_MB(); /* Flush all pending memory write invalidates.  */
2255 
2256   KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2257 
2258 #if OMPT_SUPPORT
2259   if (ompt_enabled.enabled) {
2260     master_th->th.ompt_thread_info.state = omp_state_overhead;
2261   }
2262 #endif
2263 
2264   return TRUE;
2265 }
2266 
2267 #if OMPT_SUPPORT
2268 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2269                                             kmp_team_t *team) {
2270   // restore state outside the region
2271   thread->th.ompt_thread_info.state =
2272       ((team->t.t_serialized) ? omp_state_work_serial
2273                               : omp_state_work_parallel);
2274 }
2275 
2276 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2277                                    kmp_team_t *team, ompt_data_t *parallel_data,
2278                                    fork_context_e fork_context, void *codeptr) {
2279   ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2280   if (ompt_enabled.ompt_callback_parallel_end) {
2281     ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2282         parallel_data, &(task_info->task_data), OMPT_INVOKER(fork_context),
2283         codeptr);
2284   }
2285 
2286   task_info->frame.enter_frame = NULL;
2287   __kmp_join_restore_state(thread, team);
2288 }
2289 #endif
2290 
2291 void __kmp_join_call(ident_t *loc, int gtid
2292 #if OMPT_SUPPORT
2293                      ,
2294                      enum fork_context_e fork_context
2295 #endif
2296 #if OMP_40_ENABLED
2297                      ,
2298                      int exit_teams
2299 #endif /* OMP_40_ENABLED */
2300                      ) {
2301   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2302   kmp_team_t *team;
2303   kmp_team_t *parent_team;
2304   kmp_info_t *master_th;
2305   kmp_root_t *root;
2306   int master_active;
2307   int i;
2308 
2309   KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2310 
2311   /* setup current data */
2312   master_th = __kmp_threads[gtid];
2313   root = master_th->th.th_root;
2314   team = master_th->th.th_team;
2315   parent_team = team->t.t_parent;
2316 
2317   master_th->th.th_ident = loc;
2318 
2319 #if OMPT_SUPPORT
2320   if (ompt_enabled.enabled) {
2321     master_th->th.ompt_thread_info.state = omp_state_overhead;
2322   }
2323 #endif
2324 
2325 #if KMP_DEBUG
2326   if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2327     KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2328                   "th_task_team = %p\n",
2329                   __kmp_gtid_from_thread(master_th), team,
2330                   team->t.t_task_team[master_th->th.th_task_state],
2331                   master_th->th.th_task_team));
2332     KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2333                      team->t.t_task_team[master_th->th.th_task_state]);
2334   }
2335 #endif
2336 
2337   if (team->t.t_serialized) {
2338 #if OMP_40_ENABLED
2339     if (master_th->th.th_teams_microtask) {
2340       // We are in teams construct
2341       int level = team->t.t_level;
2342       int tlevel = master_th->th.th_teams_level;
2343       if (level == tlevel) {
        // AC: we did not increment it earlier at the start of the teams
        //     construct, so do it here, at the end of the teams construct
2346         team->t.t_level++;
2347       } else if (level == tlevel + 1) {
2348         // AC: we are exiting parallel inside teams, need to increment
2349         // serialization in order to restore it in the next call to
2350         // __kmpc_end_serialized_parallel
2351         team->t.t_serialized++;
2352       }
2353     }
2354 #endif /* OMP_40_ENABLED */
2355     __kmpc_end_serialized_parallel(loc, gtid);
2356 
2357 #if OMPT_SUPPORT
2358     if (ompt_enabled.enabled) {
2359       __kmp_join_restore_state(master_th, parent_team);
2360     }
2361 #endif
2362 
2363     return;
2364   }
2365 
2366   master_active = team->t.t_master_active;
2367 
2368 #if OMP_40_ENABLED
2369   if (!exit_teams)
2370 #endif /* OMP_40_ENABLED */
2371   {
    // AC: No barrier for internal teams at exit from the teams construct,
    //     but there is a barrier for the external team (league).
2374     __kmp_internal_join(loc, gtid, team);
2375   }
2376 #if OMP_40_ENABLED
2377   else {
2378     master_th->th.th_task_state =
2379         0; // AC: no tasking in teams (out of any parallel)
2380   }
2381 #endif /* OMP_40_ENABLED */
2382 
2383   KMP_MB();
2384 
2385 #if OMPT_SUPPORT
2386   ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2387   void *codeptr = team->t.ompt_team_info.master_return_address;
2388 #endif
2389 
2390 #if USE_ITT_BUILD
2391   if (__itt_stack_caller_create_ptr) {
2392     __kmp_itt_stack_caller_destroy(
2393         (__itt_caller)team->t
2394             .t_stack_id); // destroy the stack stitching id after join barrier
2395   }
2396 
2397   // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2398   if (team->t.t_active_level == 1
2399 #if OMP_40_ENABLED
2400       && !master_th->th.th_teams_microtask /* not in teams construct */
2401 #endif /* OMP_40_ENABLED */
2402       ) {
2403     master_th->th.th_ident = loc;
2404     // only one notification scheme (either "submit" or "forking/joined", not
2405     // both)
2406     if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2407         __kmp_forkjoin_frames_mode == 3)
2408       __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2409                              master_th->th.th_frame_time, 0, loc,
2410                              master_th->th.th_team_nproc, 1);
2411     else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2412              !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2413       __kmp_itt_region_joined(gtid);
2414   } // active_level == 1
2415 #endif /* USE_ITT_BUILD */
2416 
2417 #if OMP_40_ENABLED
2418   if (master_th->th.th_teams_microtask && !exit_teams &&
2419       team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2420       team->t.t_level == master_th->th.th_teams_level + 1) {
    // AC: We need to leave the team structure intact at the end of a parallel
    // region inside the teams construct, so that the same (hot) team works at
    // the next parallel region; only adjust nesting levels
2424 
2425     /* Decrement our nested depth level */
2426     team->t.t_level--;
2427     team->t.t_active_level--;
2428     KMP_TEST_THEN_DEC32((kmp_int32 *)&root->r.r_in_parallel);
2429 
2430     /* Restore number of threads in the team if needed */
2431     if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2432       int old_num = master_th->th.th_team_nproc;
2433       int new_num = master_th->th.th_teams_size.nth;
2434       kmp_info_t **other_threads = team->t.t_threads;
2435       team->t.t_nproc = new_num;
2436       for (i = 0; i < old_num; ++i) {
2437         other_threads[i]->th.th_team_nproc = new_num;
2438       }
2439       // Adjust states of non-used threads of the team
2440       for (i = old_num; i < new_num; ++i) {
2441         // Re-initialize thread's barrier data.
2442         int b;
2443         kmp_balign_t *balign = other_threads[i]->th.th_bar;
2444         for (b = 0; b < bs_last_barrier; ++b) {
2445           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2446           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2447 #if USE_DEBUGGER
2448           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2449 #endif
2450         }
2451         if (__kmp_tasking_mode != tskm_immediate_exec) {
2452           // Synchronize thread's task state
2453           other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2454         }
2455       }
2456     }
2457 
2458 #if OMPT_SUPPORT
2459     if (ompt_enabled.enabled) {
2460       __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
2461                       codeptr);
2462     }
2463 #endif
2464 
2465     return;
2466   }
2467 #endif /* OMP_40_ENABLED */
2468 
2469   /* do cleanup and restore the parent team */
2470   master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2471   master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2472 
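  // Point the master's dispatch back at its slot in the parent team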
2473   master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2474 
2475   /* jc: The following lock has instructions with REL and ACQ semantics,
2476      separating the parallel user code called in this parallel region
2477      from the serial user code called after this function returns. */
2478   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2479 
2480 #if OMP_40_ENABLED
2481   if (!master_th->th.th_teams_microtask ||
2482       team->t.t_level > master_th->th.th_teams_level)
2483 #endif /* OMP_40_ENABLED */
2484   {
2485     /* Decrement our nested depth level */
2486     KMP_TEST_THEN_DEC32((kmp_int32 *)&root->r.r_in_parallel);
2487   }
2488   KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2489 
2490 #if OMPT_SUPPORT
2491   if (ompt_enabled.enabled) {
2492     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2493     if (ompt_enabled.ompt_callback_implicit_task) {
2494       int ompt_team_size = team->t.t_nproc;
2495       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2496           ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2497           __kmp_tid_from_gtid(gtid));
2498     }
2499 
2500     task_info->frame.exit_frame = NULL;
2501     task_info->task_data = ompt_data_none;
2502   }
2503 #endif
2504 
2505   KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2506                 master_th, team));
2507   __kmp_pop_current_task_from_thread(master_th);
2508 
2509 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
2510   // Restore master thread's partition.
2511   master_th->th.th_first_place = team->t.t_first_place;
2512   master_th->th.th_last_place = team->t.t_last_place;
2513 #endif /* OMP_40_ENABLED */
2514 
2515   updateHWFPControl(team);
2516 
2517   if (root->r.r_active != master_active)
2518     root->r.r_active = master_active;
2519 
2520   __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2521                             master_th)); // this will free worker threads
2522 
  /* This race was fun to find. Make sure the following is inside the critical
     region; otherwise assertions may fail occasionally because the old team
     may be reallocated and the hierarchy appears inconsistent. It is actually
     safe to run and won't cause any bugs, but it will cause those assertion
     failures. It's only one dereference and assignment, so we might as well
     keep it in the critical region. */
2528   master_th->th.th_team = parent_team;
2529   master_th->th.th_team_nproc = parent_team->t.t_nproc;
2530   master_th->th.th_team_master = parent_team->t.t_threads[0];
2531   master_th->th.th_team_serialized = parent_team->t.t_serialized;
2532 
2533   /* restore serialized team, if need be */
2534   if (parent_team->t.t_serialized &&
2535       parent_team != master_th->th.th_serial_team &&
2536       parent_team != root->r.r_root_team) {
2537     __kmp_free_team(root,
2538                     master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2539     master_th->th.th_serial_team = parent_team;
2540   }
2541 
2542   if (__kmp_tasking_mode != tskm_immediate_exec) {
2543     if (master_th->th.th_task_state_top >
2544         0) { // Restore task state from memo stack
2545       KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2546       // Remember master's state if we re-use this nested hot team
2547       master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2548           master_th->th.th_task_state;
2549       --master_th->th.th_task_state_top; // pop
2550       // Now restore state at this level
2551       master_th->th.th_task_state =
2552           master_th->th
2553               .th_task_state_memo_stack[master_th->th.th_task_state_top];
2554     }
2555     // Copy the task team from the parent team to the master thread
2556     master_th->th.th_task_team =
2557         parent_team->t.t_task_team[master_th->th.th_task_state];
2558     KA_TRACE(20,
2559              ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2560               __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2561               parent_team));
2562   }
2563 
2564   // TODO: GEH - cannot do this assertion because root thread not set up as
2565   // executing
2566   // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2567   master_th->th.th_current_task->td_flags.executing = 1;
2568 
2569   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2570 
2571 #if OMPT_SUPPORT
2572   if (ompt_enabled.enabled) {
2573     __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
2574                     codeptr);
2575   }
2576 #endif
2577 
2578   KMP_MB();
2579   KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2580 }
2581 
2582 /* Check whether we should push an internal control record onto the
2583    serial team stack.  If so, do it.  */
2584 void __kmp_save_internal_controls(kmp_info_t *thread) {
2585 
2586   if (thread->th.th_team != thread->th.th_serial_team) {
2587     return;
2588   }
2589   if (thread->th.th_team->t.t_serialized > 1) {
2590     int push = 0;
2591 
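    // Push at most one record per serialized nesting level: push only if the
    // stack is empty or its top entry was saved at a different nesting level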
2592     if (thread->th.th_team->t.t_control_stack_top == NULL) {
2593       push = 1;
2594     } else {
2595       if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2596           thread->th.th_team->t.t_serialized) {
2597         push = 1;
2598       }
2599     }
2600     if (push) { /* push a record on the serial team's stack */
2601       kmp_internal_control_t *control =
2602           (kmp_internal_control_t *)__kmp_allocate(
2603               sizeof(kmp_internal_control_t));
2604 
2605       copy_icvs(control, &thread->th.th_current_task->td_icvs);
2606 
2607       control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2608 
2609       control->next = thread->th.th_team->t.t_control_stack_top;
2610       thread->th.th_team->t.t_control_stack_top = control;
2611     }
2612   }
2613 }
2614 
2615 /* Changes set_nproc */
2616 void __kmp_set_num_threads(int new_nth, int gtid) {
2617   kmp_info_t *thread;
2618   kmp_root_t *root;
2619 
2620   KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2621   KMP_DEBUG_ASSERT(__kmp_init_serial);
2622 
2623   if (new_nth < 1)
2624     new_nth = 1;
2625   else if (new_nth > __kmp_max_nth)
2626     new_nth = __kmp_max_nth;
2627 
2628   KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2629   thread = __kmp_threads[gtid];
2630 
2631   __kmp_save_internal_controls(thread);
2632 
2633   set__nproc(thread, new_nth);
2634 
2635   // If this omp_set_num_threads() call will cause the hot team size to be
2636   // reduced (in the absence of a num_threads clause), then reduce it now,
2637   // rather than waiting for the next parallel region.
2638   root = thread->th.th_root;
2639   if (__kmp_init_parallel && (!root->r.r_active) &&
2640       (root->r.r_hot_team->t.t_nproc > new_nth)
2641 #if KMP_NESTED_HOT_TEAMS
2642       && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2643 #endif
2644       ) {
2645     kmp_team_t *hot_team = root->r.r_hot_team;
2646     int f;
2647 
2648     __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2649 
2650     // Release the extra threads we don't need any more.
2651     for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2652       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2653       if (__kmp_tasking_mode != tskm_immediate_exec) {
2654         // When decreasing team size, threads no longer in the team should unref
2655         // task team.
2656         hot_team->t.t_threads[f]->th.th_task_team = NULL;
2657       }
2658       __kmp_free_thread(hot_team->t.t_threads[f]);
2659       hot_team->t.t_threads[f] = NULL;
2660     }
2661     hot_team->t.t_nproc = new_nth;
2662 #if KMP_NESTED_HOT_TEAMS
2663     if (thread->th.th_hot_teams) {
2664       KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2665       thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2666     }
2667 #endif
2668 
2669     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2670 
2671     // Update the t_nproc field in the threads that are still active.
2672     for (f = 0; f < new_nth; f++) {
2673       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2674       hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2675     }
    // Special flag indicating the size change came from omp_set_num_threads()
2677     hot_team->t.t_size_changed = -1;
2678   }
2679 }
2680 
2681 /* Changes max_active_levels */
2682 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2683   kmp_info_t *thread;
2684 
2685   KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2686                 "%d = (%d)\n",
2687                 gtid, max_active_levels));
2688   KMP_DEBUG_ASSERT(__kmp_init_serial);
2689 
2690   // validate max_active_levels
2691   if (max_active_levels < 0) {
2692     KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2693     // We ignore this call if the user has specified a negative value.
2694     // The current setting won't be changed. The last valid setting will be
2695     // used. A warning will be issued (if warnings are allowed as controlled by
2696     // the KMP_WARNINGS env var).
2697     KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2698                   "max_active_levels for thread %d = (%d)\n",
2699                   gtid, max_active_levels));
2700     return;
2701   }
2702   if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2703     // it's OK, the max_active_levels is within the valid range: [ 0;
2704     // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2705     // We allow a zero value. (implementation defined behavior)
2706   } else {
2707     KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2708                 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2709     max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
    // The current upper limit is MAX_INT. (implementation-defined behavior)
    // If the input exceeds the upper limit, we correct the input to the upper
    // limit. (implementation-defined behavior)
    // Actually, control should never get here while the limit is MAX_INT.
2714   }
2715   KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2716                 "max_active_levels for thread %d = (%d)\n",
2717                 gtid, max_active_levels));
2718 
2719   thread = __kmp_threads[gtid];
2720 
2721   __kmp_save_internal_controls(thread);
2722 
2723   set__max_active_levels(thread, max_active_levels);
2724 }
2725 
2726 /* Gets max_active_levels */
2727 int __kmp_get_max_active_levels(int gtid) {
2728   kmp_info_t *thread;
2729 
2730   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2731   KMP_DEBUG_ASSERT(__kmp_init_serial);
2732 
2733   thread = __kmp_threads[gtid];
2734   KMP_DEBUG_ASSERT(thread->th.th_current_task);
2735   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2736                 "curtask_maxaclevel=%d\n",
2737                 gtid, thread->th.th_current_task,
2738                 thread->th.th_current_task->td_icvs.max_active_levels));
2739   return thread->th.th_current_task->td_icvs.max_active_levels;
2740 }
2741 
2742 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2743 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2744   kmp_info_t *thread;
2745   //    kmp_team_t *team;
2746 
2747   KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2748                 gtid, (int)kind, chunk));
2749   KMP_DEBUG_ASSERT(__kmp_init_serial);
2750 
2751   // Check if the kind parameter is valid, correct if needed.
2752   // Valid parameters should fit in one of two intervals - standard or extended:
2753   //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2754   // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
2755   if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2756       (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2757     // TODO: Hint needs attention in case we change the default schedule.
2758     __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2759               KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2760               __kmp_msg_null);
2761     kind = kmp_sched_default;
2762     chunk = 0; // ignore chunk value in case of bad kind
2763   }
2764 
2765   thread = __kmp_threads[gtid];
2766 
2767   __kmp_save_internal_controls(thread);
2768 
2769   if (kind < kmp_sched_upper_std) {
2770     if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
      // differentiate static chunked vs. unchunked: chunk should be invalid to
      // indicate an unchunked schedule (which is the default)
2773       thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2774     } else {
2775       thread->th.th_current_task->td_icvs.sched.r_sched_type =
2776           __kmp_sch_map[kind - kmp_sched_lower - 1];
2777     }
2778   } else {
2779     //    __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2780     //    kmp_sched_lower - 2 ];
2781     thread->th.th_current_task->td_icvs.sched.r_sched_type =
2782         __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2783                       kmp_sched_lower - 2];
2784   }
2785   if (kind == kmp_sched_auto || chunk < 1) {
2786     // ignore parameter chunk for schedule auto
2787     thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2788   } else {
2789     thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2790   }
2791 }
2792 
2793 /* Gets def_sched_var ICV values */
2794 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2795   kmp_info_t *thread;
2796   enum sched_type th_type;
2797 
2798   KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2799   KMP_DEBUG_ASSERT(__kmp_init_serial);
2800 
2801   thread = __kmp_threads[gtid];
2802 
2803   th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2804 
2805   switch (th_type) {
2806   case kmp_sch_static:
2807   case kmp_sch_static_greedy:
2808   case kmp_sch_static_balanced:
2809     *kind = kmp_sched_static;
    *chunk = 0; // chunk was not set; indicate this with a zero value
2811     return;
2812   case kmp_sch_static_chunked:
2813     *kind = kmp_sched_static;
2814     break;
2815   case kmp_sch_dynamic_chunked:
2816     *kind = kmp_sched_dynamic;
2817     break;
2818   case kmp_sch_guided_chunked:
2819   case kmp_sch_guided_iterative_chunked:
2820   case kmp_sch_guided_analytical_chunked:
2821     *kind = kmp_sched_guided;
2822     break;
2823   case kmp_sch_auto:
2824     *kind = kmp_sched_auto;
2825     break;
2826   case kmp_sch_trapezoidal:
2827     *kind = kmp_sched_trapezoidal;
2828     break;
2829 #if KMP_STATIC_STEAL_ENABLED
2830   case kmp_sch_static_steal:
2831     *kind = kmp_sched_static_steal;
2832     break;
2833 #endif
2834   default:
2835     KMP_FATAL(UnknownSchedulingType, th_type);
2836   }
2837 
2838   *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2839 }
2840 
2841 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2842 
2843   int ii, dd;
2844   kmp_team_t *team;
2845   kmp_info_t *thr;
2846 
2847   KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2848   KMP_DEBUG_ASSERT(__kmp_init_serial);
2849 
2850   // validate level
2851   if (level == 0)
2852     return 0;
2853   if (level < 0)
2854     return -1;
2855   thr = __kmp_threads[gtid];
2856   team = thr->th.th_team;
2857   ii = team->t.t_level;
2858   if (level > ii)
2859     return -1;
2860 
2861 #if OMP_40_ENABLED
2862   if (thr->th.th_teams_microtask) {
    // AC: we are in a teams region where nested teams share the same level
2864     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2865     if (level <=
2866         tlevel) { // otherwise usual algorithm works (will not touch the teams)
2867       KMP_DEBUG_ASSERT(ii >= tlevel);
      // AC: As we need to pass through the teams league, we artificially
      // increase ii
2870       if (ii == tlevel) {
2871         ii += 2; // three teams have same level
2872       } else {
2873         ii++; // two teams have same level
2874       }
2875     }
2876   }
2877 #endif
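  // Note (added for clarity, not in the original source): if the requested
  // level is the current one, the thread's own tid is returned just below;
  // otherwise the loop walks toward ancestor teams, where each serialized
  // nesting level counts as one level, and the ancestor's thread number is
  // its master tid (or 0 if that ancestor is still a serialized team).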
2878 
2879   if (ii == level)
2880     return __kmp_tid_from_gtid(gtid);
2881 
2882   dd = team->t.t_serialized;
2883   level++;
2884   while (ii > level) {
2885     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2886     }
2887     if ((team->t.t_serialized) && (!dd)) {
2888       team = team->t.t_parent;
2889       continue;
2890     }
2891     if (ii > level) {
2892       team = team->t.t_parent;
2893       dd = team->t.t_serialized;
2894       ii--;
2895     }
2896   }
2897 
2898   return (dd > 1) ? (0) : (team->t.t_master_tid);
2899 }
2900 
2901 int __kmp_get_team_size(int gtid, int level) {
2902 
2903   int ii, dd;
2904   kmp_team_t *team;
2905   kmp_info_t *thr;
2906 
2907   KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2908   KMP_DEBUG_ASSERT(__kmp_init_serial);
2909 
2910   // validate level
2911   if (level == 0)
2912     return 1;
2913   if (level < 0)
2914     return -1;
2915   thr = __kmp_threads[gtid];
2916   team = thr->th.th_team;
2917   ii = team->t.t_level;
2918   if (level > ii)
2919     return -1;
2920 
2921 #if OMP_40_ENABLED
2922   if (thr->th.th_teams_microtask) {
    // AC: we are in a teams region where nested teams share the same level
2924     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2925     if (level <=
2926         tlevel) { // otherwise usual algorithm works (will not touch the teams)
2927       KMP_DEBUG_ASSERT(ii >= tlevel);
      // AC: As we need to pass through the teams league, we artificially
      // increase ii
2930       if (ii == tlevel) {
2931         ii += 2; // three teams have same level
2932       } else {
2933         ii++; // two teams have same level
2934       }
2935     }
2936   }
2937 #endif
2938 
2939   while (ii > level) {
2940     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2941     }
2942     if (team->t.t_serialized && (!dd)) {
2943       team = team->t.t_parent;
2944       continue;
2945     }
2946     if (ii > level) {
2947       team = team->t.t_parent;
2948       ii--;
2949     }
2950   }
2951 
2952   return team->t.t_nproc;
2953 }
2954 
2955 kmp_r_sched_t __kmp_get_schedule_global() {
  // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
  // independently, so the updated schedule can be obtained here.
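  // Illustrative note (derived from the code below): if __kmp_sched still
  // holds plain kmp_sch_static while KMP_SCHEDULE has selected the detailed
  // flavor kept in __kmp_static (balanced or greedy), this routine returns
  // that detailed flavor together with the current __kmp_chunk as one r_sched.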
2959 
2960   kmp_r_sched_t r_sched;
2961 
  // Create the schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
  // __kmp_guided. __kmp_sched should keep its original value, so that the user
  // can set KMP_SCHEDULE multiple times and thus have different run-time
  // schedules in different roots (even in OMP 2.5).
2966   if (__kmp_sched == kmp_sch_static) {
2967     // replace STATIC with more detailed schedule (balanced or greedy)
2968     r_sched.r_sched_type = __kmp_static;
2969   } else if (__kmp_sched == kmp_sch_guided_chunked) {
2970     // replace GUIDED with more detailed schedule (iterative or analytical)
2971     r_sched.r_sched_type = __kmp_guided;
2972   } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2973     r_sched.r_sched_type = __kmp_sched;
2974   }
2975 
2976   if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
    // __kmp_chunk may be wrong here (if it was never set)
2978     r_sched.chunk = KMP_DEFAULT_CHUNK;
2979   } else {
2980     r_sched.chunk = __kmp_chunk;
2981   }
2982 
2983   return r_sched;
2984 }
2985 
/* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
2987    at least argc number of *t_argv entries for the requested team. */
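/* Illustrative sizing (derived from the code below): an argc that fits within
   KMP_INLINE_ARGV_ENTRIES reuses the inline space in the team structure;
   otherwise the heap array holds either KMP_MIN_MALLOC_ARGV_ENTRIES or
   2 * argc entries, whichever the size check below selects. */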
2988 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
2989 
2990   KMP_DEBUG_ASSERT(team);
2991   if (!realloc || argc > team->t.t_max_argc) {
2992 
2993     KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
2994                    "current entries=%d\n",
2995                    team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
2996     /* if previously allocated heap space for args, free them */
2997     if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
2998       __kmp_free((void *)team->t.t_argv);
2999 
3000     if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3001       /* use unused space in the cache line for arguments */
3002       team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3003       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3004                      "argv entries\n",
3005                      team->t.t_id, team->t.t_max_argc));
3006       team->t.t_argv = &team->t.t_inline_argv[0];
3007       if (__kmp_storage_map) {
3008         __kmp_print_storage_map_gtid(
3009             -1, &team->t.t_inline_argv[0],
3010             &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3011             (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3012             team->t.t_id);
3013       }
3014     } else {
3015       /* allocate space for arguments in the heap */
3016       team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3017                                ? KMP_MIN_MALLOC_ARGV_ENTRIES
3018                                : 2 * argc;
3019       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3020                      "argv entries\n",
3021                      team->t.t_id, team->t.t_max_argc));
3022       team->t.t_argv =
3023           (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3024       if (__kmp_storage_map) {
3025         __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3026                                      &team->t.t_argv[team->t.t_max_argc],
3027                                      sizeof(void *) * team->t.t_max_argc,
3028                                      "team_%d.t_argv", team->t.t_id);
3029       }
3030     }
3031   }
3032 }
3033 
3034 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3035   int i;
3036   int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3037   team->t.t_threads =
3038       (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3039   team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3040       sizeof(dispatch_shared_info_t) * num_disp_buff);
3041   team->t.t_dispatch =
3042       (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3043   team->t.t_implicit_task_taskdata =
3044       (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3045   team->t.t_max_nproc = max_nth;
3046 
3047   /* setup dispatch buffers */
3048   for (i = 0; i < num_disp_buff; ++i) {
3049     team->t.t_disp_buffer[i].buffer_index = i;
3050 #if OMP_45_ENABLED
3051     team->t.t_disp_buffer[i].doacross_buf_idx = i;
3052 #endif
3053   }
3054 }
3055 
3056 static void __kmp_free_team_arrays(kmp_team_t *team) {
3057   /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3058   int i;
3059   for (i = 0; i < team->t.t_max_nproc; ++i) {
3060     if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3061       __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3062       team->t.t_dispatch[i].th_disp_buffer = NULL;
3063     }
3064   }
3065   __kmp_free(team->t.t_threads);
3066   __kmp_free(team->t.t_disp_buffer);
3067   __kmp_free(team->t.t_dispatch);
3068   __kmp_free(team->t.t_implicit_task_taskdata);
3069   team->t.t_threads = NULL;
3070   team->t.t_disp_buffer = NULL;
3071   team->t.t_dispatch = NULL;
3072   team->t.t_implicit_task_taskdata = 0;
3073 }
3074 
3075 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3076   kmp_info_t **oldThreads = team->t.t_threads;
3077 
3078   __kmp_free(team->t.t_disp_buffer);
3079   __kmp_free(team->t.t_dispatch);
3080   __kmp_free(team->t.t_implicit_task_taskdata);
3081   __kmp_allocate_team_arrays(team, max_nth);
3082 
3083   KMP_MEMCPY(team->t.t_threads, oldThreads,
3084              team->t.t_nproc * sizeof(kmp_info_t *));
3085 
3086   __kmp_free(oldThreads);
3087 }
3088 
3089 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3090 
3091   kmp_r_sched_t r_sched =
3092       __kmp_get_schedule_global(); // get current state of scheduling globals
3093 
3094 #if OMP_40_ENABLED
3095   KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3096 #endif /* OMP_40_ENABLED */
3097 
3098   kmp_internal_control_t g_icvs = {
3099     0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3100     (kmp_int8)__kmp_dflt_nested, // int nested; //internal control
3101     // for nested parallelism (per thread)
3102     (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3103     // adjustment of threads (per thread)
3104     (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3105     // whether blocktime is explicitly set
3106     __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3107 #if KMP_USE_MONITOR
3108     __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3109 // intervals
3110 #endif
3111     __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3112     // next parallel region (per thread)
3113     // (use a max ub on value if __kmp_parallel_initialize not called yet)
3114     __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3115     // for max_active_levels
3116     r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3117 // {sched,chunk} pair
3118 #if OMP_40_ENABLED
3119     __kmp_nested_proc_bind.bind_types[0],
3120     __kmp_default_device,
3121 #endif /* OMP_40_ENABLED */
3122     NULL // struct kmp_internal_control *next;
3123   };
3124 
3125   return g_icvs;
3126 }
3127 
3128 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3129 
3130   kmp_internal_control_t gx_icvs;
3131   gx_icvs.serial_nesting_level =
3132       0; // probably =team->t.t_serial like in save_inter_controls
3133   copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3134   gx_icvs.next = NULL;
3135 
3136   return gx_icvs;
3137 }
3138 
3139 static void __kmp_initialize_root(kmp_root_t *root) {
3140   int f;
3141   kmp_team_t *root_team;
3142   kmp_team_t *hot_team;
3143   int hot_team_max_nth;
3144   kmp_r_sched_t r_sched =
3145       __kmp_get_schedule_global(); // get current state of scheduling globals
3146   kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3147   KMP_DEBUG_ASSERT(root);
3148   KMP_ASSERT(!root->r.r_begin);
3149 
3150   /* setup the root state structure */
3151   __kmp_init_lock(&root->r.r_begin_lock);
3152   root->r.r_begin = FALSE;
3153   root->r.r_active = FALSE;
3154   root->r.r_in_parallel = 0;
3155   root->r.r_blocktime = __kmp_dflt_blocktime;
3156   root->r.r_nested = __kmp_dflt_nested;
3157   root->r.r_cg_nthreads = 1;
3158 
3159   /* setup the root team for this task */
3160   /* allocate the root team structure */
3161   KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3162 
3163   root_team =
3164       __kmp_allocate_team(root,
3165                           1, // new_nproc
3166                           1, // max_nproc
3167 #if OMPT_SUPPORT
3168                           ompt_data_none, // root parallel id
3169 #endif
3170 #if OMP_40_ENABLED
3171                           __kmp_nested_proc_bind.bind_types[0],
3172 #endif
3173                           &r_icvs,
3174                           0 // argc
3175                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3176                           );
3177 #if USE_DEBUGGER
3178   // Non-NULL value should be assigned to make the debugger display the root
3179   // team.
3180   TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3181 #endif
3182 
3183   KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3184 
3185   root->r.r_root_team = root_team;
3186   root_team->t.t_control_stack_top = NULL;
3187 
3188   /* initialize root team */
3189   root_team->t.t_threads[0] = NULL;
3190   root_team->t.t_nproc = 1;
3191   root_team->t.t_serialized = 1;
3192   // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3193   root_team->t.t_sched.sched = r_sched.sched;
3194   KA_TRACE(
3195       20,
3196       ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3197        root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3198 
3199   /* setup the  hot team for this task */
3200   /* allocate the hot team structure */
3201   KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3202 
3203   hot_team =
3204       __kmp_allocate_team(root,
3205                           1, // new_nproc
3206                           __kmp_dflt_team_nth_ub * 2, // max_nproc
3207 #if OMPT_SUPPORT
3208                           ompt_data_none, // root parallel id
3209 #endif
3210 #if OMP_40_ENABLED
3211                           __kmp_nested_proc_bind.bind_types[0],
3212 #endif
3213                           &r_icvs,
3214                           0 // argc
3215                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3216                           );
3217   KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3218 
3219   root->r.r_hot_team = hot_team;
3220   root_team->t.t_control_stack_top = NULL;
3221 
3222   /* first-time initialization */
3223   hot_team->t.t_parent = root_team;
3224 
3225   /* initialize hot team */
3226   hot_team_max_nth = hot_team->t.t_max_nproc;
3227   for (f = 0; f < hot_team_max_nth; ++f) {
3228     hot_team->t.t_threads[f] = NULL;
3229   }
3230   hot_team->t.t_nproc = 1;
3231   // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3232   hot_team->t.t_sched.sched = r_sched.sched;
3233   hot_team->t.t_size_changed = 0;
3234 }
3235 
3236 #ifdef KMP_DEBUG
3237 
3238 typedef struct kmp_team_list_item {
3239   kmp_team_p const *entry;
3240   struct kmp_team_list_item *next;
3241 } kmp_team_list_item_t;
3242 typedef kmp_team_list_item_t *kmp_team_list_t;
3243 
3244 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3245     kmp_team_list_t list, // List of teams.
3246     kmp_team_p const *team // Team to add.
3247     ) {
3248 
3249   // List must terminate with item where both entry and next are NULL.
3250   // Team is added to the list only once.
3251   // List is sorted in ascending order by team id.
3252   // Team id is *not* a key.
3253 
3254   kmp_team_list_t l;
3255 
3256   KMP_DEBUG_ASSERT(list != NULL);
3257   if (team == NULL) {
3258     return;
3259   }
3260 
3261   __kmp_print_structure_team_accum(list, team->t.t_parent);
3262   __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3263 
3264   // Search list for the team.
3265   l = list;
3266   while (l->next != NULL && l->entry != team) {
3267     l = l->next;
3268   }
3269   if (l->next != NULL) {
3270     return; // Team has been added before, exit.
3271   }
3272 
3273   // Team is not found. Search list again for insertion point.
3274   l = list;
3275   while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3276     l = l->next;
3277   }
3278 
3279   // Insert team.
3280   {
3281     kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3282         sizeof(kmp_team_list_item_t));
3283     *item = *l;
3284     l->entry = team;
3285     l->next = item;
3286   }
3287 }
3288 
static void __kmp_print_structure_team(char const *title,
                                       kmp_team_p const *team) {
3292   __kmp_printf("%s", title);
3293   if (team != NULL) {
3294     __kmp_printf("%2x %p\n", team->t.t_id, team);
3295   } else {
3296     __kmp_printf(" - (nil)\n");
3297   }
3298 }
3299 
3300 static void __kmp_print_structure_thread(char const *title,
3301                                          kmp_info_p const *thread) {
3302   __kmp_printf("%s", title);
3303   if (thread != NULL) {
3304     __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3305   } else {
3306     __kmp_printf(" - (nil)\n");
3307   }
3308 }
3309 
3310 void __kmp_print_structure(void) {
3311 
3312   kmp_team_list_t list;
3313 
3314   // Initialize list of teams.
3315   list =
3316       (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3317   list->entry = NULL;
3318   list->next = NULL;
3319 
3320   __kmp_printf("\n------------------------------\nGlobal Thread "
3321                "Table\n------------------------------\n");
3322   {
3323     int gtid;
3324     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3325       __kmp_printf("%2d", gtid);
3326       if (__kmp_threads != NULL) {
3327         __kmp_printf(" %p", __kmp_threads[gtid]);
3328       }
3329       if (__kmp_root != NULL) {
3330         __kmp_printf(" %p", __kmp_root[gtid]);
3331       }
3332       __kmp_printf("\n");
3333     }
3334   }
3335 
3336   // Print out __kmp_threads array.
3337   __kmp_printf("\n------------------------------\nThreads\n--------------------"
3338                "----------\n");
3339   if (__kmp_threads != NULL) {
3340     int gtid;
3341     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3342       kmp_info_t const *thread = __kmp_threads[gtid];
3343       if (thread != NULL) {
3344         __kmp_printf("GTID %2d %p:\n", gtid, thread);
3345         __kmp_printf("    Our Root:        %p\n", thread->th.th_root);
3346         __kmp_print_structure_team("    Our Team:     ", thread->th.th_team);
3347         __kmp_print_structure_team("    Serial Team:  ",
3348                                    thread->th.th_serial_team);
3349         __kmp_printf("    Threads:      %2d\n", thread->th.th_team_nproc);
3350         __kmp_print_structure_thread("    Master:       ",
3351                                      thread->th.th_team_master);
3352         __kmp_printf("    Serialized?:  %2d\n", thread->th.th_team_serialized);
3353         __kmp_printf("    Set NProc:    %2d\n", thread->th.th_set_nproc);
3354 #if OMP_40_ENABLED
3355         __kmp_printf("    Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3356 #endif
3357         __kmp_print_structure_thread("    Next in pool: ",
3358                                      thread->th.th_next_pool);
3359         __kmp_printf("\n");
3360         __kmp_print_structure_team_accum(list, thread->th.th_team);
3361         __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3362       }
3363     }
3364   } else {
3365     __kmp_printf("Threads array is not allocated.\n");
3366   }
3367 
3368   // Print out __kmp_root array.
3369   __kmp_printf("\n------------------------------\nUbers\n----------------------"
3370                "--------\n");
3371   if (__kmp_root != NULL) {
3372     int gtid;
3373     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3374       kmp_root_t const *root = __kmp_root[gtid];
3375       if (root != NULL) {
3376         __kmp_printf("GTID %2d %p:\n", gtid, root);
3377         __kmp_print_structure_team("    Root Team:    ", root->r.r_root_team);
3378         __kmp_print_structure_team("    Hot Team:     ", root->r.r_hot_team);
3379         __kmp_print_structure_thread("    Uber Thread:  ",
3380                                      root->r.r_uber_thread);
3381         __kmp_printf("    Active?:      %2d\n", root->r.r_active);
3382         __kmp_printf("    Nested?:      %2d\n", root->r.r_nested);
3383         __kmp_printf("    In Parallel:  %2d\n", root->r.r_in_parallel);
3384         __kmp_printf("\n");
3385         __kmp_print_structure_team_accum(list, root->r.r_root_team);
3386         __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3387       }
3388     }
3389   } else {
3390     __kmp_printf("Ubers array is not allocated.\n");
3391   }
3392 
3393   __kmp_printf("\n------------------------------\nTeams\n----------------------"
3394                "--------\n");
3395   while (list->next != NULL) {
3396     kmp_team_p const *team = list->entry;
3397     int i;
3398     __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3399     __kmp_print_structure_team("    Parent Team:      ", team->t.t_parent);
3400     __kmp_printf("    Master TID:       %2d\n", team->t.t_master_tid);
3401     __kmp_printf("    Max threads:      %2d\n", team->t.t_max_nproc);
3402     __kmp_printf("    Levels of serial: %2d\n", team->t.t_serialized);
3403     __kmp_printf("    Number threads:   %2d\n", team->t.t_nproc);
3404     for (i = 0; i < team->t.t_nproc; ++i) {
3405       __kmp_printf("    Thread %2d:      ", i);
3406       __kmp_print_structure_thread("", team->t.t_threads[i]);
3407     }
3408     __kmp_print_structure_team("    Next in pool:     ", team->t.t_next_pool);
3409     __kmp_printf("\n");
3410     list = list->next;
3411   }
3412 
3413   // Print out __kmp_thread_pool and __kmp_team_pool.
3414   __kmp_printf("\n------------------------------\nPools\n----------------------"
3415                "--------\n");
3416   __kmp_print_structure_thread("Thread pool:          ",
3417                                CCAST(kmp_info_t *, __kmp_thread_pool));
3418   __kmp_print_structure_team("Team pool:            ",
3419                              CCAST(kmp_team_t *, __kmp_team_pool));
3420   __kmp_printf("\n");
3421 
3422   // Free team list.
3423   while (list != NULL) {
3424     kmp_team_list_item_t *item = list;
3425     list = list->next;
3426     KMP_INTERNAL_FREE(item);
3427   }
3428 }
3429 
3430 #endif
3431 
3432 //---------------------------------------------------------------------------
3433 //  Stuff for per-thread fast random number generator
3434 //  Table of primes
3435 static const unsigned __kmp_primes[] = {
3436     0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3437     0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3438     0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3439     0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3440     0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3441     0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3442     0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3443     0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3444     0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3445     0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3446     0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3447 
3448 //---------------------------------------------------------------------------
3449 //  __kmp_get_random: Get a random number using a linear congruential method.
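//  A minimal sketch of the recurrence (added for clarity, not in the original
//  source): with the per-thread multiplier a = th_a and state x = th_x,
//  successive calls return the top 16 bits of x, then of x*a + 1, then of
//  (x*a + 1)*a + 1, and so on, with all arithmetic modulo 2^32 via unsigned
//  overflow.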
3450 unsigned short __kmp_get_random(kmp_info_t *thread) {
3451   unsigned x = thread->th.th_x;
3452   unsigned short r = x >> 16;
3453 
3454   thread->th.th_x = x * thread->th.th_a + 1;
3455 
3456   KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3457                 thread->th.th_info.ds.ds_tid, r));
3458 
3459   return r;
3460 }
3461 //--------------------------------------------------------
3462 // __kmp_init_random: Initialize a random number generator
3463 void __kmp_init_random(kmp_info_t *thread) {
3464   unsigned seed = thread->th.th_info.ds.ds_tid;
3465 
3466   thread->th.th_a =
3467       __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3468   thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3469   KA_TRACE(30,
3470            ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3471 }
3472 
3473 #if KMP_OS_WINDOWS
3474 /* reclaim array entries for root threads that are already dead, returns number
3475  * reclaimed */
3476 static int __kmp_reclaim_dead_roots(void) {
3477   int i, r = 0;
3478 
3479   for (i = 0; i < __kmp_threads_capacity; ++i) {
3480     if (KMP_UBER_GTID(i) &&
3481         !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3482         !__kmp_root[i]
3483              ->r.r_active) { // AC: reclaim only roots died in non-active state
3484       r += __kmp_unregister_root_other_thread(i);
3485     }
3486   }
3487   return r;
3488 }
3489 #endif
3490 
3491 /* This function attempts to create free entries in __kmp_threads and
3492    __kmp_root, and returns the number of free entries generated.
3493 
3494    For Windows* OS static library, the first mechanism used is to reclaim array
3495    entries for root threads that are already dead.
3496 
   On all platforms, expansion is attempted on the arrays __kmp_threads and
3498    __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3499    capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3500    threadprivate cache array has been created. Synchronization with
3501    __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3502 
3503    After any dead root reclamation, if the clipping value allows array expansion
3504    to result in the generation of a total of nNeed free slots, the function does
3505    that expansion. If not, nothing is done beyond the possible initial root
3506    thread reclamation.
3507 
3508    If any argument is negative, the behavior is undefined. */
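/* Illustrative expansion (derived from the loop below): with a current
   capacity of 64 and nNeed == 100, minimumRequiredCapacity is 164, so
   newCapacity doubles 64 -> 128 -> 256 (clipped to __kmp_actual_max_nth if
   that is smaller) before the arrays are copied into the new storage. */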
3509 static int __kmp_expand_threads(int nNeed) {
3510   int added = 0;
3511   int old_tp_cached;
3512   int __kmp_actual_max_nth;
3513 
3514 #if KMP_OS_WINDOWS && !defined KMP_DYNAMIC_LIB
3515   /* only for Windows static library */
3516   /* reclaim array entries for root threads that are already dead */
3517   added = __kmp_reclaim_dead_roots();
3518 
3519   if (nNeed) {
3520     nNeed -= added;
3521     if (nNeed < 0)
3522       nNeed = 0;
3523   }
3524 #endif
3525   if (nNeed <= 0)
3526     return added;
3527 
3528   while (1) {
3529     int nTarget;
3530     int minimumRequiredCapacity;
3531     int newCapacity;
3532     kmp_info_t **newThreads;
3533     kmp_root_t **newRoot;
3534 
3535     // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3536     // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3537     // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3538     // > __kmp_max_nth in one of two ways:
3539     //
3540     // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
    //    may not be reused by another thread, so we may need to increase
3542     //    __kmp_threads_capacity to __kmp_max_nth + 1.
3543     //
3544     // 2) New foreign root(s) are encountered.  We always register new foreign
3545     //    roots. This may cause a smaller # of threads to be allocated at
3546     //    subsequent parallel regions, but the worker threads hang around (and
3547     //    eventually go to sleep) and need slots in the __kmp_threads[] array.
3548     //
3549     // Anyway, that is the reason for moving the check to see if
3550     // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3551     // instead of having it performed here. -BB
3552     old_tp_cached = __kmp_tp_cached;
3553     __kmp_actual_max_nth =
3554         old_tp_cached ? __kmp_tp_capacity : __kmp_sys_max_nth;
3555     KMP_DEBUG_ASSERT(__kmp_actual_max_nth >= __kmp_threads_capacity);
3556 
3557     /* compute expansion headroom to check if we can expand */
3558     nTarget = nNeed;
3559     if (__kmp_actual_max_nth - __kmp_threads_capacity < nTarget) {
3560       /* possible expansion too small -- give up */
3561       break;
3562     }
3563     minimumRequiredCapacity = __kmp_threads_capacity + nTarget;
3564 
3565     newCapacity = __kmp_threads_capacity;
3566     do {
3567       newCapacity = newCapacity <= (__kmp_actual_max_nth >> 1)
3568                         ? (newCapacity << 1)
3569                         : __kmp_actual_max_nth;
3570     } while (newCapacity < minimumRequiredCapacity);
3571     newThreads = (kmp_info_t **)__kmp_allocate(
3572         (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity +
3573         CACHE_LINE);
3574     newRoot = (kmp_root_t **)((char *)newThreads +
3575                               sizeof(kmp_info_t *) * newCapacity);
3576     KMP_MEMCPY(newThreads, __kmp_threads,
3577                __kmp_threads_capacity * sizeof(kmp_info_t *));
3578     KMP_MEMCPY(newRoot, __kmp_root,
3579                __kmp_threads_capacity * sizeof(kmp_root_t *));
3580     memset(newThreads + __kmp_threads_capacity, 0,
3581            (newCapacity - __kmp_threads_capacity) * sizeof(kmp_info_t *));
3582     memset(newRoot + __kmp_threads_capacity, 0,
3583            (newCapacity - __kmp_threads_capacity) * sizeof(kmp_root_t *));
3584 
3585     if (!old_tp_cached && __kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3586       /* __kmp_tp_cached has changed, i.e. __kmpc_threadprivate_cached has
3587          allocated a threadprivate cache while we were allocating the expanded
3588          array, and our new capacity is larger than the threadprivate cache
3589          capacity, so we should deallocate the expanded arrays and try again.
3590          This is the first check of a double-check pair. */
3591       __kmp_free(newThreads);
3592       continue; /* start over and try again */
3593     }
3594     __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3595     if (!old_tp_cached && __kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3596       /* Same check as above, but this time with the lock so we can be sure if
3597          we can succeed. */
3598       __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3599       __kmp_free(newThreads);
3600       continue; /* start over and try again */
3601     } else {
3602       /* success */
3603       // __kmp_free( __kmp_threads ); // ATT: It leads to crash. Need to be
3604       // investigated.
3605       *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3606       *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3607       added += newCapacity - __kmp_threads_capacity;
3608       *(volatile int *)&__kmp_threads_capacity = newCapacity;
3609       __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3610       break; /* succeeded, so we can exit the loop */
3611     }
3612   }
3613   return added;
3614 }
3615 
3616 /* Register the current thread as a root thread and obtain our gtid. We must
   have the __kmp_initz_lock held at this point. Argument TRUE only if we are
   the thread that calls from __kmp_do_serial_initialize() */
3619 int __kmp_register_root(int initial_thread) {
3620   kmp_info_t *root_thread;
3621   kmp_root_t *root;
3622   int gtid;
3623   int capacity;
3624   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3625   KA_TRACE(20, ("__kmp_register_root: entered\n"));
3626   KMP_MB();
3627 
3628   /* 2007-03-02:
3629      If initial thread did not invoke OpenMP RTL yet, and this thread is not an
3630      initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not
3631      work as expected -- it may return false (that means there is at least one
3632      empty slot in __kmp_threads array), but it is possible the only free slot
3633      is #0, which is reserved for initial thread and so cannot be used for this
3634      one. Following code workarounds this bug.
3635 
3636      However, right solution seems to be not reserving slot #0 for initial
3637      thread because:
3638      (1) there is no magic in slot #0,
3639      (2) we cannot detect initial thread reliably (the first thread which does
3640         serial initialization may be not a real initial thread).
3641   */
3642   capacity = __kmp_threads_capacity;
3643   if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3644     --capacity;
3645   }
3646 
3647   /* see if there are too many threads */
3648   if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3649     if (__kmp_tp_cached) {
3650       __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3651                   KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3652                   KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3653     } else {
3654       __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3655                   __kmp_msg_null);
3656     }
3657   }
3658 
3659   /* find an available thread slot */
  /* Don't reassign the zero slot since we need that to only be used by the
     initial thread */
3662   for (gtid = (initial_thread ? 0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL;
3663        gtid++)
3664     ;
3665   KA_TRACE(1,
3666            ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3667   KMP_ASSERT(gtid < __kmp_threads_capacity);
3668 
3669   /* update global accounting */
3670   __kmp_all_nth++;
3671   TCW_4(__kmp_nth, __kmp_nth + 1);
3672 
3673   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3674   // numbers of procs, and method #2 (keyed API call) for higher numbers.
3675   if (__kmp_adjust_gtid_mode) {
3676     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3677       if (TCR_4(__kmp_gtid_mode) != 2) {
3678         TCW_4(__kmp_gtid_mode, 2);
3679       }
3680     } else {
3681       if (TCR_4(__kmp_gtid_mode) != 1) {
3682         TCW_4(__kmp_gtid_mode, 1);
3683       }
3684     }
3685   }
3686 
3687 #ifdef KMP_ADJUST_BLOCKTIME
3688   /* Adjust blocktime to zero if necessary            */
3689   /* Middle initialization might not have occurred yet */
3690   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3691     if (__kmp_nth > __kmp_avail_proc) {
3692       __kmp_zero_bt = TRUE;
3693     }
3694   }
3695 #endif /* KMP_ADJUST_BLOCKTIME */
3696 
3697   /* setup this new hierarchy */
3698   if (!(root = __kmp_root[gtid])) {
3699     root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3700     KMP_DEBUG_ASSERT(!root->r.r_root_team);
3701   }
3702 
3703 #if KMP_STATS_ENABLED
3704   // Initialize stats as soon as possible (right after gtid assignment).
3705   __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3706   KMP_START_EXPLICIT_TIMER(OMP_worker_thread_life);
3707   KMP_SET_THREAD_STATE(SERIAL_REGION);
3708   KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3709 #endif
3710   __kmp_initialize_root(root);
3711 
3712   /* setup new root thread structure */
3713   if (root->r.r_uber_thread) {
3714     root_thread = root->r.r_uber_thread;
3715   } else {
3716     root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3717     if (__kmp_storage_map) {
3718       __kmp_print_thread_storage_map(root_thread, gtid);
3719     }
3720     root_thread->th.th_info.ds.ds_gtid = gtid;
3721 #if OMPT_SUPPORT
3722     root_thread->th.ompt_thread_info.thread_data.ptr = NULL;
3723 #endif
3724     root_thread->th.th_root = root;
3725     if (__kmp_env_consistency_check) {
3726       root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3727     }
3728 #if USE_FAST_MEMORY
3729     __kmp_initialize_fast_memory(root_thread);
3730 #endif /* USE_FAST_MEMORY */
3731 
3732 #if KMP_USE_BGET
3733     KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3734     __kmp_initialize_bget(root_thread);
3735 #endif
3736     __kmp_init_random(root_thread); // Initialize random number generator
3737   }
3738 
3739   /* setup the serial team held in reserve by the root thread */
3740   if (!root_thread->th.th_serial_team) {
3741     kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3742     KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3743     root_thread->th.th_serial_team =
3744         __kmp_allocate_team(root, 1, 1,
3745 #if OMPT_SUPPORT
3746                             ompt_data_none, // root parallel id
3747 #endif
3748 #if OMP_40_ENABLED
3749                             proc_bind_default,
3750 #endif
3751                             &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3752   }
3753   KMP_ASSERT(root_thread->th.th_serial_team);
3754   KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3755                 root_thread->th.th_serial_team));
3756 
3757   /* drop root_thread into place */
3758   TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3759 
3760   root->r.r_root_team->t.t_threads[0] = root_thread;
3761   root->r.r_hot_team->t.t_threads[0] = root_thread;
3762   root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
  // AC: the team is created in reserve, not for execution (unused for now).
3764   root_thread->th.th_serial_team->t.t_serialized = 0;
3765   root->r.r_uber_thread = root_thread;
3766 
3767   /* initialize the thread, get it ready to go */
3768   __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3769   TCW_4(__kmp_init_gtid, TRUE);
3770 
3771   /* prepare the master thread for get_gtid() */
3772   __kmp_gtid_set_specific(gtid);
3773 
3774 #if USE_ITT_BUILD
3775   __kmp_itt_thread_name(gtid);
3776 #endif /* USE_ITT_BUILD */
3777 
3778 #ifdef KMP_TDATA_GTID
3779   __kmp_gtid = gtid;
3780 #endif
3781   __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3782   KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3783 
3784   KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3785                 "plain=%u\n",
3786                 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3787                 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3788                 KMP_INIT_BARRIER_STATE));
3789   { // Initialize barrier data.
3790     int b;
3791     for (b = 0; b < bs_last_barrier; ++b) {
3792       root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3793 #if USE_DEBUGGER
3794       root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3795 #endif
3796     }
3797   }
3798   KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3799                    KMP_INIT_BARRIER_STATE);
3800 
3801 #if KMP_AFFINITY_SUPPORTED
3802 #if OMP_40_ENABLED
3803   root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3804   root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3805   root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3806   root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3807 #endif
3808 
3809   if (TCR_4(__kmp_init_middle)) {
3810     __kmp_affinity_set_init_mask(gtid, TRUE);
3811   }
3812 #endif /* KMP_AFFINITY_SUPPORTED */
3813 
3814   __kmp_root_counter++;
3815 
3816 #if OMPT_SUPPORT
3817   if (!initial_thread && ompt_enabled.enabled) {
3818 
3819     ompt_thread_t *root_thread = ompt_get_thread();
3820 
3821     ompt_set_thread_state(root_thread, omp_state_overhead);
3822 
3823     if (ompt_enabled.ompt_callback_thread_begin) {
3824       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3825           ompt_thread_initial, __ompt_get_thread_data_internal());
3826     }
3827     ompt_data_t *task_data;
3828     __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
3829     if (ompt_enabled.ompt_callback_task_create) {
3830       ompt_callbacks.ompt_callback(ompt_callback_task_create)(
3831           NULL, NULL, task_data, ompt_task_initial, 0, NULL);
3832       // initial task has nothing to return to
3833     }
3834 
3835     ompt_set_thread_state(root_thread, omp_state_work_serial);
3836   }
3837 #endif
3838 
3839   KMP_MB();
3840   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3841 
3842   return gtid;
3843 }
3844 
3845 #if KMP_NESTED_HOT_TEAMS
3846 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3847                                 const int max_level) {
3848   int i, n, nth;
3849   kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3850   if (!hot_teams || !hot_teams[level].hot_team) {
3851     return 0;
3852   }
3853   KMP_DEBUG_ASSERT(level < max_level);
3854   kmp_team_t *team = hot_teams[level].hot_team;
3855   nth = hot_teams[level].hot_team_nth;
3856   n = nth - 1; // master is not freed
3857   if (level < max_level - 1) {
3858     for (i = 0; i < nth; ++i) {
3859       kmp_info_t *th = team->t.t_threads[i];
3860       n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3861       if (i > 0 && th->th.th_hot_teams) {
3862         __kmp_free(th->th.th_hot_teams);
3863         th->th.th_hot_teams = NULL;
3864       }
3865     }
3866   }
3867   __kmp_free_team(root, team, NULL);
3868   return n;
3869 }
3870 #endif
3871 
// Resets a root thread and clears its root and hot teams.
3873 // Returns the number of __kmp_threads entries directly and indirectly freed.
3874 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3875   kmp_team_t *root_team = root->r.r_root_team;
3876   kmp_team_t *hot_team = root->r.r_hot_team;
3877   int n = hot_team->t.t_nproc;
3878   int i;
3879 
3880   KMP_DEBUG_ASSERT(!root->r.r_active);
3881 
3882   root->r.r_root_team = NULL;
3883   root->r.r_hot_team = NULL;
3884   // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3885   // before call to __kmp_free_team().
3886   __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3887 #if KMP_NESTED_HOT_TEAMS
3888   if (__kmp_hot_teams_max_level >
3889       0) { // need to free nested hot teams and their threads if any
3890     for (i = 0; i < hot_team->t.t_nproc; ++i) {
3891       kmp_info_t *th = hot_team->t.t_threads[i];
3892       if (__kmp_hot_teams_max_level > 1) {
3893         n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3894       }
3895       if (th->th.th_hot_teams) {
3896         __kmp_free(th->th.th_hot_teams);
3897         th->th.th_hot_teams = NULL;
3898       }
3899     }
3900   }
3901 #endif
3902   __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3903 
3904   // Before we can reap the thread, we need to make certain that all other
  // threads in the teams that had this root as an ancestor have stopped
  // trying to steal tasks.
3907   if (__kmp_tasking_mode != tskm_immediate_exec) {
3908     __kmp_wait_to_unref_task_teams();
3909   }
3910 
3911 #if KMP_OS_WINDOWS
3912   /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3913   KA_TRACE(
3914       10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3915            "\n",
3916            (LPVOID) & (root->r.r_uber_thread->th),
3917            root->r.r_uber_thread->th.th_info.ds.ds_thread));
3918   __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3919 #endif /* KMP_OS_WINDOWS */
3920 
3921 #if OMPT_SUPPORT
3922   if (ompt_enabled.ompt_callback_thread_end) {
3923     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3924         &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3925   }
3926 #endif
3927 
3928   TCW_4(__kmp_nth,
3929         __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3930   root->r.r_cg_nthreads--;
3931 
3932   __kmp_reap_thread(root->r.r_uber_thread, 1);
3933 
  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
  // it instead of freeing it.
3936   root->r.r_uber_thread = NULL;
3937   /* mark root as no longer in use */
3938   root->r.r_begin = FALSE;
3939 
3940   return n;
3941 }
3942 
3943 void __kmp_unregister_root_current_thread(int gtid) {
3944   KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
3945   /* this lock should be ok, since unregister_root_current_thread is never
3946      called during an abort, only during a normal close. furthermore, if you
3947      have the forkjoin lock, you should never try to get the initz lock */
3948   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3949   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
3950     KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
3951                   "exiting T#%d\n",
3952                   gtid));
3953     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3954     return;
3955   }
3956   kmp_root_t *root = __kmp_root[gtid];
3957 
3958   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3959   KMP_ASSERT(KMP_UBER_GTID(gtid));
3960   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3961   KMP_ASSERT(root->r.r_active == FALSE);
3962 
3963   KMP_MB();
3964 
3965 #if OMP_45_ENABLED
3966   kmp_info_t *thread = __kmp_threads[gtid];
3967   kmp_team_t *team = thread->th.th_team;
3968   kmp_task_team_t *task_team = thread->th.th_task_team;
3969 
3970   // we need to wait for the proxy tasks before finishing the thread
3971   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
3972 #if OMPT_SUPPORT
3973     // the runtime is shutting down so we won't report any events
3974     thread->th.ompt_thread_info.state = omp_state_undefined;
3975 #endif
3976     __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
3977   }
3978 #endif
3979 
3980   __kmp_reset_root(gtid, root);
3981 
3982   /* free up this thread slot */
3983   __kmp_gtid_set_specific(KMP_GTID_DNE);
3984 #ifdef KMP_TDATA_GTID
3985   __kmp_gtid = KMP_GTID_DNE;
3986 #endif
3987 
3988   KMP_MB();
3989   KC_TRACE(10,
3990            ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
3991 
3992   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3993 }
3994 
3995 #if KMP_OS_WINDOWS
3996 /* __kmp_forkjoin_lock must be already held
3997    Unregisters a root thread that is not the current thread.  Returns the number
3998    of __kmp_threads entries freed as a result. */
3999 static int __kmp_unregister_root_other_thread(int gtid) {
4000   kmp_root_t *root = __kmp_root[gtid];
4001   int r;
4002 
4003   KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4004   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4005   KMP_ASSERT(KMP_UBER_GTID(gtid));
4006   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4007   KMP_ASSERT(root->r.r_active == FALSE);
4008 
4009   r = __kmp_reset_root(gtid, root);
4010   KC_TRACE(10,
4011            ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4012   return r;
4013 }
4014 #endif
4015 
4016 #if KMP_DEBUG
4017 void __kmp_task_info() {
4018 
4019   kmp_int32 gtid = __kmp_entry_gtid();
4020   kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4021   kmp_info_t *this_thr = __kmp_threads[gtid];
4022   kmp_team_t *steam = this_thr->th.th_serial_team;
4023   kmp_team_t *team = this_thr->th.th_team;
4024 
4025   __kmp_printf("__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p curtask=%p "
4026                "ptask=%p\n",
4027                gtid, tid, this_thr, team, this_thr->th.th_current_task,
4028                team->t.t_implicit_task_taskdata[tid].td_parent);
4029 }
4030 #endif // KMP_DEBUG
4031 
4032 /* TODO optimize with one big memclr, take out what isn't needed, split
4033    responsibility to workers as much as possible, and delay initialization of
4034    features as much as possible  */
4035 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4036                                   int tid, int gtid) {
  /* this_thr->th.th_info.ds.ds_gtid is set up in
     kmp_allocate_thread/create_worker.
     this_thr->th.th_serial_team is set up in __kmp_allocate_thread */
4040   kmp_info_t *master = team->t.t_threads[0];
4041   KMP_DEBUG_ASSERT(this_thr != NULL);
4042   KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4043   KMP_DEBUG_ASSERT(team);
4044   KMP_DEBUG_ASSERT(team->t.t_threads);
4045   KMP_DEBUG_ASSERT(team->t.t_dispatch);
4046   KMP_DEBUG_ASSERT(master);
4047   KMP_DEBUG_ASSERT(master->th.th_root);
4048 
4049   KMP_MB();
4050 
4051   TCW_SYNC_PTR(this_thr->th.th_team, team);
4052 
4053   this_thr->th.th_info.ds.ds_tid = tid;
4054   this_thr->th.th_set_nproc = 0;
4055   if (__kmp_tasking_mode != tskm_immediate_exec)
4056     // When tasking is possible, threads are not safe to reap until they are
4057     // done tasking; this will be set when tasking code is exited in wait
4058     this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4059   else // no tasking --> always safe to reap
4060     this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4061 #if OMP_40_ENABLED
4062   this_thr->th.th_set_proc_bind = proc_bind_default;
4063 #if KMP_AFFINITY_SUPPORTED
4064   this_thr->th.th_new_place = this_thr->th.th_current_place;
4065 #endif
4066 #endif
4067   this_thr->th.th_root = master->th.th_root;
4068 
4069   /* setup the thread's cache of the team structure */
4070   this_thr->th.th_team_nproc = team->t.t_nproc;
4071   this_thr->th.th_team_master = master;
4072   this_thr->th.th_team_serialized = team->t.t_serialized;
4073   TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4074 
4075   KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4076 
4077   KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4078                 tid, gtid, this_thr, this_thr->th.th_current_task));
4079 
4080   __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4081                            team, tid, TRUE);
4082 
4083   KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4084                 tid, gtid, this_thr, this_thr->th.th_current_task));
4085   // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4086   // __kmp_initialize_team()?
4087 
4088   /* TODO no worksharing in speculative threads */
4089   this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4090 
4091   this_thr->th.th_local.this_construct = 0;
4092 
4093   if (!this_thr->th.th_pri_common) {
4094     this_thr->th.th_pri_common =
4095         (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4096     if (__kmp_storage_map) {
4097       __kmp_print_storage_map_gtid(
4098           gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4099           sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4100     }
4101     this_thr->th.th_pri_head = NULL;
4102   }
4103 
4104   /* Initialize dynamic dispatch */
4105   {
4106     volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4107     // Use team max_nproc since this will never change for the team.
4108     size_t disp_size =
4109         sizeof(dispatch_private_info_t) *
4110         (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
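    // Note (added for clarity, not in the original source): a serialized team
    // (t_max_nproc == 1) needs only a single private dispatch buffer, while a
    // real team gets __kmp_dispatch_num_buffers of them so that consecutive
    // dynamically scheduled loops can use distinct buffers.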
4111     KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4112                   team->t.t_max_nproc));
4113     KMP_ASSERT(dispatch);
4114     KMP_DEBUG_ASSERT(team->t.t_dispatch);
4115     KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4116 
4117     dispatch->th_disp_index = 0;
4118 #if OMP_45_ENABLED
4119     dispatch->th_doacross_buf_idx = 0;
4120 #endif
4121     if (!dispatch->th_disp_buffer) {
4122       dispatch->th_disp_buffer =
4123           (dispatch_private_info_t *)__kmp_allocate(disp_size);
4124 
4125       if (__kmp_storage_map) {
4126         __kmp_print_storage_map_gtid(
4127             gtid, &dispatch->th_disp_buffer[0],
4128             &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4129                                           ? 1
4130                                           : __kmp_dispatch_num_buffers],
4131             disp_size, "th_%d.th_dispatch.th_disp_buffer "
4132                        "(team_%d.t_dispatch[%d].th_disp_buffer)",
4133             gtid, team->t.t_id, gtid);
4134       }
4135     } else {
4136       memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4137     }
4138 
4139     dispatch->th_dispatch_pr_current = 0;
4140     dispatch->th_dispatch_sh_current = 0;
4141 
4142     dispatch->th_deo_fcn = 0; /* ORDERED     */
4143     dispatch->th_dxo_fcn = 0; /* END ORDERED */
4144   }
4145 
4146   this_thr->th.th_next_pool = NULL;
4147 
4148   if (!this_thr->th.th_task_state_memo_stack) {
4149     size_t i;
4150     this_thr->th.th_task_state_memo_stack =
4151         (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4152     this_thr->th.th_task_state_top = 0;
4153     this_thr->th.th_task_state_stack_sz = 4;
4154     for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4155          ++i) // zero init the stack
4156       this_thr->th.th_task_state_memo_stack[i] = 0;
4157   }
4158 
4159   KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4160   KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4161 
4162   KMP_MB();
4163 }
4164 
/* Allocate a new thread for the requesting team. This is only called from
   within a forkjoin critical section. We first try to get an available thread
   from the thread pool. If none is available, we fork a new one; this should
   always succeed, since the caller is expected to have verified that a new
   thread can be created. */
4170 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4171                                   int new_tid) {
4172   kmp_team_t *serial_team;
4173   kmp_info_t *new_thr;
4174   int new_gtid;
4175 
4176   KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4177   KMP_DEBUG_ASSERT(root && team);
4178 #if !KMP_NESTED_HOT_TEAMS
4179   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4180 #endif
4181   KMP_MB();
4182 
4183   /* first, try to get one from the thread pool */
4184   if (__kmp_thread_pool) {
4185 
4186     new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4187     __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4188     if (new_thr == __kmp_thread_pool_insert_pt) {
4189       __kmp_thread_pool_insert_pt = NULL;
4190     }
4191     TCW_4(new_thr->th.th_in_pool, FALSE);
4192     // Don't touch th_active_in_pool or th_active.
4193     // The worker thread adjusts those flags as it sleeps/awakens.
4194     __kmp_thread_pool_nth--;
4195 
4196     KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4197                   __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4198     KMP_ASSERT(!new_thr->th.th_team);
4199     KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4200     KMP_DEBUG_ASSERT(__kmp_thread_pool_nth >= 0);
4201 
4202     /* setup the thread structure */
4203     __kmp_initialize_info(new_thr, team, new_tid,
4204                           new_thr->th.th_info.ds.ds_gtid);
4205     KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4206 
4207     TCW_4(__kmp_nth, __kmp_nth + 1);
4208     root->r.r_cg_nthreads++;
4209 
4210     new_thr->th.th_task_state = 0;
4211     new_thr->th.th_task_state_top = 0;
4212     new_thr->th.th_task_state_stack_sz = 4;
4213 
4214 #ifdef KMP_ADJUST_BLOCKTIME
4215     /* Adjust blocktime back to zero if necessary */
4216     /* Middle initialization might not have occurred yet */
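    /* If the user did not set a blocktime and the process is now
       oversubscribed (more runtime threads than available procs),
       __kmp_zero_bt forces an effective blocktime of zero so that idle
       threads go to sleep quickly instead of spinning. */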
4217     if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4218       if (__kmp_nth > __kmp_avail_proc) {
4219         __kmp_zero_bt = TRUE;
4220       }
4221     }
4222 #endif /* KMP_ADJUST_BLOCKTIME */
4223 
4224 #if KMP_DEBUG
4225     // If thread entered pool via __kmp_free_thread, wait_flag should !=
4226     // KMP_BARRIER_PARENT_FLAG.
4227     int b;
4228     kmp_balign_t *balign = new_thr->th.th_bar;
4229     for (b = 0; b < bs_last_barrier; ++b)
4230       KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4231 #endif
4232 
4233     KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4234                   __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4235 
4236     KMP_MB();
4237     return new_thr;
4238   }
4239 
  /* no, we'll fork a new one */
4241   KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4242   KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4243 
4244 #if KMP_USE_MONITOR
4245   // If this is the first worker thread the RTL is creating, then also
4246   // launch the monitor thread.  We try to do this as early as possible.
4247   if (!TCR_4(__kmp_init_monitor)) {
4248     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4249     if (!TCR_4(__kmp_init_monitor)) {
4250       KF_TRACE(10, ("before __kmp_create_monitor\n"));
4251       TCW_4(__kmp_init_monitor, 1);
4252       __kmp_create_monitor(&__kmp_monitor);
4253       KF_TRACE(10, ("after __kmp_create_monitor\n"));
4254 #if KMP_OS_WINDOWS
      // AC: wait until the monitor has started. This is a fix for CQ232808.
      // The reason is that if the library is loaded/unloaded in a loop with
      // small (parallel) work in between, then there is a high probability
      // that the monitor thread will only start after the library has shut
      // down. At shutdown it is too late to cope with the problem, because
      // when the master is in DllMain (process detach) the monitor has no
      // chance to start (it is blocked), and the master has no means to inform
      // the monitor that the library is gone, because all the memory the
      // monitor can access is going to be released/reset.
4264       while (TCR_4(__kmp_init_monitor) < 2) {
4265         KMP_YIELD(TRUE);
4266       }
4267       KF_TRACE(10, ("after monitor thread has started\n"));
4268 #endif
4269     }
4270     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4271   }
4272 #endif
4273 
4274   KMP_MB();
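  /* Find the lowest unused gtid slot by scanning __kmp_threads linearly,
     starting at 1 (slot 0 belongs to the initial root thread). The capacity
     checks above and in the loop guarantee a free slot is found. */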
4275   for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) {
4276     KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4277   }
4278 
4279   /* allocate space for it. */
4280   new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4281 
4282   TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4283 
4284   if (__kmp_storage_map) {
4285     __kmp_print_thread_storage_map(new_thr, new_gtid);
4286   }
4287 
4288   // add the reserve serialized team, initialized from the team's master thread
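  // Each worker owns a reserve serialized team (th_serial_team) so that a
  // serialized parallel region it encounters later does not need to allocate
  // a team on the fly.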
4289   {
4290     kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4291     KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4292     new_thr->th.th_serial_team = serial_team =
4293         (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4294 #if OMPT_SUPPORT
4295                                           ompt_data_none, // root parallel id
4296 #endif
4297 #if OMP_40_ENABLED
4298                                           proc_bind_default,
4299 #endif
4300                                           &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
4301   }
4302   KMP_ASSERT(serial_team);
  serial_team->t.t_serialized = 0; // AC: the team is created in reserve,
  // not for execution (it is unused for now).
4305   serial_team->t.t_threads[0] = new_thr;
4306   KF_TRACE(10,
4307            ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4308             new_thr));
4309 
4310   /* setup the thread structures */
4311   __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4312 
4313 #if USE_FAST_MEMORY
4314   __kmp_initialize_fast_memory(new_thr);
4315 #endif /* USE_FAST_MEMORY */
4316 
4317 #if KMP_USE_BGET
4318   KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4319   __kmp_initialize_bget(new_thr);
4320 #endif
4321 
4322   __kmp_init_random(new_thr); // Initialize random number generator
4323 
4324   /* Initialize these only once when thread is grabbed for a team allocation */
4325   KA_TRACE(20,
4326            ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4327             __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4328 
4329   int b;
4330   kmp_balign_t *balign = new_thr->th.th_bar;
4331   for (b = 0; b < bs_last_barrier; ++b) {
4332     balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4333     balign[b].bb.team = NULL;
4334     balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4335     balign[b].bb.use_oncore_barrier = 0;
4336   }
4337 
4338   new_thr->th.th_spin_here = FALSE;
4339   new_thr->th.th_next_waiting = 0;
4340 
4341 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4342   new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4343   new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4344   new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4345   new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4346 #endif
4347 
4348   TCW_4(new_thr->th.th_in_pool, FALSE);
4349   new_thr->th.th_active_in_pool = FALSE;
4350   TCW_4(new_thr->th.th_active, TRUE);
4351 
4352   /* adjust the global counters */
4353   __kmp_all_nth++;
4354   __kmp_nth++;
4355 
4356   root->r.r_cg_nthreads++;
4357 
  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for
  // low thread counts, and method #2 (keyed API call) for higher thread counts
  if (__kmp_adjust_gtid_mode) {
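  // The switch is driven by the total thread count: once __kmp_all_nth reaches
  // __kmp_tls_gtid_min, the stack-address search (whose cost grows with the
  // number of threads) is abandoned in favor of the constant-cost TLS lookup.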
4360   if (__kmp_adjust_gtid_mode) {
4361     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4362       if (TCR_4(__kmp_gtid_mode) != 2) {
4363         TCW_4(__kmp_gtid_mode, 2);
4364       }
4365     } else {
4366       if (TCR_4(__kmp_gtid_mode) != 1) {
4367         TCW_4(__kmp_gtid_mode, 1);
4368       }
4369     }
4370   }
4371 
4372 #ifdef KMP_ADJUST_BLOCKTIME
4373   /* Adjust blocktime back to zero if necessary       */
4374   /* Middle initialization might not have occurred yet */
4375   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4376     if (__kmp_nth > __kmp_avail_proc) {
4377       __kmp_zero_bt = TRUE;
4378     }
4379   }
4380 #endif /* KMP_ADJUST_BLOCKTIME */
4381 
4382   /* actually fork it and create the new worker thread */
4383   KF_TRACE(
4384       10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4385   __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4386   KF_TRACE(10,
4387            ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4388 
4389   KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4390                 new_gtid));
4391   KMP_MB();
4392   return new_thr;
4393 }
4394 
/* Reinitialize team for reuse.
   The hot team code calls this routine at every fork barrier, so the EPCC
   barrier tests are extremely sensitive to changes in it, especially writes to
   the team struct, which cause a cache invalidation in all threads.
   IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4400 static void __kmp_reinitialize_team(kmp_team_t *team,
4401                                     kmp_internal_control_t *new_icvs,
4402                                     ident_t *loc) {
4403   KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4404                 team->t.t_threads[0], team));
4405   KMP_DEBUG_ASSERT(team && new_icvs);
4406   KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4407   KMP_CHECK_UPDATE(team->t.t_ident, loc);
4408 
4409   KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4410   // Copy ICVs to the master thread's implicit taskdata
4411   __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4412   copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4413 
4414   KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4415                 team->t.t_threads[0], team));
4416 }
4417 
4418 /* Initialize the team data structure.
4419    This assumes the t_threads and t_max_nproc are already set.
4420    Also, we don't touch the arguments */
4421 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4422                                   kmp_internal_control_t *new_icvs,
4423                                   ident_t *loc) {
4424   KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4425 
4426   /* verify */
4427   KMP_DEBUG_ASSERT(team);
4428   KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4429   KMP_DEBUG_ASSERT(team->t.t_threads);
4430   KMP_MB();
4431 
4432   team->t.t_master_tid = 0; /* not needed */
4433   /* team->t.t_master_bar;        not needed */
4434   team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4435   team->t.t_nproc = new_nproc;
4436 
4437   /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
4438   team->t.t_next_pool = NULL;
4439   /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4440    * up hot team */
4441 
4442   TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4443   team->t.t_invoke = NULL; /* not needed */
4444 
4445   // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
4446   team->t.t_sched.sched = new_icvs->sched.sched;
4447 
4448 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4449   team->t.t_fp_control_saved = FALSE; /* not needed */
4450   team->t.t_x87_fpu_control_word = 0; /* not needed */
4451   team->t.t_mxcsr = 0; /* not needed */
4452 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4453 
4454   team->t.t_construct = 0;
4455 
4456   team->t.t_ordered.dt.t_value = 0;
4457   team->t.t_master_active = FALSE;
4458 
4459   memset(&team->t.t_taskq, '\0', sizeof(kmp_taskq_t));
4460 
4461 #ifdef KMP_DEBUG
4462   team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4463 #endif
4464   team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4465 
4466   team->t.t_control_stack_top = NULL;
4467 
4468   __kmp_reinitialize_team(team, new_icvs, loc);
4469 
4470   KMP_MB();
4471   KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4472 }
4473 
4474 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4475 /* Sets full mask for thread and returns old mask, no changes to structures. */
4476 static void
4477 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4478   if (KMP_AFFINITY_CAPABLE()) {
4479     int status;
4480     if (old_mask != NULL) {
4481       status = __kmp_get_system_affinity(old_mask, TRUE);
4482       int error = errno;
4483       if (status != 0) {
4484         __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4485                     __kmp_msg_null);
4486       }
4487     }
4488     __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4489   }
4490 }
4491 #endif
4492 
4493 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4494 
// __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
// It calculates the worker + master thread's partition based upon the parent
// thread's partition, and binds each worker to a place in its partition.
// The master thread's partition should already include its current binding.
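// In short: proc_bind_master leaves every worker on the master's place;
// proc_bind_close packs workers into the places adjacent to the master's;
// proc_bind_spread carves the master's partition into roughly equal
// sub-partitions, one per thread, and binds each thread to the first place of
// its sub-partition.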
4499 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
  // Copy the master thread's place partition to the team struct
4501   kmp_info_t *master_th = team->t.t_threads[0];
4502   KMP_DEBUG_ASSERT(master_th != NULL);
4503   kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4504   int first_place = master_th->th.th_first_place;
4505   int last_place = master_th->th.th_last_place;
4506   int masters_place = master_th->th.th_current_place;
4507   team->t.t_first_place = first_place;
4508   team->t.t_last_place = last_place;
4509 
4510   KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4511                 "bound to place %d partition = [%d,%d]\n",
4512                 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4513                 team->t.t_id, masters_place, first_place, last_place));
4514 
4515   switch (proc_bind) {
4516 
4517   case proc_bind_default:
    // Serial teams might have the proc_bind policy set to proc_bind_default.
    // It doesn't matter, since we don't rebind the master thread for any
    // proc_bind policy.
4520     KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4521     break;
4522 
4523   case proc_bind_master: {
4524     int f;
4525     int n_th = team->t.t_nproc;
4526     for (f = 1; f < n_th; f++) {
4527       kmp_info_t *th = team->t.t_threads[f];
4528       KMP_DEBUG_ASSERT(th != NULL);
4529       th->th.th_first_place = first_place;
4530       th->th.th_last_place = last_place;
4531       th->th.th_new_place = masters_place;
4532 
4533       KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
4534                      "partition = [%d,%d]\n",
4535                      __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4536                      f, masters_place, first_place, last_place));
4537     }
4538   } break;
4539 
4540   case proc_bind_close: {
4541     int f;
4542     int n_th = team->t.t_nproc;
4543     int n_places;
4544     if (first_place <= last_place) {
4545       n_places = last_place - first_place + 1;
4546     } else {
4547       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4548     }
4549     if (n_th <= n_places) {
4550       int place = masters_place;
4551       for (f = 1; f < n_th; f++) {
4552         kmp_info_t *th = team->t.t_threads[f];
4553         KMP_DEBUG_ASSERT(th != NULL);
4554 
4555         if (place == last_place) {
4556           place = first_place;
4557         } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4558           place = 0;
4559         } else {
4560           place++;
4561         }
4562         th->th.th_first_place = first_place;
4563         th->th.th_last_place = last_place;
4564         th->th.th_new_place = place;
4565 
4566         KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4567                        "partition = [%d,%d]\n",
4568                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4569                        team->t.t_id, f, place, first_place, last_place));
4570       }
4571     } else {
4572       int S, rem, gap, s_count;
4573       S = n_th / n_places;
4574       s_count = 0;
4575       rem = n_th - (S * n_places);
4576       gap = rem > 0 ? n_places / rem : n_places;
4577       int place = masters_place;
4578       int gap_ct = gap;
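      // Distribution sketch: with S = n_th / n_places threads per place and
      // rem leftovers, an extra thread is parked on every gap-th place. For
      // example, 10 threads over 4 places gives S = 2, rem = 2, gap = 2, so
      // consecutive places (starting at the master's) receive 3, 2, 3, 2
      // threads.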
4579       for (f = 0; f < n_th; f++) {
4580         kmp_info_t *th = team->t.t_threads[f];
4581         KMP_DEBUG_ASSERT(th != NULL);
4582 
4583         th->th.th_first_place = first_place;
4584         th->th.th_last_place = last_place;
4585         th->th.th_new_place = place;
4586         s_count++;
4587 
4588         if ((s_count == S) && rem && (gap_ct == gap)) {
4589           // do nothing, add an extra thread to place on next iteration
4590         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4591           // we added an extra thread to this place; move to next place
4592           if (place == last_place) {
4593             place = first_place;
4594           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4595             place = 0;
4596           } else {
4597             place++;
4598           }
4599           s_count = 0;
4600           gap_ct = 1;
4601           rem--;
4602         } else if (s_count == S) { // place full; don't add extra
4603           if (place == last_place) {
4604             place = first_place;
4605           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4606             place = 0;
4607           } else {
4608             place++;
4609           }
4610           gap_ct++;
4611           s_count = 0;
4612         }
4613 
4614         KA_TRACE(100,
4615                  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4616                   "partition = [%d,%d]\n",
4617                   __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4618                   th->th.th_new_place, first_place, last_place));
4619       }
4620       KMP_DEBUG_ASSERT(place == masters_place);
4621     }
4622   } break;
4623 
4624   case proc_bind_spread: {
4625     int f;
4626     int n_th = team->t.t_nproc;
4627     int n_places;
4628     int thidx;
4629     if (first_place <= last_place) {
4630       n_places = last_place - first_place + 1;
4631     } else {
4632       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4633     }
4634     if (n_th <= n_places) {
4635       int place = -1;
4636 
4637       if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4638         int S = n_places / n_th;
4639         int s_count, rem, gap, gap_ct;
4640 
4641         place = masters_place;
4642         rem = n_places - n_th * S;
4643         gap = rem ? n_th / rem : 1;
4644         gap_ct = gap;
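        // Here each thread gets a sub-partition of S (or S+1) places and is
        // bound to the first place of that sub-partition. For example, a
        // partition of 8 places split among 3 threads gives S = 2, rem = 2,
        // gap = 1, so the threads receive sub-partitions of 3, 3 and 2 places.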
4645         thidx = n_th;
4646         if (update_master_only == 1)
4647           thidx = 1;
4648         for (f = 0; f < thidx; f++) {
4649           kmp_info_t *th = team->t.t_threads[f];
4650           KMP_DEBUG_ASSERT(th != NULL);
4651 
4652           th->th.th_first_place = place;
4653           th->th.th_new_place = place;
4654           s_count = 1;
4655           while (s_count < S) {
4656             if (place == last_place) {
4657               place = first_place;
4658             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4659               place = 0;
4660             } else {
4661               place++;
4662             }
4663             s_count++;
4664           }
4665           if (rem && (gap_ct == gap)) {
4666             if (place == last_place) {
4667               place = first_place;
4668             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4669               place = 0;
4670             } else {
4671               place++;
4672             }
4673             rem--;
4674             gap_ct = 0;
4675           }
4676           th->th.th_last_place = place;
4677           gap_ct++;
4678 
4679           if (place == last_place) {
4680             place = first_place;
4681           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4682             place = 0;
4683           } else {
4684             place++;
4685           }
4686 
4687           KA_TRACE(100,
4688                    ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4689                     "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4690                     __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4691                     f, th->th.th_new_place, th->th.th_first_place,
4692                     th->th.th_last_place, __kmp_affinity_num_masks));
4693         }
4694       } else {
        /* Given a uniform space of available computation places, we can create
           T partitions of roughly P/T places each and put each thread into the
           first place of its partition. */
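        /* For example, with 8 places, 3 threads and the master on place 0,
           spacing = 9/3 = 3.0, giving partitions [0,2], [3,5] and [6,7] with
           the threads bound to places 0, 3 and 6 respectively. */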
4698         double current = static_cast<double>(masters_place);
4699         double spacing =
4700             (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
4701         int first, last;
4702         kmp_info_t *th;
4703 
4704         thidx = n_th + 1;
4705         if (update_master_only == 1)
4706           thidx = 1;
4707         for (f = 0; f < thidx; f++) {
4708           first = static_cast<int>(current);
4709           last = static_cast<int>(current + spacing) - 1;
4710           KMP_DEBUG_ASSERT(last >= first);
4711           if (first >= n_places) {
4712             if (masters_place) {
4713               first -= n_places;
4714               last -= n_places;
4715               if (first == (masters_place + 1)) {
4716                 KMP_DEBUG_ASSERT(f == n_th);
4717                 first--;
4718               }
4719               if (last == masters_place) {
4720                 KMP_DEBUG_ASSERT(f == (n_th - 1));
4721                 last--;
4722               }
4723             } else {
4724               KMP_DEBUG_ASSERT(f == n_th);
4725               first = 0;
4726               last = 0;
4727             }
4728           }
4729           if (last >= n_places) {
4730             last = (n_places - 1);
4731           }
4732           place = first;
4733           current += spacing;
4734           if (f < n_th) {
4735             KMP_DEBUG_ASSERT(0 <= first);
4736             KMP_DEBUG_ASSERT(n_places > first);
4737             KMP_DEBUG_ASSERT(0 <= last);
4738             KMP_DEBUG_ASSERT(n_places > last);
4739             KMP_DEBUG_ASSERT(last_place >= first_place);
4740             th = team->t.t_threads[f];
4741             KMP_DEBUG_ASSERT(th);
4742             th->th.th_first_place = first;
4743             th->th.th_new_place = place;
4744             th->th.th_last_place = last;
4745 
4746             KA_TRACE(100,
4747                      ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4748                       "partition = [%d,%d], spacing = %.4f\n",
4749                       __kmp_gtid_from_thread(team->t.t_threads[f]),
4750                       team->t.t_id, f, th->th.th_new_place,
4751                       th->th.th_first_place, th->th.th_last_place, spacing));
4752           }
4753         }
4754       }
4755       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4756     } else {
4757       int S, rem, gap, s_count;
4758       S = n_th / n_places;
4759       s_count = 0;
4760       rem = n_th - (S * n_places);
4761       gap = rem > 0 ? n_places / rem : n_places;
4762       int place = masters_place;
4763       int gap_ct = gap;
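      // Fewer places than threads: reuse the same S/rem/gap packing as
      // proc_bind_close above, except that each thread's partition collapses
      // to the single place it is bound to.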
4764       thidx = n_th;
4765       if (update_master_only == 1)
4766         thidx = 1;
4767       for (f = 0; f < thidx; f++) {
4768         kmp_info_t *th = team->t.t_threads[f];
4769         KMP_DEBUG_ASSERT(th != NULL);
4770 
4771         th->th.th_first_place = place;
4772         th->th.th_last_place = place;
4773         th->th.th_new_place = place;
4774         s_count++;
4775 
4776         if ((s_count == S) && rem && (gap_ct == gap)) {
4777           // do nothing, add an extra thread to place on next iteration
4778         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4779           // we added an extra thread to this place; move on to next place
4780           if (place == last_place) {
4781             place = first_place;
4782           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4783             place = 0;
4784           } else {
4785             place++;
4786           }
4787           s_count = 0;
4788           gap_ct = 1;
4789           rem--;
4790         } else if (s_count == S) { // place is full; don't add extra thread
4791           if (place == last_place) {
4792             place = first_place;
4793           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4794             place = 0;
4795           } else {
4796             place++;
4797           }
4798           gap_ct++;
4799           s_count = 0;
4800         }
4801 
4802         KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4803                        "partition = [%d,%d]\n",
4804                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4805                        team->t.t_id, f, th->th.th_new_place,
4806                        th->th.th_first_place, th->th.th_last_place));
4807       }
4808       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4809     }
4810   } break;
4811 
4812   default:
4813     break;
4814   }
4815 
4816   KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4817 }
4818 
4819 #endif /* OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED */
4820 
4821 /* allocate a new team data structure to use.  take one off of the free pool if
4822    available */
4823 kmp_team_t *
4824 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4825 #if OMPT_SUPPORT
4826                     ompt_data_t ompt_parallel_data,
4827 #endif
4828 #if OMP_40_ENABLED
4829                     kmp_proc_bind_t new_proc_bind,
4830 #endif
4831                     kmp_internal_control_t *new_icvs,
4832                     int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4833   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4834   int f;
4835   kmp_team_t *team;
4836   int use_hot_team = !root->r.r_active;
4837   int level = 0;
4838 
4839   KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4840   KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4841   KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4842   KMP_MB();
4843 
4844 #if KMP_NESTED_HOT_TEAMS
4845   kmp_hot_team_ptr_t *hot_teams;
4846   if (master) {
4847     team = master->th.th_team;
4848     level = team->t.t_active_level;
4849     if (master->th.th_teams_microtask) { // in teams construct?
4850       if (master->th.th_teams_size.nteams > 1 &&
4851           ( // #teams > 1
4852               team->t.t_pkfn ==
4853                   (microtask_t)__kmp_teams_master || // inner fork of the teams
4854               master->th.th_teams_level <
4855                   team->t.t_level)) { // or nested parallel inside the teams
4856         ++level; // not increment if #teams==1, or for outer fork of the teams;
4857         // increment otherwise
4858       }
4859     }
4860     hot_teams = master->th.th_hot_teams;
4861     if (level < __kmp_hot_teams_max_level && hot_teams &&
4862         hot_teams[level]
4863             .hot_team) { // hot team has already been allocated for given level
4864       use_hot_team = 1;
4865     } else {
4866       use_hot_team = 0;
4867     }
4868   }
4869 #endif
4870   // Optimization to use a "hot" team
4871   if (use_hot_team && new_nproc > 1) {
4872     KMP_DEBUG_ASSERT(new_nproc == max_nproc);
4873 #if KMP_NESTED_HOT_TEAMS
4874     team = hot_teams[level].hot_team;
4875 #else
4876     team = root->r.r_hot_team;
4877 #endif
4878 #if KMP_DEBUG
4879     if (__kmp_tasking_mode != tskm_immediate_exec) {
4880       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
4881                     "task_team[1] = %p before reinit\n",
4882                     team->t.t_task_team[0], team->t.t_task_team[1]));
4883     }
4884 #endif
4885 
4886     // Has the number of threads changed?
4887     /* Let's assume the most common case is that the number of threads is
4888        unchanged, and put that case first. */
4889     if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
4890       KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
4891       // This case can mean that omp_set_num_threads() was called and the hot
4892       // team size was already reduced, so we check the special flag
4893       if (team->t.t_size_changed == -1) {
4894         team->t.t_size_changed = 1;
4895       } else {
4896         KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
4897       }
4898 
4899       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4900       kmp_r_sched_t new_sched = new_icvs->sched;
4901       // set master's schedule as new run-time schedule
4902       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
4903 
4904       __kmp_reinitialize_team(team, new_icvs,
4905                               root->r.r_uber_thread->th.th_ident);
4906 
4907       KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
4908                     team->t.t_threads[0], team));
4909       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
4910 
4911 #if OMP_40_ENABLED
4912 #if KMP_AFFINITY_SUPPORTED
4913       if ((team->t.t_size_changed == 0) &&
4914           (team->t.t_proc_bind == new_proc_bind)) {
4915         if (new_proc_bind == proc_bind_spread) {
4916           __kmp_partition_places(
4917               team, 1); // add flag to update only master for spread
4918         }
4919         KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
4920                        "proc_bind = %d, partition = [%d,%d]\n",
4921                        team->t.t_id, new_proc_bind, team->t.t_first_place,
4922                        team->t.t_last_place));
4923       } else {
4924         KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4925         __kmp_partition_places(team);
4926       }
4927 #else
4928       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4929 #endif /* KMP_AFFINITY_SUPPORTED */
4930 #endif /* OMP_40_ENABLED */
4931     } else if (team->t.t_nproc > new_nproc) {
4932       KA_TRACE(20,
4933                ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
4934                 new_nproc));
4935 
4936       team->t.t_size_changed = 1;
4937 #if KMP_NESTED_HOT_TEAMS
4938       if (__kmp_hot_teams_mode == 0) {
        // AC: in this mode the saved number of threads should match the team's
        // value; it can be bigger in mode 1, when the hot team keeps threads
        // in reserve
4941         KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
4942         hot_teams[level].hot_team_nth = new_nproc;
4943 #endif // KMP_NESTED_HOT_TEAMS
4944         /* release the extra threads we don't need any more */
4945         for (f = new_nproc; f < team->t.t_nproc; f++) {
4946           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
4947           if (__kmp_tasking_mode != tskm_immediate_exec) {
4948             // When decreasing team size, threads no longer in the team should
4949             // unref task team.
4950             team->t.t_threads[f]->th.th_task_team = NULL;
4951           }
4952           __kmp_free_thread(team->t.t_threads[f]);
4953           team->t.t_threads[f] = NULL;
4954         }
4955 #if KMP_NESTED_HOT_TEAMS
4956       } // (__kmp_hot_teams_mode == 0)
4957       else {
4958         // When keeping extra threads in team, switch threads to wait on own
4959         // b_go flag
4960         for (f = new_nproc; f < team->t.t_nproc; ++f) {
4961           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
4962           kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
4963           for (int b = 0; b < bs_last_barrier; ++b) {
4964             if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
4965               balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
4966             }
4967             KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
4968           }
4969         }
4970       }
4971 #endif // KMP_NESTED_HOT_TEAMS
4972       team->t.t_nproc = new_nproc;
4973       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4974       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
4975       __kmp_reinitialize_team(team, new_icvs,
4976                               root->r.r_uber_thread->th.th_ident);
4977 
4978       /* update the remaining threads */
4979       for (f = 0; f < new_nproc; ++f) {
4980         team->t.t_threads[f]->th.th_team_nproc = new_nproc;
4981       }
4982       // restore the current task state of the master thread: should be the
4983       // implicit task
4984       KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
4985                     team->t.t_threads[0], team));
4986 
4987       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
4988 
4989 #ifdef KMP_DEBUG
4990       for (f = 0; f < team->t.t_nproc; f++) {
4991         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
4992                          team->t.t_threads[f]->th.th_team_nproc ==
4993                              team->t.t_nproc);
4994       }
4995 #endif
4996 
4997 #if OMP_40_ENABLED
4998       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4999 #if KMP_AFFINITY_SUPPORTED
5000       __kmp_partition_places(team);
5001 #endif
5002 #endif
5003     } else { // team->t.t_nproc < new_nproc
5004 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5005       kmp_affin_mask_t *old_mask;
5006       if (KMP_AFFINITY_CAPABLE()) {
5007         KMP_CPU_ALLOC(old_mask);
5008       }
5009 #endif
5010 
5011       KA_TRACE(20,
5012                ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5013                 new_nproc));
5014 
5015       team->t.t_size_changed = 1;
5016 
5017 #if KMP_NESTED_HOT_TEAMS
5018       int avail_threads = hot_teams[level].hot_team_nth;
5019       if (new_nproc < avail_threads)
5020         avail_threads = new_nproc;
5021       kmp_info_t **other_threads = team->t.t_threads;
5022       for (f = team->t.t_nproc; f < avail_threads; ++f) {
5023         // Adjust barrier data of reserved threads (if any) of the team
5024         // Other data will be set in __kmp_initialize_info() below.
5025         int b;
5026         kmp_balign_t *balign = other_threads[f]->th.th_bar;
5027         for (b = 0; b < bs_last_barrier; ++b) {
5028           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5029           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5030 #if USE_DEBUGGER
5031           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5032 #endif
5033         }
5034       }
5035       if (hot_teams[level].hot_team_nth >= new_nproc) {
5036         // we have all needed threads in reserve, no need to allocate any
        // this is only possible in mode 1; there cannot be reserved threads in
        // mode 0
5038         KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5039         team->t.t_nproc = new_nproc; // just get reserved threads involved
5040       } else {
5041         // we may have some threads in reserve, but not enough
5042         team->t.t_nproc =
5043             hot_teams[level]
5044                 .hot_team_nth; // get reserved threads involved if any
5045         hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5046 #endif // KMP_NESTED_HOT_TEAMS
5047         if (team->t.t_max_nproc < new_nproc) {
5048           /* reallocate larger arrays */
5049           __kmp_reallocate_team_arrays(team, new_nproc);
5050           __kmp_reinitialize_team(team, new_icvs, NULL);
5051         }
5052 
5053 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
        /* Temporarily set the full mask for the master thread before creating
           the workers. The reason is that workers inherit the affinity from
           the master, so if many workers are created on a single core in quick
           succession, they don't get a chance to set their own affinity for a
           long time. */
5058         __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5059 #endif
5060 
5061         /* allocate new threads for the hot team */
5062         for (f = team->t.t_nproc; f < new_nproc; f++) {
5063           kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5064           KMP_DEBUG_ASSERT(new_worker);
5065           team->t.t_threads[f] = new_worker;
5066 
5067           KA_TRACE(20,
5068                    ("__kmp_allocate_team: team %d init T#%d arrived: "
5069                     "join=%llu, plain=%llu\n",
5070                     team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5071                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5072                     team->t.t_bar[bs_plain_barrier].b_arrived));
5073 
5074           { // Initialize barrier data for new threads.
5075             int b;
5076             kmp_balign_t *balign = new_worker->th.th_bar;
5077             for (b = 0; b < bs_last_barrier; ++b) {
5078               balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5079               KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5080                                KMP_BARRIER_PARENT_FLAG);
5081 #if USE_DEBUGGER
5082               balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5083 #endif
5084             }
5085           }
5086         }
5087 
5088 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5089         if (KMP_AFFINITY_CAPABLE()) {
5090           /* Restore initial master thread's affinity mask */
5091           __kmp_set_system_affinity(old_mask, TRUE);
5092           KMP_CPU_FREE(old_mask);
5093         }
5094 #endif
5095 #if KMP_NESTED_HOT_TEAMS
5096       } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5097 #endif // KMP_NESTED_HOT_TEAMS
      /* make sure everyone is synchronized */
5099       int old_nproc = team->t.t_nproc; // save old value and use to update only
5100       // new threads below
5101       __kmp_initialize_team(team, new_nproc, new_icvs,
5102                             root->r.r_uber_thread->th.th_ident);
5103 
5104       /* reinitialize the threads */
5105       KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5106       for (f = 0; f < team->t.t_nproc; ++f)
5107         __kmp_initialize_info(team->t.t_threads[f], team, f,
5108                               __kmp_gtid_from_tid(f, team));
5109       if (level) { // set th_task_state for new threads in nested hot team
5110         // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5111         // only need to set the th_task_state for the new threads. th_task_state
5112         // for master thread will not be accurate until after this in
5113         // __kmp_fork_call(), so we look to the master's memo_stack to get the
5114         // correct value.
5115         for (f = old_nproc; f < team->t.t_nproc; ++f)
5116           team->t.t_threads[f]->th.th_task_state =
5117               team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5118       } else { // set th_task_state for new threads in non-nested hot team
5119         int old_state =
5120             team->t.t_threads[0]->th.th_task_state; // copy master's state
5121         for (f = old_nproc; f < team->t.t_nproc; ++f)
5122           team->t.t_threads[f]->th.th_task_state = old_state;
5123       }
5124 
5125 #ifdef KMP_DEBUG
5126       for (f = 0; f < team->t.t_nproc; ++f) {
5127         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5128                          team->t.t_threads[f]->th.th_team_nproc ==
5129                              team->t.t_nproc);
5130       }
5131 #endif
5132 
5133 #if OMP_40_ENABLED
5134       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5135 #if KMP_AFFINITY_SUPPORTED
5136       __kmp_partition_places(team);
5137 #endif
5138 #endif
5139     } // Check changes in number of threads
5140 
5141 #if OMP_40_ENABLED
5142     kmp_info_t *master = team->t.t_threads[0];
5143     if (master->th.th_teams_microtask) {
5144       for (f = 1; f < new_nproc; ++f) {
5145         // propagate teams construct specific info to workers
5146         kmp_info_t *thr = team->t.t_threads[f];
5147         thr->th.th_teams_microtask = master->th.th_teams_microtask;
5148         thr->th.th_teams_level = master->th.th_teams_level;
5149         thr->th.th_teams_size = master->th.th_teams_size;
5150       }
5151     }
5152 #endif /* OMP_40_ENABLED */
5153 #if KMP_NESTED_HOT_TEAMS
5154     if (level) {
5155       // Sync barrier state for nested hot teams, not needed for outermost hot
5156       // team.
5157       for (f = 1; f < new_nproc; ++f) {
5158         kmp_info_t *thr = team->t.t_threads[f];
5159         int b;
5160         kmp_balign_t *balign = thr->th.th_bar;
5161         for (b = 0; b < bs_last_barrier; ++b) {
5162           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5163           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5164 #if USE_DEBUGGER
5165           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5166 #endif
5167         }
5168       }
5169     }
5170 #endif // KMP_NESTED_HOT_TEAMS
5171 
5172     /* reallocate space for arguments if necessary */
5173     __kmp_alloc_argv_entries(argc, team, TRUE);
5174     KMP_CHECK_UPDATE(team->t.t_argc, argc);
5175     // The hot team re-uses the previous task team,
5176     // if untouched during the previous release->gather phase.
5177 
5178     KF_TRACE(10, (" hot_team = %p\n", team));
5179 
5180 #if KMP_DEBUG
5181     if (__kmp_tasking_mode != tskm_immediate_exec) {
5182       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5183                     "task_team[1] = %p after reinit\n",
5184                     team->t.t_task_team[0], team->t.t_task_team[1]));
5185     }
5186 #endif
5187 
5188 #if OMPT_SUPPORT
5189     __ompt_team_assign_id(team, ompt_parallel_data);
5190 #endif
5191 
5192     KMP_MB();
5193 
5194     return team;
5195   }
5196 
5197   /* next, let's try to take one from the team pool */
5198   KMP_MB();
5199   for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5200     /* TODO: consider resizing undersized teams instead of reaping them, now
5201        that we have a resizing mechanism */
5202     if (team->t.t_max_nproc >= max_nproc) {
5203       /* take this team from the team pool */
5204       __kmp_team_pool = team->t.t_next_pool;
5205 
5206       /* setup the team for fresh use */
5207       __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5208 
5209       KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5210                     "task_team[1] %p to NULL\n",
5211                     &team->t.t_task_team[0], &team->t.t_task_team[1]));
5212       team->t.t_task_team[0] = NULL;
5213       team->t.t_task_team[1] = NULL;
5214 
5215       /* reallocate space for arguments if necessary */
5216       __kmp_alloc_argv_entries(argc, team, TRUE);
5217       KMP_CHECK_UPDATE(team->t.t_argc, argc);
5218 
5219       KA_TRACE(
5220           20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5221                team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5222       { // Initialize barrier data.
5223         int b;
5224         for (b = 0; b < bs_last_barrier; ++b) {
5225           team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5226 #if USE_DEBUGGER
5227           team->t.t_bar[b].b_master_arrived = 0;
5228           team->t.t_bar[b].b_team_arrived = 0;
5229 #endif
5230         }
5231       }
5232 
5233 #if OMP_40_ENABLED
5234       team->t.t_proc_bind = new_proc_bind;
5235 #endif
5236 
5237       KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5238                     team->t.t_id));
5239 
5240 #if OMPT_SUPPORT
5241       __ompt_team_assign_id(team, ompt_parallel_data);
5242 #endif
5243 
5244       KMP_MB();
5245 
5246       return team;
5247     }
5248 
5249     /* reap team if it is too small, then loop back and check the next one */
    // Not sure if this is wise, but it will be redone during the hot-teams
    // rewrite.
5252     /* TODO: Use technique to find the right size hot-team, don't reap them */
5253     team = __kmp_reap_team(team);
5254     __kmp_team_pool = team;
5255   }
5256 
5257   /* nothing available in the pool, no matter, make a new team! */
5258   KMP_MB();
5259   team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5260 
5261   /* and set it up */
5262   team->t.t_max_nproc = max_nproc;
  /* NOTE well: for some reason, allocating one big buffer and dividing it up
     seems to really hurt performance on the P4, so let's not use this */
5265   __kmp_allocate_team_arrays(team, max_nproc);
5266 
5267   KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5268   __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5269 
5270   KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5271                 "%p to NULL\n",
5272                 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5273   team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5274   // memory, no need to duplicate
5275   team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5276   // memory, no need to duplicate
5277 
5278   if (__kmp_storage_map) {
5279     __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5280   }
5281 
5282   /* allocate space for arguments */
5283   __kmp_alloc_argv_entries(argc, team, FALSE);
5284   team->t.t_argc = argc;
5285 
5286   KA_TRACE(20,
5287            ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5288             team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5289   { // Initialize barrier data.
5290     int b;
5291     for (b = 0; b < bs_last_barrier; ++b) {
5292       team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5293 #if USE_DEBUGGER
5294       team->t.t_bar[b].b_master_arrived = 0;
5295       team->t.t_bar[b].b_team_arrived = 0;
5296 #endif
5297     }
5298   }
5299 
5300 #if OMP_40_ENABLED
5301   team->t.t_proc_bind = new_proc_bind;
5302 #endif
5303 
5304 #if OMPT_SUPPORT
5305   __ompt_team_assign_id(team, ompt_parallel_data);
5306   team->t.ompt_serialized_team_info = NULL;
5307 #endif
5308 
5309   KMP_MB();
5310 
5311   KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5312                 team->t.t_id));
5313 
5314   return team;
5315 }
5316 
5317 /* TODO implement hot-teams at all levels */
5318 /* TODO implement lazy thread release on demand (disband request) */
5319 
5320 /* free the team.  return it to the team pool.  release all the threads
5321  * associated with it */
5322 void __kmp_free_team(kmp_root_t *root,
5323                      kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5324   int f;
5325   KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5326                 team->t.t_id));
5327 
5328   /* verify state */
5329   KMP_DEBUG_ASSERT(root);
5330   KMP_DEBUG_ASSERT(team);
5331   KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5332   KMP_DEBUG_ASSERT(team->t.t_threads);
5333 
5334   int use_hot_team = team == root->r.r_hot_team;
5335 #if KMP_NESTED_HOT_TEAMS
5336   int level;
5337   kmp_hot_team_ptr_t *hot_teams;
5338   if (master) {
5339     level = team->t.t_active_level - 1;
5340     if (master->th.th_teams_microtask) { // in teams construct?
5341       if (master->th.th_teams_size.nteams > 1) {
5342         ++level; // level was not increased in teams construct for
5343         // team_of_masters
5344       }
5345       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5346           master->th.th_teams_level == team->t.t_level) {
5347         ++level; // level was not increased in teams construct for
5348         // team_of_workers before the parallel
5349       } // team->t.t_level will be increased inside parallel
5350     }
5351     hot_teams = master->th.th_hot_teams;
5352     if (level < __kmp_hot_teams_max_level) {
5353       KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5354       use_hot_team = 1;
5355     }
5356   }
5357 #endif // KMP_NESTED_HOT_TEAMS
5358 
5359   /* team is done working */
5360   TCW_SYNC_PTR(team->t.t_pkfn,
5361                NULL); // Important for Debugging Support Library.
5362   team->t.t_copyin_counter = 0; // init counter for possible reuse
5363   // Do not reset pointer to parent team to NULL for hot teams.
5364 
5365   /* if we are non-hot team, release our threads */
5366   if (!use_hot_team) {
5367     if (__kmp_tasking_mode != tskm_immediate_exec) {
5368       // Wait for threads to reach reapable state
5369       for (f = 1; f < team->t.t_nproc; ++f) {
5370         KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5371         kmp_info_t *th = team->t.t_threads[f];
5372         volatile kmp_uint32 *state = &th->th.th_reap_state;
5373         while (*state != KMP_SAFE_TO_REAP) {
5374 #if KMP_OS_WINDOWS
5375           // On Windows a thread can be killed at any time, check this
5376           DWORD ecode;
5377           if (!__kmp_is_thread_alive(th, &ecode)) {
5378             *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5379             break;
5380           }
5381 #endif
5382           // first check if thread is sleeping
5383           kmp_flag_64 fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5384           if (fl.is_sleeping())
5385             fl.resume(__kmp_gtid_from_thread(th));
5386           KMP_CPU_PAUSE();
5387         }
5388       }
5389 
5390       // Delete task teams
5391       int tt_idx;
5392       for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5393         kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5394         if (task_team != NULL) {
5395           for (f = 0; f < team->t.t_nproc;
5396                ++f) { // Have all threads unref task teams
5397             team->t.t_threads[f]->th.th_task_team = NULL;
5398           }
5399           KA_TRACE(
5400               20,
5401               ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5402                __kmp_get_gtid(), task_team, team->t.t_id));
5403 #if KMP_NESTED_HOT_TEAMS
5404           __kmp_free_task_team(master, task_team);
5405 #endif
5406           team->t.t_task_team[tt_idx] = NULL;
5407         }
5408       }
5409     }
5410 
5411     // Reset pointer to parent team only for non-hot teams.
5412     team->t.t_parent = NULL;
5413     team->t.t_level = 0;
5414     team->t.t_active_level = 0;
5415 
5416     /* free the worker threads */
5417     for (f = 1; f < team->t.t_nproc; ++f) {
5418       KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5419       __kmp_free_thread(team->t.t_threads[f]);
5420       team->t.t_threads[f] = NULL;
5421     }
5422 
5423     /* put the team back in the team pool */
5424     /* TODO limit size of team pool, call reap_team if pool too large */
5425     team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5426     __kmp_team_pool = (volatile kmp_team_t *)team;
5427   }
5428 
5429   KMP_MB();
5430 }
5431 
5432 /* reap the team.  destroy it, reclaim all its resources and free its memory */
5433 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5434   kmp_team_t *next_pool = team->t.t_next_pool;
5435 
5436   KMP_DEBUG_ASSERT(team);
5437   KMP_DEBUG_ASSERT(team->t.t_dispatch);
5438   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5439   KMP_DEBUG_ASSERT(team->t.t_threads);
5440   KMP_DEBUG_ASSERT(team->t.t_argv);
5441 
5442   /* TODO clean the threads that are a part of this? */
5443 
5444   /* free stuff */
5445   __kmp_free_team_arrays(team);
5446   if (team->t.t_argv != &team->t.t_inline_argv[0])
5447     __kmp_free((void *)team->t.t_argv);
5448   __kmp_free(team);
5449 
5450   KMP_MB();
5451   return next_pool;
5452 }
5453 
5454 // Free the thread.  Don't reap it, just place it on the pool of available
5455 // threads.
5456 //
5457 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5458 // binding for the affinity mechanism to be useful.
5459 //
5460 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5461 // However, we want to avoid a potential performance problem by always
5462 // scanning through the list to find the correct point at which to insert
5463 // the thread (potential N**2 behavior).  To do this we keep track of the
5464 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5465 // With single-level parallelism, threads will always be added to the tail
5466 // of the list, kept track of by __kmp_thread_pool_insert_pt.  With nested
5467 // parallelism, all bets are off and we may need to scan through the entire
5468 // free list.
5469 //
5470 // This change also has a potentially large performance benefit, for some
5471 // applications.  Previously, as threads were freed from the hot team, they
5472 // would be placed back on the free list in inverse order.  If the hot team
// grew back to its original size, then the freed threads would be placed
// back on the hot team in reverse order.  This could cause bad cache
// locality problems on programs where the size of the hot team regularly
// grew and shrank.
//
// Now, for single-level parallelism, the OMP tid is always == gtid.
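//
// For example, if threads with gtids 5, 3 and 4 are freed in that order, the
// pool ends up as 3 -> 4 -> 5, so a hot team growing back to its previous
// size reacquires them in gtid order.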
5479 void __kmp_free_thread(kmp_info_t *this_th) {
5480   int gtid;
5481   kmp_info_t **scan;
5482   kmp_root_t *root = this_th->th.th_root;
5483 
5484   KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5485                 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5486 
5487   KMP_DEBUG_ASSERT(this_th);
5488 
5489   // When moving thread to pool, switch thread to wait on own b_go flag, and
5490   // uninitialized (NULL team).
5491   int b;
5492   kmp_balign_t *balign = this_th->th.th_bar;
5493   for (b = 0; b < bs_last_barrier; ++b) {
5494     if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5495       balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5496     balign[b].bb.team = NULL;
5497     balign[b].bb.leaf_kids = 0;
5498   }
5499   this_th->th.th_task_state = 0;
5500 
5501   /* put thread back on the free pool */
5502   TCW_PTR(this_th->th.th_team, NULL);
5503   TCW_PTR(this_th->th.th_root, NULL);
5504   TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5505 
  /* If the implicit task assigned to this thread can be used by other threads,
   * multiple threads may share the task data and try to free it in
   * __kmp_reap_thread at exit. This duplicate use of the task data is more
   * likely when the hot team is disabled, but it can occur even when the hot
   * team is enabled */
5511   __kmp_free_implicit_task(this_th);
5512   this_th->th.th_current_task = NULL;
5513 
5514   // If the __kmp_thread_pool_insert_pt is already past the new insert
5515   // point, then we need to re-scan the entire list.
5516   gtid = this_th->th.th_info.ds.ds_gtid;
5517   if (__kmp_thread_pool_insert_pt != NULL) {
5518     KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5519     if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5520       __kmp_thread_pool_insert_pt = NULL;
5521     }
5522   }
5523 
5524   // Scan down the list to find the place to insert the thread.
5525   // scan is the address of a link in the list, possibly the address of
5526   // __kmp_thread_pool itself.
5527   //
  // In the absence of nested parallelism, the for loop will have 0 iterations.
5529   if (__kmp_thread_pool_insert_pt != NULL) {
5530     scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5531   } else {
5532     scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5533   }
5534   for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5535        scan = &((*scan)->th.th_next_pool))
5536     ;
5537 
5538   // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5539   // to its address.
5540   TCW_PTR(this_th->th.th_next_pool, *scan);
5541   __kmp_thread_pool_insert_pt = *scan = this_th;
5542   KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5543                    (this_th->th.th_info.ds.ds_gtid <
5544                     this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5545   TCW_4(this_th->th.th_in_pool, TRUE);
5546   __kmp_thread_pool_nth++;
5547 
5548   TCW_4(__kmp_nth, __kmp_nth - 1);
5549   root->r.r_cg_nthreads--;
5550 
5551 #ifdef KMP_ADJUST_BLOCKTIME
5552   /* Adjust blocktime back to user setting or default if necessary */
5553   /* Middle initialization might never have occurred                */
5554   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5555     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5556     if (__kmp_nth <= __kmp_avail_proc) {
5557       __kmp_zero_bt = FALSE;
5558     }
5559   }
5560 #endif /* KMP_ADJUST_BLOCKTIME */
5561 
5562   KMP_MB();
5563 }
5564 
5565 /* ------------------------------------------------------------------------ */
5566 
5567 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5568   int gtid = this_thr->th.th_info.ds.ds_gtid;
5569   /*    void                 *stack_data;*/
5570   kmp_team_t *(*volatile pteam);
5571 
5572   KMP_MB();
5573   KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5574 
5575   if (__kmp_env_consistency_check) {
5576     this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5577   }
5578 
5579 #if OMPT_SUPPORT
5580   ompt_data_t *thread_data;
5581   if (ompt_enabled.enabled) {
5582     thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5583     thread_data->ptr = NULL;
5584 
5585     this_thr->th.ompt_thread_info.state = omp_state_overhead;
5586     this_thr->th.ompt_thread_info.wait_id = 0;
5587     this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5588     if (ompt_enabled.ompt_callback_thread_begin) {
5589       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5590           ompt_thread_worker, thread_data);
5591     }
5592   }
5593 #endif
5594 
5595 #if OMPT_SUPPORT
5596   if (ompt_enabled.enabled) {
5597     this_thr->th.ompt_thread_info.state = omp_state_idle;
5598   }
5599 #endif
5600   /* This is the place where threads wait for work */
5601   while (!TCR_4(__kmp_global.g.g_done)) {
5602     KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5603     KMP_MB();
5604 
5605     /* wait for work to do */
5606     KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5607 
5608     /* No tid yet since not part of a team */
5609     __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5610 
5611 #if OMPT_SUPPORT
5612     if (ompt_enabled.enabled) {
5613       this_thr->th.ompt_thread_info.state = omp_state_overhead;
5614     }
5615 #endif
5616 
5617     pteam = (kmp_team_t * (*))(&this_thr->th.th_team);
5618 
5619     /* have we been allocated? */
5620     if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5621       /* we were just woken up, so run our new task */
5622       if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5623         int rc;
5624         KA_TRACE(20,
5625                  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5626                   gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5627                   (*pteam)->t.t_pkfn));
5628 
5629         updateHWFPControl(*pteam);
5630 
5631 #if OMPT_SUPPORT
5632         if (ompt_enabled.enabled) {
5633           this_thr->th.ompt_thread_info.state = omp_state_work_parallel;
5634         }
5635 #endif
5636 
5637         {
5638           KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
5639           KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
5640           rc = (*pteam)->t.t_invoke(gtid);
5641         }
5642         KMP_ASSERT(rc);
5643 
5644         KMP_MB();
5645         KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5646                       gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5647                       (*pteam)->t.t_pkfn));
5648       }
5649 #if OMPT_SUPPORT
5650       if (ompt_enabled.enabled) {
5651         /* no frame set while outside task */
5652         __ompt_get_task_info_object(0)->frame.exit_frame = NULL;
5653 
5654         this_thr->th.ompt_thread_info.state = omp_state_overhead;
5655       }
5656 #endif
5657       /* join barrier after parallel region */
5658       __kmp_join_barrier(gtid);
5659     }
5660   }
5661   TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5662 
5663 #if OMPT_SUPPORT
5664   if (ompt_enabled.ompt_callback_thread_end) {
5665     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5666   }
5667 #endif
5668 
5669   this_thr->th.th_task_team = NULL;
5670   /* run the destructors for the threadprivate data for this thread */
5671   __kmp_common_destroy_gtid(gtid);
5672 
5673   KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5674   KMP_MB();
5675   return this_thr;
5676 }
5677 
5678 /* ------------------------------------------------------------------------ */
5679 
5680 void __kmp_internal_end_dest(void *specific_gtid) {
5681 #if KMP_COMPILER_ICC
5682 #pragma warning(push)
5683 #pragma warning(disable : 810) // conversion from "void *" to "int" may lose
5684 // significant bits
5685 #endif
5686   // Make sure no significant bits are lost
5687   int gtid = (kmp_intptr_t)specific_gtid - 1;
5688 #if KMP_COMPILER_ICC
5689 #pragma warning(pop)
5690 #endif
5691 
5692   KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
  /* NOTE: the gtid is stored as gtid+1 in thread-local storage because
   * 0 is reserved for the nothing-stored case */
5695 
5696   /* josh: One reason for setting the gtid specific data even when it is being
5697      destroyed by pthread is to allow gtid lookup through thread specific data
5698      (__kmp_gtid_get_specific).  Some of the code, especially stat code,
5699      that gets executed in the call to __kmp_internal_end_thread, actually
5700      gets the gtid through the thread specific data.  Setting it here seems
5701      rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread
5702      to run smoothly.
5703      todo: get rid of this after we remove the dependence on
5704      __kmp_gtid_get_specific  */
5705   if (gtid >= 0 && KMP_UBER_GTID(gtid))
5706     __kmp_gtid_set_specific(gtid);
5707 #ifdef KMP_TDATA_GTID
5708   __kmp_gtid = gtid;
5709 #endif
5710   __kmp_internal_end_thread(gtid);
5711 }
5712 
5713 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5714 
// 2009-09-08 (lev): It looks like the destructor does not work. In simple test
// cases destructors work perfectly, but in real libomp.so I have no evidence it
// is ever called. However, the -fini linker option in makefile.mk works fine.
5718 
5719 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5720   __kmp_internal_end_atexit();
5721 }
5722 
5723 void __kmp_internal_end_fini(void) { __kmp_internal_end_atexit(); }
5724 
5725 #endif
5726 
5727 /* [Windows] josh: when the atexit handler is called, there may still be more
5728    than one thread alive */
5729 void __kmp_internal_end_atexit(void) {
5730   KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5731   /* [Windows]
5732      josh: ideally, we want to completely shutdown the library in this atexit
5733      handler, but stat code that depends on thread specific data for gtid fails
5734      because that data becomes unavailable at some point during the shutdown, so
5735      we call __kmp_internal_end_thread instead. We should eventually remove the
5736      dependency on __kmp_get_specific_gtid in the stat code and use
5737      __kmp_internal_end_library to cleanly shutdown the library.
5738 
5739      // TODO: Can some of this comment about GVS be removed?
5740      I suspect that the offending stat code is executed when the calling thread
5741      tries to clean up a dead root thread's data structures, resulting in GVS
5742      code trying to close the GVS structures for that thread, but since the stat
5743      code uses __kmp_get_specific_gtid to get the gtid with the assumption that
     the calling thread is cleaning up itself instead of another thread, it gets
     confused. This happens because allowing a thread to unregister and cleanup
5746      another thread is a recent modification for addressing an issue.
5747      Based on the current design (20050722), a thread may end up
5748      trying to unregister another thread only if thread death does not trigger
5749      the calling of __kmp_internal_end_thread.  For Linux* OS, there is the
5750      thread specific data destructor function to detect thread death. For
5751      Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
5752      is nothing.  Thus, the workaround is applicable only for Windows static
5753      stat library. */
5754   __kmp_internal_end_library(-1);
5755 #if KMP_OS_WINDOWS
5756   __kmp_close_console();
5757 #endif
5758 }
5759 
5760 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5761   // It is assumed __kmp_forkjoin_lock is acquired.
5762 
5763   int gtid;
5764 
5765   KMP_DEBUG_ASSERT(thread != NULL);
5766 
5767   gtid = thread->th.th_info.ds.ds_gtid;
5768 
5769   if (!is_root) {
5770 
5771     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5772       /* Assume the threads are at the fork barrier here */
5773       KA_TRACE(
5774           20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5775                gtid));
5776       /* Need release fence here to prevent seg faults for tree forkjoin barrier
5777        * (GEH) */
5778       ANNOTATE_HAPPENS_BEFORE(thread);
5779       kmp_flag_64 flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
5780       __kmp_release_64(&flag);
5781     }
5782 
5783     // Terminate OS thread.
5784     __kmp_reap_worker(thread);
5785 
5786     // The thread was killed asynchronously.  If it was actively
5787     // spinning in the thread pool, decrement the global count.
5788     //
5789     // There is a small timing hole here - if the worker thread was just waking
    // up after sleeping in the pool, had reset its th_active_in_pool flag but
5791     // not decremented the global counter __kmp_thread_pool_active_nth yet, then
5792     // the global counter might not get updated.
5793     //
5794     // Currently, this can only happen as the library is unloaded,
5795     // so there are no harmful side effects.
5796     if (thread->th.th_active_in_pool) {
5797       thread->th.th_active_in_pool = FALSE;
5798       KMP_TEST_THEN_DEC32(&__kmp_thread_pool_active_nth);
5799       KMP_DEBUG_ASSERT(TCR_4(__kmp_thread_pool_active_nth) >= 0);
5800     }
5801 
5802     // Decrement # of [worker] threads in the pool.
5803     KMP_DEBUG_ASSERT(__kmp_thread_pool_nth > 0);
5804     --__kmp_thread_pool_nth;
5805   }
5806 
5807   __kmp_free_implicit_task(thread);
5808 
5809 // Free the fast memory for tasking
5810 #if USE_FAST_MEMORY
5811   __kmp_free_fast_memory(thread);
5812 #endif /* USE_FAST_MEMORY */
5813 
5814   __kmp_suspend_uninitialize_thread(thread);
5815 
5816   KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
5817   TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5818 
5819   --__kmp_all_nth;
// __kmp_nth was decremented when the thread was added to the pool.
5821 
5822 #ifdef KMP_ADJUST_BLOCKTIME
5823   /* Adjust blocktime back to user setting or default if necessary */
5824   /* Middle initialization might never have occurred                */
5825   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5826     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5827     if (__kmp_nth <= __kmp_avail_proc) {
5828       __kmp_zero_bt = FALSE;
5829     }
5830   }
5831 #endif /* KMP_ADJUST_BLOCKTIME */
5832 
5833   /* free the memory being used */
5834   if (__kmp_env_consistency_check) {
5835     if (thread->th.th_cons) {
5836       __kmp_free_cons_stack(thread->th.th_cons);
5837       thread->th.th_cons = NULL;
5838     }
5839   }
5840 
5841   if (thread->th.th_pri_common != NULL) {
5842     __kmp_free(thread->th.th_pri_common);
5843     thread->th.th_pri_common = NULL;
5844   }
5845 
5846   if (thread->th.th_task_state_memo_stack != NULL) {
5847     __kmp_free(thread->th.th_task_state_memo_stack);
5848     thread->th.th_task_state_memo_stack = NULL;
5849   }
5850 
5851 #if KMP_USE_BGET
5852   if (thread->th.th_local.bget_data != NULL) {
5853     __kmp_finalize_bget(thread);
5854   }
5855 #endif
5856 
5857 #if KMP_AFFINITY_SUPPORTED
5858   if (thread->th.th_affin_mask != NULL) {
5859     KMP_CPU_FREE(thread->th.th_affin_mask);
5860     thread->th.th_affin_mask = NULL;
5861   }
5862 #endif /* KMP_AFFINITY_SUPPORTED */
5863 
5864   __kmp_reap_team(thread->th.th_serial_team);
5865   thread->th.th_serial_team = NULL;
5866   __kmp_free(thread);
5867 
5868   KMP_MB();
5869 
5870 } // __kmp_reap_thread
5871 
5872 static void __kmp_internal_end(void) {
5873   int i;
5874 
5875   /* First, unregister the library */
5876   __kmp_unregister_library();
5877 
5878 #if KMP_OS_WINDOWS
5879   /* In Win static library, we can't tell when a root actually dies, so we
5880      reclaim the data structures for any root threads that have died but not
5881      unregistered themselves, in order to shut down cleanly.
5882      In Win dynamic library we also can't tell when a thread dies.  */
5883   __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
5884 // dead roots
5885 #endif
5886 
5887   for (i = 0; i < __kmp_threads_capacity; i++)
5888     if (__kmp_root[i])
5889       if (__kmp_root[i]->r.r_active)
5890         break;
5891   KMP_MB(); /* Flush all pending memory write invalidates.  */
5892   TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
5893 
5894   if (i < __kmp_threads_capacity) {
5895 #if KMP_USE_MONITOR
5896     // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
5897     KMP_MB(); /* Flush all pending memory write invalidates.  */
5898 
5899     // Need to check that monitor was initialized before reaping it. If we are
    // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
5901     // __kmp_monitor will appear to contain valid data, but it is only valid in
5902     // the parent process, not the child.
5903     // New behavior (201008): instead of keying off of the flag
5904     // __kmp_init_parallel, the monitor thread creation is keyed off
5905     // of the new flag __kmp_init_monitor.
5906     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
5907     if (TCR_4(__kmp_init_monitor)) {
5908       __kmp_reap_monitor(&__kmp_monitor);
5909       TCW_4(__kmp_init_monitor, 0);
5910     }
5911     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
5912     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
5913 #endif // KMP_USE_MONITOR
5914   } else {
5915 /* TODO move this to cleanup code */
5916 #ifdef KMP_DEBUG
5917     /* make sure that everything has properly ended */
5918     for (i = 0; i < __kmp_threads_capacity; i++) {
5919       if (__kmp_root[i]) {
5920         //                    KMP_ASSERT( ! KMP_UBER_GTID( i ) );         // AC:
5921         //                    there can be uber threads alive here
5922         KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
5923       }
5924     }
5925 #endif
5926 
5927     KMP_MB();
5928 
5929     // Reap the worker threads.
5930     // This is valid for now, but be careful if threads are reaped sooner.
    while (__kmp_thread_pool != NULL) { // Loop through all threads in the pool.
5932       // Get the next thread from the pool.
5933       kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
5934       __kmp_thread_pool = thread->th.th_next_pool;
5935       // Reap it.
5936       KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
5937       thread->th.th_next_pool = NULL;
5938       thread->th.th_in_pool = FALSE;
5939       __kmp_reap_thread(thread, 0);
5940     }
5941     __kmp_thread_pool_insert_pt = NULL;
5942 
5943     // Reap teams.
    while (__kmp_team_pool != NULL) { // Loop through all teams in the pool.
5945       // Get the next team from the pool.
5946       kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
5947       __kmp_team_pool = team->t.t_next_pool;
5948       // Reap it.
5949       team->t.t_next_pool = NULL;
5950       __kmp_reap_team(team);
5951     }
5952 
5953     __kmp_reap_task_teams();
5954 
5955     for (i = 0; i < __kmp_threads_capacity; ++i) {
5956       // TBD: Add some checking...
5957       // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
5958     }
5959 
5960     /* Make sure all threadprivate destructors get run by joining with all
5961        worker threads before resetting this flag */
5962     TCW_SYNC_4(__kmp_init_common, FALSE);
5963 
5964     KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
5965     KMP_MB();
5966 
5967 #if KMP_USE_MONITOR
5968     // See note above: One of the possible fixes for CQ138434 / CQ140126
5969     //
5970     // FIXME: push both code fragments down and CSE them?
5971     // push them into __kmp_cleanup() ?
5972     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
5973     if (TCR_4(__kmp_init_monitor)) {
5974       __kmp_reap_monitor(&__kmp_monitor);
5975       TCW_4(__kmp_init_monitor, 0);
5976     }
5977     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
5978     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
5979 #endif
5980   } /* else !__kmp_global.t_active */
5981   TCW_4(__kmp_init_gtid, FALSE);
5982   KMP_MB(); /* Flush all pending memory write invalidates.  */
5983 
5984   __kmp_cleanup();
5985 #if OMPT_SUPPORT
5986   ompt_fini();
5987 #endif
5988 }
5989 
5990 void __kmp_internal_end_library(int gtid_req) {
5991   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
5992   /* this shouldn't be a race condition because __kmp_internal_end() is the
5993      only place to clear __kmp_serial_init */
5994   /* we'll check this later too, after we get the lock */
5995   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
  // redundant, because the next check will work in any case.
5997   if (__kmp_global.g.g_abort) {
5998     KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
5999     /* TODO abort? */
6000     return;
6001   }
6002   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6003     KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6004     return;
6005   }
6006 
6007   KMP_MB(); /* Flush all pending memory write invalidates.  */
6008 
6009   /* find out who we are and what we should do */
6010   {
6011     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6012     KA_TRACE(
6013         10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req));
6014     if (gtid == KMP_GTID_SHUTDOWN) {
6015       KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6016                     "already shutdown\n"));
6017       return;
6018     } else if (gtid == KMP_GTID_MONITOR) {
6019       KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6020                     "registered, or system shutdown\n"));
6021       return;
6022     } else if (gtid == KMP_GTID_DNE) {
6023       KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6024                     "shutdown\n"));
6025       /* we don't know who we are, but we may still shutdown the library */
6026     } else if (KMP_UBER_GTID(gtid)) {
6027       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6028       if (__kmp_root[gtid]->r.r_active) {
6029         __kmp_global.g.g_abort = -1;
6030         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6031         KA_TRACE(10,
6032                  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6033                   gtid));
6034         return;
6035       } else {
6036         KA_TRACE(
6037             10,
6038             ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6039         __kmp_unregister_root_current_thread(gtid);
6040       }
6041     } else {
6042 /* worker threads may call this function through the atexit handler, if they
6043  * call exit() */
6044 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6045    TODO: do a thorough shutdown instead */
6046 #ifdef DUMP_DEBUG_ON_EXIT
6047       if (__kmp_debug_buf)
6048         __kmp_dump_debug_buffer();
6049 #endif
6050       return;
6051     }
6052   }
6053   /* synchronize the termination process */
6054   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6055 
6056   /* have we already finished */
6057   if (__kmp_global.g.g_abort) {
6058     KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6059     /* TODO abort? */
6060     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6061     return;
6062   }
6063   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6064     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6065     return;
6066   }
6067 
6068   /* We need this lock to enforce mutex between this reading of
6069      __kmp_threads_capacity and the writing by __kmp_register_root.
6070      Alternatively, we can use a counter of roots that is atomically updated by
6071      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6072      __kmp_internal_end_*.  */
6073   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6074 
6075   /* now we can safely conduct the actual termination */
6076   __kmp_internal_end();
6077 
6078   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6079   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6080 
6081   KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6082 
6083 #ifdef DUMP_DEBUG_ON_EXIT
6084   if (__kmp_debug_buf)
6085     __kmp_dump_debug_buffer();
6086 #endif
6087 
6088 #if KMP_OS_WINDOWS
6089   __kmp_close_console();
6090 #endif
6091 
6092   __kmp_fini_allocator();
6093 
6094 } // __kmp_internal_end_library
6095 
6096 void __kmp_internal_end_thread(int gtid_req) {
6097   int i;
6098 
6099   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6100   /* this shouldn't be a race condition because __kmp_internal_end() is the
6101    * only place to clear __kmp_serial_init */
6102   /* we'll check this later too, after we get the lock */
6103   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6104   // redundant, because the next check will work in any case.
6105   if (__kmp_global.g.g_abort) {
6106     KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6107     /* TODO abort? */
6108     return;
6109   }
6110   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6111     KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6112     return;
6113   }
6114 
6115   KMP_MB(); /* Flush all pending memory write invalidates.  */
6116 
6117   /* find out who we are and what we should do */
6118   {
6119     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6120     KA_TRACE(10,
6121              ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req));
6122     if (gtid == KMP_GTID_SHUTDOWN) {
6123       KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6124                     "already shutdown\n"));
6125       return;
6126     } else if (gtid == KMP_GTID_MONITOR) {
6127       KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6128                     "registered, or system shutdown\n"));
6129       return;
6130     } else if (gtid == KMP_GTID_DNE) {
6131       KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6132                     "shutdown\n"));
6133       return;
6134       /* we don't know who we are */
6135     } else if (KMP_UBER_GTID(gtid)) {
6136       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6137       if (__kmp_root[gtid]->r.r_active) {
6138         __kmp_global.g.g_abort = -1;
6139         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6140         KA_TRACE(10,
6141                  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6142                   gtid));
6143         return;
6144       } else {
6145         KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6146                       gtid));
6147         __kmp_unregister_root_current_thread(gtid);
6148       }
6149     } else {
6150       /* just a worker thread, let's leave */
6151       KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6152 
6153       if (gtid >= 0) {
6154         __kmp_threads[gtid]->th.th_task_team = NULL;
6155       }
6156 
6157       KA_TRACE(10,
6158                ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6159                 gtid));
6160       return;
6161     }
6162   }
6163 #if defined KMP_DYNAMIC_LIB
  // AC: let's not shut down the Linux* OS dynamic library at the exit of an
  // uber thread, because it is better to shut down later in the library
  // destructor. The reason for this change is a performance problem when a
  // non-OpenMP thread repeatedly forks and joins many OpenMP threads. We can
  // save a lot of time by keeping worker threads alive until program shutdown.
6169   // OM: Removed Linux* OS restriction to fix the crash on OS X* (DPD200239966)
6170   // and Windows(DPD200287443) that occurs when using critical sections from
6171   // foreign threads.
6172   KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6173   return;
6174 #endif
6175   /* synchronize the termination process */
6176   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6177 
6178   /* have we already finished */
6179   if (__kmp_global.g.g_abort) {
6180     KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6181     /* TODO abort? */
6182     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6183     return;
6184   }
6185   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6186     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6187     return;
6188   }
6189 
6190   /* We need this lock to enforce mutex between this reading of
6191      __kmp_threads_capacity and the writing by __kmp_register_root.
6192      Alternatively, we can use a counter of roots that is atomically updated by
6193      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6194      __kmp_internal_end_*.  */
6195 
6196   /* should we finish the run-time?  are all siblings done? */
6197   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6198 
6199   for (i = 0; i < __kmp_threads_capacity; ++i) {
6200     if (KMP_UBER_GTID(i)) {
6201       KA_TRACE(
6202           10,
6203           ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6204       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6205       __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6206       return;
6207     }
6208   }
6209 
6210   /* now we can safely conduct the actual termination */
6211 
6212   __kmp_internal_end();
6213 
6214   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6215   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6216 
6217   KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6218 
6219 #ifdef DUMP_DEBUG_ON_EXIT
6220   if (__kmp_debug_buf)
6221     __kmp_dump_debug_buffer();
6222 #endif
6223 } // __kmp_internal_end_thread
6224 
6225 // -----------------------------------------------------------------------------
6226 // Library registration stuff.
6227 
6228 static long __kmp_registration_flag = 0;
6229 // Random value used to indicate library initialization.
6230 static char *__kmp_registration_str = NULL;
6231 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6232 
6233 static inline char *__kmp_reg_status_name() {
6234   /* On RHEL 3u5 if linked statically, getpid() returns different values in
6235      each thread. If registration and unregistration go in different threads
     (omp_misc_other_root_exit.cpp test case), the registered_lib_env env var
     cannot be found, because its name will contain a different pid. */
6238   return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
} // __kmp_reg_status_name
6240 
6241 void __kmp_register_library_startup(void) {
6242 
6243   char *name = __kmp_reg_status_name(); // Name of the environment variable.
6244   int done = 0;
6245   union {
6246     double dtime;
6247     long ltime;
6248   } time;
6249 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6250   __kmp_initialize_system_tick();
6251 #endif
6252   __kmp_read_system_time(&time.dtime);
6253   __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6254   __kmp_registration_str =
6255       __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6256                        __kmp_registration_flag, KMP_LIBRARY_FILE);
6257 
6258   KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6259                 __kmp_registration_str));
6260 
6261   while (!done) {
6262 
6263     char *value = NULL; // Actual value of the environment variable.
6264 
    // Set the environment variable, but do not overwrite an existing value.
6266     __kmp_env_set(name, __kmp_registration_str, 0);
    // Check that the variable was actually written.
6268     value = __kmp_env_get(name);
6269     if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6270 
6271       done = 1; // Ok, environment variable set successfully, exit the loop.
6272 
6273     } else {
6274 
      // Oops. Write failed. Another copy of the OpenMP RTL is in memory.
      // Check whether it is alive or dead.
6277       int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6278       char *tail = value;
6279       char *flag_addr_str = NULL;
6280       char *flag_val_str = NULL;
6281       char const *file_name = NULL;
6282       __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6283       __kmp_str_split(tail, '-', &flag_val_str, &tail);
6284       file_name = tail;
6285       if (tail != NULL) {
6286         long *flag_addr = 0;
6287         long flag_val = 0;
6288         KMP_SSCANF(flag_addr_str, "%p", &flag_addr);
6289         KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6290         if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6291           // First, check whether environment-encoded address is mapped into
6292           // addr space.
6293           // If so, dereference it to see if it still has the right value.
6294           if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6295             neighbor = 1;
6296           } else {
6297             // If not, then we know the other copy of the library is no longer
6298             // running.
6299             neighbor = 2;
6300           }
6301         }
6302       }
6303       switch (neighbor) {
6304       case 0: // Cannot parse environment variable -- neighbor status unknown.
        // Assume it is the incompatible format of a future version of the
        // library. Assume the other library is alive.
6307         // WARN( ... ); // TODO: Issue a warning.
6308         file_name = "unknown library";
      // Attention! Falling through to the next case. That's intentional.
6310       case 1: { // Neighbor is alive.
6311         // Check it is allowed.
6312         char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6313         if (!__kmp_str_match_true(duplicate_ok)) {
6314           // That's not allowed. Issue fatal error.
6315           __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6316                       KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6317         }
6318         KMP_INTERNAL_FREE(duplicate_ok);
6319         __kmp_duplicate_library_ok = 1;
6320         done = 1; // Exit the loop.
6321       } break;
6322       case 2: { // Neighbor is dead.
6323         // Clear the variable and try to register library again.
6324         __kmp_env_unset(name);
6325       } break;
6326       default: { KMP_DEBUG_ASSERT(0); } break;
6327       }
6328     }
6329     KMP_INTERNAL_FREE((void *)value);
6330   }
6331   KMP_INTERNAL_FREE((void *)name);
6332 
6333 } // func __kmp_register_library_startup
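
// Illustrative sketch (hypothetical helper, not compiled): the value stored
// in __KMP_REGISTERED_LIB_<pid> above has the form
// "<flag address>-<flag value>-<library file>". A previously registered copy
// is treated as alive only if the encoded address is still mapped in this
// process and still holds the encoded value; otherwise the variable is stale
// and is cleared so registration can be retried.
#if 0
static char *make_registration_value(long *flag, char const *lib_file) {
  return __kmp_str_format("%p-%lx-%s", flag, *flag, lib_file);
}
#endif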
6334 
6335 void __kmp_unregister_library(void) {
6336 
6337   char *name = __kmp_reg_status_name();
6338   char *value = __kmp_env_get(name);
6339 
6340   KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6341   KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6342   if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6343     // Ok, this is our variable. Delete it.
6344     __kmp_env_unset(name);
6345   }
6346 
6347   KMP_INTERNAL_FREE(__kmp_registration_str);
6348   KMP_INTERNAL_FREE(value);
6349   KMP_INTERNAL_FREE(name);
6350 
6351   __kmp_registration_flag = 0;
6352   __kmp_registration_str = NULL;
6353 
6354 } // __kmp_unregister_library
6355 
6356 // End of Library registration stuff.
6357 // -----------------------------------------------------------------------------
6358 
6359 #if KMP_MIC_SUPPORTED
6360 
6361 static void __kmp_check_mic_type() {
6362   kmp_cpuid_t cpuid_state = {0};
6363   kmp_cpuid_t *cs_p = &cpuid_state;
6364   __kmp_x86_cpuid(1, 0, cs_p);
6365   // We don't support mic1 at the moment
6366   if ((cs_p->eax & 0xff0) == 0xB10) {
6367     __kmp_mic_type = mic2;
6368   } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6369     __kmp_mic_type = mic3;
6370   } else {
6371     __kmp_mic_type = non_mic;
6372   }
6373 }
6374 
6375 #endif /* KMP_MIC_SUPPORTED */
6376 
6377 static void __kmp_do_serial_initialize(void) {
6378   int i, gtid;
6379   int size;
6380 
6381   KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6382 
6383   KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6384   KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6385   KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6386   KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6387   KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6388 
6389 #if OMPT_SUPPORT
6390   ompt_pre_init();
6391 #endif
6392 
6393   __kmp_validate_locks();
6394 
6395   /* Initialize internal memory allocator */
6396   __kmp_init_allocator();
6397 
6398   /* Register the library startup via an environment variable and check to see
6399      whether another copy of the library is already registered. */
6400 
6401   __kmp_register_library_startup();
6402 
6403   /* TODO reinitialization of library */
6404   if (TCR_4(__kmp_global.g.g_done)) {
6405     KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6406   }
6407 
6408   __kmp_global.g.g_abort = 0;
6409   TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6410 
6411 /* initialize the locks */
6412 #if KMP_USE_ADAPTIVE_LOCKS
6413 #if KMP_DEBUG_ADAPTIVE_LOCKS
6414   __kmp_init_speculative_stats();
6415 #endif
6416 #endif
6417 #if KMP_STATS_ENABLED
6418   __kmp_stats_init();
6419 #endif
6420   __kmp_init_lock(&__kmp_global_lock);
6421   __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6422   __kmp_init_lock(&__kmp_debug_lock);
6423   __kmp_init_atomic_lock(&__kmp_atomic_lock);
6424   __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6425   __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6426   __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6427   __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6428   __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6429   __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6430   __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6431   __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6432   __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6433   __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6434   __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6435   __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6436   __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6437   __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6438 #if KMP_USE_MONITOR
6439   __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6440 #endif
6441   __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6442 
6443   /* conduct initialization and initial setup of configuration */
6444 
6445   __kmp_runtime_initialize();
6446 
6447 #if KMP_MIC_SUPPORTED
6448   __kmp_check_mic_type();
6449 #endif
6450 
6451 // Some global variable initialization moved here from kmp_env_initialize()
6452 #ifdef KMP_DEBUG
6453   kmp_diag = 0;
6454 #endif
6455   __kmp_abort_delay = 0;
6456 
6457   // From __kmp_init_dflt_team_nth()
6458   /* assume the entire machine will be used */
6459   __kmp_dflt_team_nth_ub = __kmp_xproc;
6460   if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6461     __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6462   }
6463   if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6464     __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6465   }
6466   __kmp_max_nth = __kmp_sys_max_nth;
6467   __kmp_cg_max_nth = __kmp_sys_max_nth;
6468   __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6469   if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6470     __kmp_teams_max_nth = __kmp_sys_max_nth;
6471   }
6472 
6473   // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6474   // part
6475   __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6476 #if KMP_USE_MONITOR
6477   __kmp_monitor_wakeups =
6478       KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6479   __kmp_bt_intervals =
6480       KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6481 #endif
6482   // From "KMP_LIBRARY" part of __kmp_env_initialize()
6483   __kmp_library = library_throughput;
6484   // From KMP_SCHEDULE initialization
6485   __kmp_static = kmp_sch_static_balanced;
// AC: do not use analytical here, because it is non-monotonic
6487 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6488 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6489 // need to repeat assignment
6490 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6491 // bit control and barrier method control parts
6492 #if KMP_FAST_REDUCTION_BARRIER
6493 #define kmp_reduction_barrier_gather_bb ((int)1)
6494 #define kmp_reduction_barrier_release_bb ((int)1)
6495 #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6496 #define kmp_reduction_barrier_release_pat bp_hyper_bar
6497 #endif // KMP_FAST_REDUCTION_BARRIER
6498   for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6499     __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6500     __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6501     __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6502     __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6503 #if KMP_FAST_REDUCTION_BARRIER
6504     if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
6505       // lin_64 ): hyper,1
6506       __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6507       __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6508       __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6509       __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6510     }
6511 #endif // KMP_FAST_REDUCTION_BARRIER
6512   }
6513 #if KMP_FAST_REDUCTION_BARRIER
6514 #undef kmp_reduction_barrier_release_pat
6515 #undef kmp_reduction_barrier_gather_pat
6516 #undef kmp_reduction_barrier_release_bb
6517 #undef kmp_reduction_barrier_gather_bb
6518 #endif // KMP_FAST_REDUCTION_BARRIER
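  // Note: for the tree/hyper barrier patterns the branch bits above encode the
  // barrier fan-out as 2^bits; these defaults can be overridden later by the
  // KMP_*_BARRIER environment settings parsed in __kmp_env_initialize().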
6519 #if KMP_MIC_SUPPORTED
6520   if (__kmp_mic_type == mic2) { // KNC
6521     // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6522     __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6523     __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6524         1; // forkjoin release
6525     __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6526     __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6527   }
6528 #if KMP_FAST_REDUCTION_BARRIER
6529   if (__kmp_mic_type == mic2) { // KNC
6530     __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6531     __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6532   }
6533 #endif // KMP_FAST_REDUCTION_BARRIER
6534 #endif // KMP_MIC_SUPPORTED
6535 
6536 // From KMP_CHECKS initialization
6537 #ifdef KMP_DEBUG
6538   __kmp_env_checks = TRUE; /* development versions have the extra checks */
6539 #else
6540   __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6541 #endif
6542 
6543   // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6544   __kmp_foreign_tp = TRUE;
6545 
6546   __kmp_global.g.g_dynamic = FALSE;
6547   __kmp_global.g.g_dynamic_mode = dynamic_default;
6548 
6549   __kmp_env_initialize(NULL);
6550 
6551 // Print all messages in message catalog for testing purposes.
6552 #ifdef KMP_DEBUG
6553   char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6554   if (__kmp_str_match_true(val)) {
6555     kmp_str_buf_t buffer;
6556     __kmp_str_buf_init(&buffer);
6557     __kmp_i18n_dump_catalog(&buffer);
6558     __kmp_printf("%s", buffer.str);
6559     __kmp_str_buf_free(&buffer);
6560   }
6561   __kmp_env_free(&val);
6562 #endif
6563 
6564   __kmp_threads_capacity =
6565       __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6566   // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6567   __kmp_tp_capacity = __kmp_default_tp_capacity(
6568       __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6569 
6570   // If the library is shut down properly, both pools must be NULL. Just in
6571   // case, set them to NULL -- some memory may leak, but subsequent code will
6572   // work even if pools are not freed.
6573   KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6574   KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6575   KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6576   __kmp_thread_pool = NULL;
6577   __kmp_thread_pool_insert_pt = NULL;
6578   __kmp_team_pool = NULL;
6579 
6580   /* Allocate all of the variable sized records */
6581   /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6582    * expandable */
6583   /* Since allocation is cache-aligned, just add extra padding at the end */
6584   size =
6585       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6586       CACHE_LINE;
6587   __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6588   __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6589                                sizeof(kmp_info_t *) * __kmp_threads_capacity);
6590 
6591   /* init thread counts */
6592   KMP_DEBUG_ASSERT(__kmp_all_nth ==
6593                    0); // Asserts fail if the library is reinitializing and
6594   KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
6595   __kmp_all_nth = 0;
6596   __kmp_nth = 0;
6597 
6598   /* setup the uber master thread and hierarchy */
6599   gtid = __kmp_register_root(TRUE);
6600   KA_TRACE(10, ("__kmp_do_serial_initialize  T#%d\n", gtid));
6601   KMP_ASSERT(KMP_UBER_GTID(gtid));
6602   KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6603 
6604   KMP_MB(); /* Flush all pending memory write invalidates.  */
6605 
6606   __kmp_common_initialize();
6607 
6608 #if KMP_OS_UNIX
6609   /* invoke the child fork handler */
6610   __kmp_register_atfork();
6611 #endif
6612 
6613 #if !defined KMP_DYNAMIC_LIB
6614   {
6615     /* Invoke the exit handler when the program finishes, only for static
6616        library. For dynamic library, we already have _fini and DllMain. */
6617     int rc = atexit(__kmp_internal_end_atexit);
6618     if (rc != 0) {
6619       __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6620                   __kmp_msg_null);
6621     }
6622   }
6623 #endif
6624 
6625 #if KMP_HANDLE_SIGNALS
6626 #if KMP_OS_UNIX
6627   /* NOTE: make sure that this is called before the user installs their own
6628      signal handlers so that the user handlers are called first. this way they
6629      can return false, not call our handler, avoid terminating the library, and
6630      continue execution where they left off. */
6631   __kmp_install_signals(FALSE);
6632 #endif /* KMP_OS_UNIX */
6633 #if KMP_OS_WINDOWS
6634   __kmp_install_signals(TRUE);
6635 #endif /* KMP_OS_WINDOWS */
6636 #endif
6637 
6638   /* we have finished the serial initialization */
6639   __kmp_init_counter++;
6640 
6641   __kmp_init_serial = TRUE;
6642 
6643   if (__kmp_settings) {
6644     __kmp_env_print();
6645   }
6646 
6647 #if OMP_40_ENABLED
6648   if (__kmp_display_env || __kmp_display_env_verbose) {
6649     __kmp_env_print_2();
6650   }
6651 #endif // OMP_40_ENABLED
6652 
6653 #if OMPT_SUPPORT
6654   ompt_post_init();
6655 #endif
6656 
6657   KMP_MB();
6658 
6659   KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
6660 }
6661 
6662 void __kmp_serial_initialize(void) {
6663   if (__kmp_init_serial) {
6664     return;
6665   }
6666   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6667   if (__kmp_init_serial) {
6668     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6669     return;
6670   }
6671   __kmp_do_serial_initialize();
6672   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6673 }
6674 
6675 static void __kmp_do_middle_initialize(void) {
6676   int i, j;
6677   int prev_dflt_team_nth;
6678 
6679   if (!__kmp_init_serial) {
6680     __kmp_do_serial_initialize();
6681   }
6682 
6683   KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
6684 
6685   // Save the previous value for the __kmp_dflt_team_nth so that
6686   // we can avoid some reinitialization if it hasn't changed.
6687   prev_dflt_team_nth = __kmp_dflt_team_nth;
6688 
6689 #if KMP_AFFINITY_SUPPORTED
6690   // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6691   // number of cores on the machine.
6692   __kmp_affinity_initialize();
6693 
6694   // Run through the __kmp_threads array and set the affinity mask
6695   // for each root thread that is currently registered with the RTL.
6696   for (i = 0; i < __kmp_threads_capacity; i++) {
6697     if (TCR_PTR(__kmp_threads[i]) != NULL) {
6698       __kmp_affinity_set_init_mask(i, TRUE);
6699     }
6700   }
6701 #endif /* KMP_AFFINITY_SUPPORTED */
6702 
6703   KMP_ASSERT(__kmp_xproc > 0);
6704   if (__kmp_avail_proc == 0) {
6705     __kmp_avail_proc = __kmp_xproc;
6706   }
6707 
6708   // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
6709   // correct them now
6710   j = 0;
6711   while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
6712     __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
6713         __kmp_avail_proc;
6714     j++;
6715   }
6716 
6717   if (__kmp_dflt_team_nth == 0) {
6718 #ifdef KMP_DFLT_NTH_CORES
6719     // Default #threads = #cores
6720     __kmp_dflt_team_nth = __kmp_ncores;
6721     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6722                   "__kmp_ncores (%d)\n",
6723                   __kmp_dflt_team_nth));
6724 #else
6725     // Default #threads = #available OS procs
6726     __kmp_dflt_team_nth = __kmp_avail_proc;
6727     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6728                   "__kmp_avail_proc(%d)\n",
6729                   __kmp_dflt_team_nth));
6730 #endif /* KMP_DFLT_NTH_CORES */
6731   }
6732 
6733   if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
6734     __kmp_dflt_team_nth = KMP_MIN_NTH;
6735   }
6736   if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
6737     __kmp_dflt_team_nth = __kmp_sys_max_nth;
6738   }
6739 
6740   // There's no harm in continuing if the following check fails,
6741   // but it indicates an error in the previous logic.
6742   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
6743 
6744   if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
6745     // Run through the __kmp_threads array and set the num threads icv for each
6746     // root thread that is currently registered with the RTL (which has not
6747     // already explicitly set its nthreads-var with a call to
6748     // omp_set_num_threads()).
6749     for (i = 0; i < __kmp_threads_capacity; i++) {
6750       kmp_info_t *thread = __kmp_threads[i];
6751       if (thread == NULL)
6752         continue;
6753       if (thread->th.th_current_task->td_icvs.nproc != 0)
6754         continue;
6755 
6756       set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
6757     }
6758   }
6759   KA_TRACE(
6760       20,
6761       ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
6762        __kmp_dflt_team_nth));
6763 
6764 #ifdef KMP_ADJUST_BLOCKTIME
6765   /* Adjust blocktime to zero if necessary  now that __kmp_avail_proc is set */
6766   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6767     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6768     if (__kmp_nth > __kmp_avail_proc) {
6769       __kmp_zero_bt = TRUE;
6770     }
6771   }
6772 #endif /* KMP_ADJUST_BLOCKTIME */
6773 
6774   /* we have finished middle initialization */
6775   TCW_SYNC_4(__kmp_init_middle, TRUE);
6776 
6777   KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
6778 }
6779 
6780 void __kmp_middle_initialize(void) {
6781   if (__kmp_init_middle) {
6782     return;
6783   }
6784   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6785   if (__kmp_init_middle) {
6786     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6787     return;
6788   }
6789   __kmp_do_middle_initialize();
6790   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6791 }
6792 
6793 void __kmp_parallel_initialize(void) {
6794   int gtid = __kmp_entry_gtid(); // this might be a new root
6795 
6796   /* synchronize parallel initialization (for sibling) */
6797   if (TCR_4(__kmp_init_parallel))
6798     return;
6799   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6800   if (TCR_4(__kmp_init_parallel)) {
6801     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6802     return;
6803   }
6804 
6805   /* TODO reinitialization after we have already shut down */
6806   if (TCR_4(__kmp_global.g.g_done)) {
6807     KA_TRACE(
6808         10,
6809         ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
6810     __kmp_infinite_loop();
6811   }
6812 
6813   /* jc: The lock __kmp_initz_lock is already held, so calling
6814      __kmp_serial_initialize would cause a deadlock.  So we call
6815      __kmp_do_serial_initialize directly. */
6816   if (!__kmp_init_middle) {
6817     __kmp_do_middle_initialize();
6818   }
6819 
6820   /* begin initialization */
6821   KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
6822   KMP_ASSERT(KMP_UBER_GTID(gtid));
6823 
6824 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6825   // Save the FP control regs.
6826   // Worker threads will set theirs to these values at thread startup.
6827   __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
6828   __kmp_store_mxcsr(&__kmp_init_mxcsr);
6829   __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
6830 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
6831 
6832 #if KMP_OS_UNIX
6833 #if KMP_HANDLE_SIGNALS
6834   /*  must be after __kmp_serial_initialize  */
6835   __kmp_install_signals(TRUE);
6836 #endif
6837 #endif
6838 
6839   __kmp_suspend_initialize();
6840 
6841 #if defined(USE_LOAD_BALANCE)
6842   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6843     __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
6844   }
6845 #else
6846   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6847     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
6848   }
6849 #endif
6850 
6851   if (__kmp_version) {
6852     __kmp_print_version_2();
6853   }
6854 
6855   /* we have finished parallel initialization */
6856   TCW_SYNC_4(__kmp_init_parallel, TRUE);
6857 
6858   KMP_MB();
6859   KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
6860 
6861   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6862 }
6863 
6864 /* ------------------------------------------------------------------------ */
6865 
6866 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
6867                                    kmp_team_t *team) {
6868   kmp_disp_t *dispatch;
6869 
6870   KMP_MB();
6871 
6872   /* none of the threads have encountered any constructs, yet. */
6873   this_thr->th.th_local.this_construct = 0;
6874 #if KMP_CACHE_MANAGE
6875   KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
6876 #endif /* KMP_CACHE_MANAGE */
6877   dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
6878   KMP_DEBUG_ASSERT(dispatch);
6879   KMP_DEBUG_ASSERT(team->t.t_dispatch);
6880   // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
6881   // this_thr->th.th_info.ds.ds_tid ] );
6882 
6883   dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
6884 #if OMP_45_ENABLED
6885   dispatch->th_doacross_buf_idx =
6886       0; /* reset the doacross dispatch buffer counter */
6887 #endif
6888   if (__kmp_env_consistency_check)
6889     __kmp_push_parallel(gtid, team->t.t_ident);
6890 
6891   KMP_MB(); /* Flush all pending memory write invalidates.  */
6892 }
6893 
6894 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
6895                                   kmp_team_t *team) {
6896   if (__kmp_env_consistency_check)
6897     __kmp_pop_parallel(gtid, team->t.t_ident);
6898 
6899   __kmp_finish_implicit_task(this_thr);
6900 }
6901 
6902 int __kmp_invoke_task_func(int gtid) {
6903   int rc;
6904   int tid = __kmp_tid_from_gtid(gtid);
6905   kmp_info_t *this_thr = __kmp_threads[gtid];
6906   kmp_team_t *team = this_thr->th.th_team;
6907 
6908   __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
6909 #if USE_ITT_BUILD
6910   if (__itt_stack_caller_create_ptr) {
6911     __kmp_itt_stack_callee_enter(
6912         (__itt_caller)
6913             team->t.t_stack_id); // inform ittnotify about entering user's code
6914   }
6915 #endif /* USE_ITT_BUILD */
6916 #if INCLUDE_SSC_MARKS
6917   SSC_MARK_INVOKING();
6918 #endif
6919 
6920 #if OMPT_SUPPORT
6921   void *dummy;
6922   void **exit_runtime_p;
6923   ompt_data_t *my_task_data;
6924   ompt_data_t *my_parallel_data;
6925   int ompt_team_size;
6926 
6927   if (ompt_enabled.enabled) {
6928     exit_runtime_p = &(
6929         team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_frame);
6930   } else {
6931     exit_runtime_p = &dummy;
6932   }
6933 
6934   my_task_data =
6935       &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
6936   my_parallel_data = &(team->t.ompt_team_info.parallel_data);
6937   if (ompt_enabled.ompt_callback_implicit_task) {
6938     ompt_team_size = team->t.t_nproc;
6939     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
6940         ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
6941         __kmp_tid_from_gtid(gtid));
6942   }
6943 #endif
6944 
6945   {
6946     KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
6947     KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
6948     rc =
6949         __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
6950                                tid, (int)team->t.t_argc, (void **)team->t.t_argv
6951 #if OMPT_SUPPORT
6952                                ,
6953                                exit_runtime_p
6954 #endif
6955                                );
6956 #if OMPT_SUPPORT
6957     *exit_runtime_p = NULL;
6958 #endif
6959   }
6960 
6961 #if USE_ITT_BUILD
6962   if (__itt_stack_caller_create_ptr) {
6963     __kmp_itt_stack_callee_leave(
6964         (__itt_caller)
6965             team->t.t_stack_id); // inform ittnotify about leaving user's code
6966   }
6967 #endif /* USE_ITT_BUILD */
6968   __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
6969 
6970   return rc;
6971 }
6972 
6973 #if OMP_40_ENABLED
6974 void __kmp_teams_master(int gtid) {
6975   // This routine is called by all master threads in teams construct
6976   kmp_info_t *thr = __kmp_threads[gtid];
6977   kmp_team_t *team = thr->th.th_team;
6978   ident_t *loc = team->t.t_ident;
6979   thr->th.th_set_nproc = thr->th.th_teams_size.nth;
6980   KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
6981   KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
6982   KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
6983                 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
// Launch the league of teams now, but do not let workers execute
// (they hang on the fork barrier until the next parallel region)
6986 #if INCLUDE_SSC_MARKS
6987   SSC_MARK_FORKING();
6988 #endif
6989   __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
6990                   (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
6991                   VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
6992 #if INCLUDE_SSC_MARKS
6993   SSC_MARK_JOINING();
6994 #endif
6995 
  // AC: the last parameter "1" eliminates the join barrier, which would not
  // work here because the worker threads are waiting at the fork barrier for
  // more parallel regions
6998   __kmp_join_call(loc, gtid
6999 #if OMPT_SUPPORT
7000                   ,
7001                   fork_context_intel
7002 #endif
7003                   ,
7004                   1);
7005 }
7006 
7007 int __kmp_invoke_teams_master(int gtid) {
7008   kmp_info_t *this_thr = __kmp_threads[gtid];
7009   kmp_team_t *team = this_thr->th.th_team;
7010 #if KMP_DEBUG
7011   if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7012     KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7013                      (void *)__kmp_teams_master);
7014 #endif
7015   __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7016   __kmp_teams_master(gtid);
7017   __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7018   return 1;
7019 }
7020 #endif /* OMP_40_ENABLED */
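
// Illustrative sketch of the teams launch path (the exact lowering is an
// assumption here; the compiler-facing entry is expected to be
// __kmpc_fork_teams() in kmp_csupport.cpp):
//
//   #pragma omp teams num_teams(4) thread_limit(2)
//   { body(); }
//
// is roughly lowered to
//
//   __kmpc_push_num_teams(&loc, gtid, 4, 2);
//   __kmpc_fork_teams(&loc, 0, outlined_body);
//
// __kmpc_fork_teams() records the outlined body in th_teams_microtask and
// forks the outer parallel (one thread per team) with __kmp_teams_master /
// __kmp_invoke_teams_master above; each team master then forks its own team
// of th_teams_size.nth threads to run the body.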
7021 
/* This sets the requested number of threads for the next parallel region
   encountered by this team. Since this should be enclosed in the fork/join
   critical section, it should avoid race conditions with asymmetrical nested
   parallelism. */
7026 
7027 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7028   kmp_info_t *thr = __kmp_threads[gtid];
7029 
7030   if (num_threads > 0)
7031     thr->th.th_set_nproc = num_threads;
7032 }
7033 
7034 #if OMP_40_ENABLED
7035 
/* This sets the requested number of teams for the teams region and/or
   the number of threads for the next parallel region encountered. */
7038 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7039                           int num_threads) {
7040   kmp_info_t *thr = __kmp_threads[gtid];
7041   KMP_DEBUG_ASSERT(num_teams >= 0);
7042   KMP_DEBUG_ASSERT(num_threads >= 0);
7043 
7044   if (num_teams == 0)
7045     num_teams = 1; // default number of teams is 1.
  if (num_teams > __kmp_teams_max_nth) { // too many teams requested
7047     if (!__kmp_reserve_warn) {
7048       __kmp_reserve_warn = 1;
7049       __kmp_msg(kmp_ms_warning,
7050                 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7051                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7052     }
7053     num_teams = __kmp_teams_max_nth;
7054   }
7055   // Set number of teams (number of threads in the outer "parallel" of the
7056   // teams)
7057   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7058 
7059   // Remember the number of threads for inner parallel regions
7060   if (num_threads == 0) {
7061     if (!TCR_4(__kmp_init_middle))
7062       __kmp_middle_initialize(); // get __kmp_avail_proc calculated
7063     num_threads = __kmp_avail_proc / num_teams;
7064     if (num_teams * num_threads > __kmp_teams_max_nth) {
      // adjust num_threads without a warning since it is not a user setting
7066       num_threads = __kmp_teams_max_nth / num_teams;
7067     }
7068   } else {
7069     if (num_teams * num_threads > __kmp_teams_max_nth) {
7070       int new_threads = __kmp_teams_max_nth / num_teams;
7071       if (!__kmp_reserve_warn) { // user asked for too many threads
7072         __kmp_reserve_warn = 1; // that conflicts with KMP_TEAMS_THREAD_LIMIT
7073         __kmp_msg(kmp_ms_warning,
7074                   KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7075                   KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7076       }
7077       num_threads = new_threads;
7078     }
7079   }
7080   thr->th.th_teams_size.nth = num_threads;
7081 }
7082 
7083 // Set the proc_bind var to use in the following parallel region.
7084 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7085   kmp_info_t *thr = __kmp_threads[gtid];
7086   thr->th.th_set_proc_bind = proc_bind;
7087 }
7088 
7089 #endif /* OMP_40_ENABLED */
7090 
7091 /* Launch the worker threads into the microtask. */
7092 
7093 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7094   kmp_info_t *this_thr = __kmp_threads[gtid];
7095 
7096 #ifdef KMP_DEBUG
7097   int f;
7098 #endif /* KMP_DEBUG */
7099 
7100   KMP_DEBUG_ASSERT(team);
7101   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7102   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7103   KMP_MB(); /* Flush all pending memory write invalidates.  */
7104 
7105   team->t.t_construct = 0; /* no single directives seen yet */
7106   team->t.t_ordered.dt.t_value =
7107       0; /* thread 0 enters the ordered section first */
7108 
7109   /* Reset the identifiers on the dispatch buffer */
7110   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7111   if (team->t.t_max_nproc > 1) {
7112     int i;
7113     for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7114       team->t.t_disp_buffer[i].buffer_index = i;
7115 #if OMP_45_ENABLED
7116       team->t.t_disp_buffer[i].doacross_buf_idx = i;
7117 #endif
7118     }
7119   } else {
7120     team->t.t_disp_buffer[0].buffer_index = 0;
7121 #if OMP_45_ENABLED
7122     team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7123 #endif
7124   }
7125 
7126   KMP_MB(); /* Flush all pending memory write invalidates.  */
7127   KMP_ASSERT(this_thr->th.th_team == team);
7128 
7129 #ifdef KMP_DEBUG
7130   for (f = 0; f < team->t.t_nproc; f++) {
7131     KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7132                      team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7133   }
7134 #endif /* KMP_DEBUG */
7135 
7136   /* release the worker threads so they may begin working */
7137   __kmp_fork_barrier(gtid, 0);
7138 }
7139 
7140 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7141   kmp_info_t *this_thr = __kmp_threads[gtid];
7142 
7143   KMP_DEBUG_ASSERT(team);
7144   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7145   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7146   KMP_MB(); /* Flush all pending memory write invalidates.  */
7147 
7148 /* Join barrier after fork */
7149 
7150 #ifdef KMP_DEBUG
7151   if (__kmp_threads[gtid] &&
7152       __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7153     __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7154                  __kmp_threads[gtid]);
7155     __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7156                  "team->t.t_nproc=%d\n",
7157                  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7158                  team->t.t_nproc);
7159     __kmp_print_structure();
7160   }
7161   KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7162                    __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7163 #endif /* KMP_DEBUG */
7164 
7165   __kmp_join_barrier(gtid); /* wait for everyone */
7166 #if OMPT_SUPPORT
7167   int ds_tid = this_thr->th.th_info.ds.ds_tid;
7168   if (this_thr->th.ompt_thread_info.state == omp_state_wait_barrier_implicit) {
7169     ompt_data_t *tId = OMPT_CUR_TASK_DATA(this_thr);
7170     ompt_data_t *pId = OMPT_CUR_TEAM_DATA(this_thr);
7171     this_thr->th.ompt_thread_info.state = omp_state_overhead;
7172 #if OMPT_OPTIONAL
7173     void *codeptr = NULL;
7174     if (KMP_MASTER_TID(ds_tid) &&
7175         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7176          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7177       codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7178 
7179     if (ompt_enabled.ompt_callback_sync_region_wait) {
7180       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7181           ompt_sync_region_barrier, ompt_scope_end, pId, tId, codeptr);
7182     }
7183     if (ompt_enabled.ompt_callback_sync_region) {
7184       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7185           ompt_sync_region_barrier, ompt_scope_end, pId, tId, codeptr);
7186     }
7187 #endif
7188     if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7189       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7190           ompt_scope_end, NULL, tId, 0, ds_tid);
7191     }
7192     // return to idle state
7193     this_thr->th.ompt_thread_info.state = omp_state_overhead;
7194   }
7195 #endif
7196 
7197   KMP_MB(); /* Flush all pending memory write invalidates.  */
7198   KMP_ASSERT(this_thr->th.th_team == team);
7199 }
7200 
7201 /* ------------------------------------------------------------------------ */
7202 
7203 #ifdef USE_LOAD_BALANCE
7204 
// Return the number of worker threads actively spinning in the hot team, if we
// are at the outermost level of parallelism.  Otherwise, return 0.
7207 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7208   int i;
7209   int retval;
7210   kmp_team_t *hot_team;
7211 
7212   if (root->r.r_active) {
7213     return 0;
7214   }
7215   hot_team = root->r.r_hot_team;
7216   if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7217     return hot_team->t.t_nproc - 1; // Don't count master thread
7218   }
7219 
7220   // Skip the master thread - it is accounted for elsewhere.
7221   retval = 0;
7222   for (i = 1; i < hot_team->t.t_nproc; i++) {
7223     if (hot_team->t.t_threads[i]->th.th_active) {
7224       retval++;
7225     }
7226   }
7227   return retval;
7228 }
7229 
7230 // Perform an automatic adjustment to the number of
7231 // threads used by the next parallel region.
7232 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7233   int retval;
7234   int pool_active;
7235   int hot_team_active;
7236   int team_curr_active;
7237   int system_active;
7238 
7239   KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7240                 set_nproc));
7241   KMP_DEBUG_ASSERT(root);
7242   KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7243                        ->th.th_current_task->td_icvs.dynamic == TRUE);
7244   KMP_DEBUG_ASSERT(set_nproc > 1);
7245 
7246   if (set_nproc == 1) {
7247     KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7248     return 1;
7249   }
7250 
  // Threads that are active in the thread pool, active in the hot team for this
  // particular root (if we are at the outermost parallel level), and the
  // currently executing thread (which will become the master) are available to
  // add to the new team, but they are currently contributing to the system
  // load and must be accounted for.
7256   pool_active = TCR_4(__kmp_thread_pool_active_nth);
7257   hot_team_active = __kmp_active_hot_team_nproc(root);
7258   team_curr_active = pool_active + hot_team_active + 1;
7259 
7260   // Check the system load.
7261   system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7262   KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7263                 "hot team active = %d\n",
7264                 system_active, pool_active, hot_team_active));
7265 
7266   if (system_active < 0) {
7267     // There was an error reading the necessary info from /proc, so use the
7268     // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7269     // = dynamic_thread_limit, we shouldn't wind up getting back here.
7270     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7271     KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7272 
7273     // Make this call behave like the thread limit algorithm.
7274     retval = __kmp_avail_proc - __kmp_nth +
7275              (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7276     if (retval > set_nproc) {
7277       retval = set_nproc;
7278     }
7279     if (retval < KMP_MIN_NTH) {
7280       retval = KMP_MIN_NTH;
7281     }
7282 
7283     KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7284                   retval));
7285     return retval;
7286   }
7287 
  // There is a slight delay in the load balance algorithm in detecting new
  // running procs. The real system load at this instant should be at least as
  // large as the number of active OpenMP threads that are available to add to
  // the team.
7291   if (system_active < team_curr_active) {
7292     system_active = team_curr_active;
7293   }
7294   retval = __kmp_avail_proc - system_active + team_curr_active;
7295   if (retval > set_nproc) {
7296     retval = set_nproc;
7297   }
7298   if (retval < KMP_MIN_NTH) {
7299     retval = KMP_MIN_NTH;
7300   }
7301 
7302   KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7303   return retval;
7304 } // __kmp_load_balance_nproc()
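
// Worked example (illustrative numbers only): with __kmp_avail_proc == 8,
// pool_active == 2 and hot_team_active == 1, team_curr_active == 4. If
// __kmp_get_load_balance() reports system_active == 10, then
//   retval = 8 - 10 + 4 = 2,
// i.e. the oversubscribed machine gets a team of only two threads (including
// the master); the result is then clamped to [KMP_MIN_NTH, set_nproc].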
7305 
7306 #endif /* USE_LOAD_BALANCE */
7307 
7308 /* ------------------------------------------------------------------------ */
7309 
7310 /* NOTE: this is called with the __kmp_init_lock held */
7311 void __kmp_cleanup(void) {
7312   int f;
7313 
7314   KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7315 
7316   if (TCR_4(__kmp_init_parallel)) {
7317 #if KMP_HANDLE_SIGNALS
7318     __kmp_remove_signals();
7319 #endif
7320     TCW_4(__kmp_init_parallel, FALSE);
7321   }
7322 
7323   if (TCR_4(__kmp_init_middle)) {
7324 #if KMP_AFFINITY_SUPPORTED
7325     __kmp_affinity_uninitialize();
7326 #endif /* KMP_AFFINITY_SUPPORTED */
7327     __kmp_cleanup_hierarchy();
7328     TCW_4(__kmp_init_middle, FALSE);
7329   }
7330 
7331   KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7332 
7333   if (__kmp_init_serial) {
7334     __kmp_runtime_destroy();
7335     __kmp_init_serial = FALSE;
7336   }
7337 
7338   for (f = 0; f < __kmp_threads_capacity; f++) {
7339     if (__kmp_root[f] != NULL) {
7340       __kmp_free(__kmp_root[f]);
7341       __kmp_root[f] = NULL;
7342     }
7343   }
7344   __kmp_free(__kmp_threads);
  // __kmp_threads and __kmp_root were allocated at once, as a single block, so
  // there is no need to free __kmp_root separately.
7347   __kmp_threads = NULL;
7348   __kmp_root = NULL;
7349   __kmp_threads_capacity = 0;
7350 
7351 #if KMP_USE_DYNAMIC_LOCK
7352   __kmp_cleanup_indirect_user_locks();
7353 #else
7354   __kmp_cleanup_user_locks();
7355 #endif
7356 
7357 #if KMP_AFFINITY_SUPPORTED
7358   KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7359   __kmp_cpuinfo_file = NULL;
7360 #endif /* KMP_AFFINITY_SUPPORTED */
7361 
7362 #if KMP_USE_ADAPTIVE_LOCKS
7363 #if KMP_DEBUG_ADAPTIVE_LOCKS
7364   __kmp_print_speculative_stats();
7365 #endif
7366 #endif
7367   KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7368   __kmp_nested_nth.nth = NULL;
7369   __kmp_nested_nth.size = 0;
7370   __kmp_nested_nth.used = 0;
7371   KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7372   __kmp_nested_proc_bind.bind_types = NULL;
7373   __kmp_nested_proc_bind.size = 0;
7374   __kmp_nested_proc_bind.used = 0;
7375 
7376   __kmp_i18n_catclose();
7377 
7378 #if KMP_STATS_ENABLED
7379   __kmp_stats_fini();
7380 #endif
7381 
7382   KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7383 }
7384 
7385 /* ------------------------------------------------------------------------ */
7386 
7387 int __kmp_ignore_mppbeg(void) {
7388   char *env;
7389 
7390   if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7391     if (__kmp_str_match_false(env))
7392       return FALSE;
7393   }
7394   // By default __kmpc_begin() is no-op.
7395   return TRUE;
7396 }
7397 
7398 int __kmp_ignore_mppend(void) {
7399   char *env;
7400 
7401   if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7402     if (__kmp_str_match_false(env))
7403       return FALSE;
7404   }
7405   // By default __kmpc_end() is no-op.
7406   return TRUE;
7407 }
7408 
7409 void __kmp_internal_begin(void) {
7410   int gtid;
7411   kmp_root_t *root;
7412 
  /* This is a very important step, as it registers new sibling threads
     and assigns each new uber thread a gtid. */
7415   gtid = __kmp_entry_gtid();
7416   root = __kmp_threads[gtid]->th.th_root;
7417   KMP_ASSERT(KMP_UBER_GTID(gtid));
7418 
7419   if (root->r.r_begin)
7420     return;
7421   __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7422   if (root->r.r_begin) {
7423     __kmp_release_lock(&root->r.r_begin_lock, gtid);
7424     return;
7425   }
7426 
7427   root->r.r_begin = TRUE;
7428 
7429   __kmp_release_lock(&root->r.r_begin_lock, gtid);
7430 }
7431 
7432 /* ------------------------------------------------------------------------ */
7433 
7434 void __kmp_user_set_library(enum library_type arg) {
7435   int gtid;
7436   kmp_root_t *root;
7437   kmp_info_t *thread;
7438 
7439   /* first, make sure we are initialized so we can get our gtid */
7440 
7441   gtid = __kmp_entry_gtid();
7442   thread = __kmp_threads[gtid];
7443 
7444   root = thread->th.th_root;
7445 
7446   KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7447                 library_serial));
7448   if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7449                                   thread */
7450     KMP_WARNING(SetLibraryIncorrectCall);
7451     return;
7452   }
7453 
7454   switch (arg) {
7455   case library_serial:
7456     thread->th.th_set_nproc = 0;
7457     set__nproc(thread, 1);
7458     break;
7459   case library_turnaround:
7460     thread->th.th_set_nproc = 0;
7461     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7462                                            : __kmp_dflt_team_nth_ub);
7463     break;
7464   case library_throughput:
7465     thread->th.th_set_nproc = 0;
7466     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7467                                            : __kmp_dflt_team_nth_ub);
7468     break;
7469   default:
7470     KMP_FATAL(UnknownLibraryType, arg);
7471   }
7472 
7473   __kmp_aux_set_library(arg);
7474 }
7475 
7476 void __kmp_aux_set_stacksize(size_t arg) {
7477   if (!__kmp_init_serial)
7478     __kmp_serial_initialize();
7479 
7480 #if KMP_OS_DARWIN
7481   if (arg & (0x1000 - 1)) {
7482     arg &= ~(0x1000 - 1);
7483     if (arg + 0x1000) /* check for overflow if we round up */
7484       arg += 0x1000;
7485   }
7486 #endif
7487   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7488 
7489   /* only change the default stacksize before the first parallel region */
7490   if (!TCR_4(__kmp_init_parallel)) {
7491     size_t value = arg; /* argument is in bytes */
7492 
7493     if (value < __kmp_sys_min_stksize)
7494       value = __kmp_sys_min_stksize;
7495     else if (value > KMP_MAX_STKSIZE)
7496       value = KMP_MAX_STKSIZE;
7497 
7498     __kmp_stksize = value;
7499 
7500     __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
7501   }
7502 
7503   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7504 }
7505 
7506 /* set the behaviour of the runtime library */
7507 /* TODO this can cause some odd behaviour with sibling parallelism... */
7508 void __kmp_aux_set_library(enum library_type arg) {
7509   __kmp_library = arg;
7510 
7511   switch (__kmp_library) {
7512   case library_serial: {
7513     KMP_INFORM(LibraryIsSerial);
7514     (void)__kmp_change_library(TRUE);
7515   } break;
7516   case library_turnaround:
7517     (void)__kmp_change_library(TRUE);
7518     break;
7519   case library_throughput:
7520     (void)__kmp_change_library(FALSE);
7521     break;
7522   default:
7523     KMP_FATAL(UnknownLibraryType, arg);
7524   }
7525 }
7526 
7527 /* ------------------------------------------------------------------------ */
7528 
7529 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
7530   int blocktime = arg; /* argument is in milliseconds */
7531 #if KMP_USE_MONITOR
7532   int bt_intervals;
7533 #endif
7534   int bt_set;
7535 
7536   __kmp_save_internal_controls(thread);
7537 
7538   /* Normalize and set blocktime for the teams */
7539   if (blocktime < KMP_MIN_BLOCKTIME)
7540     blocktime = KMP_MIN_BLOCKTIME;
7541   else if (blocktime > KMP_MAX_BLOCKTIME)
7542     blocktime = KMP_MAX_BLOCKTIME;
7543 
7544   set__blocktime_team(thread->th.th_team, tid, blocktime);
7545   set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
7546 
7547 #if KMP_USE_MONITOR
7548   /* Calculate and set blocktime intervals for the teams */
7549   bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
7550 
7551   set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
7552   set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
7553 #endif
7554 
  /* Record that blocktime has been explicitly set (bt_set = TRUE) */
7556   bt_set = TRUE;
7557 
7558   set__bt_set_team(thread->th.th_team, tid, bt_set);
7559   set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
7560 #if KMP_USE_MONITOR
7561   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
7562                 "bt_intervals=%d, monitor_updates=%d\n",
7563                 __kmp_gtid_from_tid(tid, thread->th.th_team),
7564                 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
7565                 __kmp_monitor_wakeups));
7566 #else
7567   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
7568                 __kmp_gtid_from_tid(tid, thread->th.th_team),
7569                 thread->th.th_team->t.t_id, tid, blocktime));
7570 #endif
7571 }
7572 
7573 void __kmp_aux_set_defaults(char const *str, int len) {
7574   if (!__kmp_init_serial) {
7575     __kmp_serial_initialize();
7576   }
7577   __kmp_env_initialize(str);
7578 
7579   if (__kmp_settings
7580 #if OMP_40_ENABLED
7581       || __kmp_display_env || __kmp_display_env_verbose
7582 #endif // OMP_40_ENABLED
7583       ) {
7584     __kmp_env_print();
7585   }
7586 } // __kmp_aux_set_defaults
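
// Illustrative usage (an assumption about the user-facing spelling; the Intel
// extension kmp_set_defaults() is expected to forward here with a settings
// string that __kmp_env_initialize() parses like the corresponding environment
// variable):
//
//   kmp_set_defaults("KMP_LIBRARY=throughput");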
7587 
7588 /* ------------------------------------------------------------------------ */
7589 /* internal fast reduction routines */
7590 
7591 PACKED_REDUCTION_METHOD_T
7592 __kmp_determine_reduction_method(
7593     ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
7594     void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
7595     kmp_critical_name *lck) {
7596 
  // Default reduction method: the critical construct ( lck != NULL, as in the
  // current PAROPT ).
  // If ( reduce_data != NULL && reduce_func != NULL ): the tree-reduction
  // method can be selected by the RTL.
  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE: the atomic reduce method
  // can be selected by the RTL.
  // Finally, it is up to the OpenMP RTL to decide which method to select among
  // those generated by PAROPT.
7605 
7606   PACKED_REDUCTION_METHOD_T retval;
7607 
7608   int team_size;
7609 
7610   KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
7611   KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
7612 
7613 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
7614   ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
7615 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
7616 
7617   retval = critical_reduce_block;
7618 
  // another way of getting the team size (with one dynamic dereference) is slower
7620   team_size = __kmp_get_team_num_threads(global_tid);
7621   if (team_size == 1) {
7622 
7623     retval = empty_reduce_block;
7624 
7625   } else {
7626 
7627     int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
7628     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
7629 
7630 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64
7631 
7632 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_WINDOWS ||       \
7633     KMP_OS_DARWIN
7634 
7635     int teamsize_cutoff = 4;
7636 
7637 #if KMP_MIC_SUPPORTED
7638     if (__kmp_mic_type != non_mic) {
7639       teamsize_cutoff = 8;
7640     }
7641 #endif
7642     if (tree_available) {
7643       if (team_size <= teamsize_cutoff) {
7644         if (atomic_available) {
7645           retval = atomic_reduce_block;
7646         }
7647       } else {
7648         retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
7649       }
7650     } else if (atomic_available) {
7651       retval = atomic_reduce_block;
7652     }
7653 #else
7654 #error "Unknown or unsupported OS"
7655 #endif // KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_WINDOWS ||
7656 // KMP_OS_DARWIN
7657 
7658 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
7659 
7660 #if KMP_OS_LINUX || KMP_OS_WINDOWS
7661 
7662     // basic tuning
7663 
7664     if (atomic_available) {
7665       if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
7666         retval = atomic_reduce_block;
7667       }
7668     } // otherwise: use critical section
7669 
7670 #elif KMP_OS_DARWIN
7671 
7672     if (atomic_available && (num_vars <= 3)) {
7673       retval = atomic_reduce_block;
7674     } else if (tree_available) {
7675       if ((reduce_size > (9 * sizeof(kmp_real64))) &&
7676           (reduce_size < (2000 * sizeof(kmp_real64)))) {
7677         retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
7678       }
7679     } // otherwise: use critical section
7680 
7681 #else
7682 #error "Unknown or unsupported OS"
7683 #endif
7684 
7685 #else
7686 #error "Unknown or unsupported architecture"
7687 #endif
7688   }
7689 
7690   // KMP_FORCE_REDUCTION
7691 
7692   // If the team is serialized (team_size == 1), ignore the forced reduction
7693   // method and stay with the unsynchronized method (empty_reduce_block)
7694   if (__kmp_force_reduction_method != reduction_method_not_defined &&
7695       team_size != 1) {
7696 
7697     PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
7698 
7699     int atomic_available, tree_available;
7700 
7701     switch ((forced_retval = __kmp_force_reduction_method)) {
7702     case critical_reduce_block:
7703       KMP_ASSERT(lck); // lck should be != 0
7704       break;
7705 
7706     case atomic_reduce_block:
7707       atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
7708       if (!atomic_available) {
7709         KMP_WARNING(RedMethodNotSupported, "atomic");
7710         forced_retval = critical_reduce_block;
7711       }
7712       break;
7713 
7714     case tree_reduce_block:
7715       tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
7716       if (!tree_available) {
7717         KMP_WARNING(RedMethodNotSupported, "tree");
7718         forced_retval = critical_reduce_block;
7719       } else {
7720 #if KMP_FAST_REDUCTION_BARRIER
7721         forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
7722 #endif
7723       }
7724       break;
7725 
7726     default:
7727       KMP_ASSERT(0); // "unsupported method specified"
7728     }
7729 
7730     retval = forced_retval;
7731   }
7732 
7733   KA_TRACE(10, ("reduction method selected=%08x\n", retval));
7734 
7735 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
7736 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
7737 
7738   return (retval);
7739 }
7740 
// This function is used for testing the set/get/determine reduction method
// machinery.
7742 kmp_int32 __kmp_get_reduce_method(void) {
7743   return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
7744 }
7745