1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 //                     The LLVM Compiler Infrastructure
8 //
9 // This file is dual licensed under the MIT and the University of Illinois Open
10 // Source Licenses. See LICENSE.txt for details.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "kmp.h"
15 #include "kmp_affinity.h"
16 #include "kmp_atomic.h"
17 #include "kmp_environment.h"
18 #include "kmp_error.h"
19 #include "kmp_i18n.h"
20 #include "kmp_io.h"
21 #include "kmp_itt.h"
22 #include "kmp_settings.h"
23 #include "kmp_stats.h"
24 #include "kmp_str.h"
25 #include "kmp_wait_release.h"
26 #include "kmp_wrapper_getpid.h"
27 
28 #if OMPT_SUPPORT
29 #include "ompt-specific.h"
30 #endif
31 
32 /* these are temporary issues to be dealt with */
33 #define KMP_USE_PRCTL 0
34 
35 #if KMP_OS_WINDOWS
36 #include <process.h>
37 #endif
38 
39 #include "tsan_annotations.h"
40 
41 #if defined(KMP_GOMP_COMPAT)
42 char const __kmp_version_alt_comp[] =
43     KMP_VERSION_PREFIX "alternative compiler support: yes";
44 #endif /* defined(KMP_GOMP_COMPAT) */
45 
46 char const __kmp_version_omp_api[] = KMP_VERSION_PREFIX "API version: "
47 #if OMP_50_ENABLED
48                                                         "5.0 (201611)";
49 #elif OMP_45_ENABLED
50                                                         "4.5 (201511)";
51 #elif OMP_40_ENABLED
52                                                         "4.0 (201307)";
53 #else
54                                                         "3.1 (201107)";
55 #endif
56 
57 #ifdef KMP_DEBUG
58 char const __kmp_version_lock[] =
59     KMP_VERSION_PREFIX "lock type: run time selectable";
60 #endif /* KMP_DEBUG */
61 
62 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
63 
64 /* ------------------------------------------------------------------------ */
65 
66 kmp_info_t __kmp_monitor;
67 
68 /* Forward declarations */
69 
70 void __kmp_cleanup(void);
71 
72 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
73                                   int gtid);
74 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
75                                   kmp_internal_control_t *new_icvs,
76                                   ident_t *loc);
77 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
78 static void __kmp_partition_places(kmp_team_t *team,
79                                    int update_master_only = 0);
80 #endif
81 static void __kmp_do_serial_initialize(void);
82 void __kmp_fork_barrier(int gtid, int tid);
83 void __kmp_join_barrier(int gtid);
84 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
85                           kmp_internal_control_t *new_icvs, ident_t *loc);
86 
87 #ifdef USE_LOAD_BALANCE
88 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
89 #endif
90 
91 static int __kmp_expand_threads(int nWish, int nNeed);
92 #if KMP_OS_WINDOWS
93 static int __kmp_unregister_root_other_thread(int gtid);
94 #endif
95 static void __kmp_unregister_library(void); // called by __kmp_internal_end()
96 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
97 static kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
98 
99 /* Calculate the identifier of the current thread */
100 /* fast (and somewhat portable) way to get unique identifier of executing
101    thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
102 int __kmp_get_global_thread_id() {
103   int i;
104   kmp_info_t **other_threads;
105   size_t stack_data;
106   char *stack_addr;
107   size_t stack_size;
108   char *stack_base;
109 
110   KA_TRACE(
111       1000,
112       ("*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
113        __kmp_nth, __kmp_all_nth));
114 
  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior
     to a parallel region, this returns KMP_GTID_DNE to force serial_initialize
     by the caller. Every call-site must either handle KMP_GTID_DNE or
     guarantee __kmp_init_gtid for this to work. */
119 
120   if (!TCR_4(__kmp_init_gtid))
121     return KMP_GTID_DNE;
122 
123 #ifdef KMP_TDATA_GTID
124   if (TCR_4(__kmp_gtid_mode) >= 3) {
125     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
126     return __kmp_gtid;
127   }
128 #endif
129   if (TCR_4(__kmp_gtid_mode) >= 2) {
130     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
131     return __kmp_gtid_get_specific();
132   }
133   KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
134 
135   stack_addr = (char *)&stack_data;
136   other_threads = __kmp_threads;
137 
138   /* ATT: The code below is a source of potential bugs due to unsynchronized
139      access to __kmp_threads array. For example:
140      1. Current thread loads other_threads[i] to thr and checks it, it is
141         non-NULL.
142      2. Current thread is suspended by OS.
143      3. Another thread unregisters and finishes (debug versions of free()
144         may fill memory with something like 0xEF).
145      4. Current thread is resumed.
146      5. Current thread reads junk from *thr.
147      TODO: Fix it.  --ln  */
148 
149   for (i = 0; i < __kmp_threads_capacity; i++) {
150 
151     kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
152     if (!thr)
153       continue;
154 
155     stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
156     stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
157 
158     /* stack grows down -- search through all of the active threads */
159 
160     if (stack_addr <= stack_base) {
161       size_t stack_diff = stack_base - stack_addr;
162 
163       if (stack_diff <= stack_size) {
164         /* The only way we can be closer than the allocated */
165         /* stack size is if we are running on this thread. */
166         KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
167         return i;
168       }
169     }
170   }
171 
  /* fall back to keyed TLS (get_specific) to try to determine our gtid */
173   KA_TRACE(1000,
174            ("*** __kmp_get_global_thread_id: internal alg. failed to find "
175             "thread, using TLS\n"));
176   i = __kmp_gtid_get_specific();
177 
178   /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */
179 
  /* if we haven't been assigned a gtid, then return the error code */
181   if (i < 0)
182     return i;
183 
184   /* dynamically updated stack window for uber threads to avoid get_specific
185      call */
186   if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
187     KMP_FATAL(StackOverflow, i);
188   }
189 
190   stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
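  // Expand the recorded stack window so it covers the current stack address:
  // either raise the recorded base (and grow the size), or grow the size
  // downward.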
191   if (stack_addr > stack_base) {
192     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
193     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
194             other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
195                 stack_base);
196   } else {
197     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
198             stack_base - stack_addr);
199   }
200 
201   /* Reprint stack bounds for ubermaster since they have been refined */
202   if (__kmp_storage_map) {
203     char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
204     char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
205     __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
206                                  other_threads[i]->th.th_info.ds.ds_stacksize,
207                                  "th_%d stack (refinement)", i);
208   }
209   return i;
210 }
211 
212 int __kmp_get_global_thread_id_reg() {
213   int gtid;
214 
215   if (!__kmp_init_serial) {
216     gtid = KMP_GTID_DNE;
217   } else
218 #ifdef KMP_TDATA_GTID
219       if (TCR_4(__kmp_gtid_mode) >= 3) {
220     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
221     gtid = __kmp_gtid;
222   } else
223 #endif
224       if (TCR_4(__kmp_gtid_mode) >= 2) {
225     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
226     gtid = __kmp_gtid_get_specific();
227   } else {
228     KA_TRACE(1000,
229              ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
230     gtid = __kmp_get_global_thread_id();
231   }
232 
233   /* we must be a new uber master sibling thread */
234   if (gtid == KMP_GTID_DNE) {
235     KA_TRACE(10,
236              ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
237               "Registering a new gtid.\n"));
238     __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
239     if (!__kmp_init_serial) {
240       __kmp_do_serial_initialize();
241       gtid = __kmp_gtid_get_specific();
242     } else {
243       gtid = __kmp_register_root(FALSE);
244     }
245     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
246     /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
247   }
248 
249   KMP_DEBUG_ASSERT(gtid >= 0);
250 
251   return gtid;
252 }
253 
254 /* caller must hold forkjoin_lock */
255 void __kmp_check_stack_overlap(kmp_info_t *th) {
256   int f;
257   char *stack_beg = NULL;
258   char *stack_end = NULL;
259   int gtid;
260 
261   KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
262   if (__kmp_storage_map) {
263     stack_end = (char *)th->th.th_info.ds.ds_stackbase;
264     stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
265 
266     gtid = __kmp_gtid_from_thread(th);
267 
268     if (gtid == KMP_GTID_MONITOR) {
269       __kmp_print_storage_map_gtid(
270           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
271           "th_%s stack (%s)", "mon",
272           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
273     } else {
274       __kmp_print_storage_map_gtid(
275           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
276           "th_%d stack (%s)", gtid,
277           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
278     }
279   }
280 
281   /* No point in checking ubermaster threads since they use refinement and
282    * cannot overlap */
283   gtid = __kmp_gtid_from_thread(th);
284   if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
285     KA_TRACE(10,
286              ("__kmp_check_stack_overlap: performing extensive checking\n"));
287     if (stack_beg == NULL) {
288       stack_end = (char *)th->th.th_info.ds.ds_stackbase;
289       stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
290     }
291 
292     for (f = 0; f < __kmp_threads_capacity; f++) {
293       kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
294 
295       if (f_th && f_th != th) {
296         char *other_stack_end =
297             (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
298         char *other_stack_beg =
299             other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
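        // Two stacks overlap if either end of this thread's stack range falls
        // strictly inside the other thread's stack range.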
300         if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
301             (stack_end > other_stack_beg && stack_end < other_stack_end)) {
302 
303           /* Print the other stack values before the abort */
304           if (__kmp_storage_map)
305             __kmp_print_storage_map_gtid(
306                 -1, other_stack_beg, other_stack_end,
307                 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
308                 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
309 
310           __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
311                       __kmp_msg_null);
312         }
313       }
314     }
315   }
316   KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
317 }
318 
319 /* ------------------------------------------------------------------------ */
320 
321 void __kmp_infinite_loop(void) {
322   static int done = FALSE;
323 
324   while (!done) {
325     KMP_YIELD(1);
326   }
327 }
328 
329 #define MAX_MESSAGE 512
330 
331 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
332                                   char const *format, ...) {
333   char buffer[MAX_MESSAGE];
334   va_list ap;
335 
336   va_start(ap, format);
337   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
338                p2, (unsigned long)size, format);
339   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
340   __kmp_vprintf(kmp_err, buffer, ap);
341 #if KMP_PRINT_DATA_PLACEMENT
342   int node;
343   if (gtid >= 0) {
344     if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
345       if (__kmp_storage_map_verbose) {
346         node = __kmp_get_host_node(p1);
347         if (node < 0) /* doesn't work, so don't try this next time */
348           __kmp_storage_map_verbose = FALSE;
349         else {
350           char *last;
351           int lastNode;
352           int localProc = __kmp_get_cpu_from_gtid(gtid);
353 
354           const int page_size = KMP_GET_PAGE_SIZE();
355 
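          // Round p1 down to its page boundary; round p2 down to the start of
          // the page containing its last byte.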
356           p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
357           p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
358           if (localProc >= 0)
359             __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
360                                  localProc >> 1);
361           else
362             __kmp_printf_no_lock("  GTID %d\n", gtid);
363 #if KMP_USE_PRCTL
364           /* The more elaborate format is disabled for now because of the prctl
365            * hanging bug. */
366           do {
367             last = p1;
368             lastNode = node;
369             /* This loop collates adjacent pages with the same host node. */
370             do {
              p1 = (char *)p1 + page_size;
372             } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
373             __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
374                                  lastNode);
375           } while (p1 <= p2);
376 #else
377           __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
378                                (char *)p1 + (page_size - 1),
379                                __kmp_get_host_node(p1));
380           if (p1 < p2) {
381             __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
382                                  (char *)p2 + (page_size - 1),
383                                  __kmp_get_host_node(p2));
384           }
385 #endif
386         }
387       }
388     } else
389       __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
390   }
391 #endif /* KMP_PRINT_DATA_PLACEMENT */
392   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
393 }
394 
395 void __kmp_warn(char const *format, ...) {
396   char buffer[MAX_MESSAGE];
397   va_list ap;
398 
399   if (__kmp_generate_warnings == kmp_warnings_off) {
400     return;
401   }
402 
403   va_start(ap, format);
404 
405   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
406   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
407   __kmp_vprintf(kmp_err, buffer, ap);
408   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
409 
410   va_end(ap);
411 }
412 
413 void __kmp_abort_process() {
414   // Later threads may stall here, but that's ok because abort() will kill them.
415   __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
416 
417   if (__kmp_debug_buf) {
418     __kmp_dump_debug_buffer();
419   }
420 
421   if (KMP_OS_WINDOWS) {
422     // Let other threads know of abnormal termination and prevent deadlock
423     // if abort happened during library initialization or shutdown
424     __kmp_global.g.g_abort = SIGABRT;
425 
    /* On Windows* OS, abort() by default raises a pop-up error box, which
       stalls nightly testing. Unfortunately, we cannot reliably suppress
       pop-up error boxes. _set_abort_behavior() works well, but this function
       is not available in VS7 (this is not a problem for the DLL, but it is a
       problem for the static OpenMP RTL). SetErrorMode (and so the timelimit
       utility) does not help, at least in some versions of the MS C RTL.

       It seems the following sequence is the only way to simulate abort() and
       avoid the pop-up error box. */
435     raise(SIGABRT);
436     _exit(3); // Just in case, if signal ignored, exit anyway.
437   } else {
438     abort();
439   }
440 
441   __kmp_infinite_loop();
442   __kmp_release_bootstrap_lock(&__kmp_exit_lock);
443 
444 } // __kmp_abort_process
445 
446 void __kmp_abort_thread(void) {
447   // TODO: Eliminate g_abort global variable and this function.
448   // In case of abort just call abort(), it will kill all the threads.
449   __kmp_infinite_loop();
450 } // __kmp_abort_thread
451 
452 /* Print out the storage map for the major kmp_info_t thread data structures
453    that are allocated together. */
454 
455 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
456   __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
457                                gtid);
458 
459   __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
460                                sizeof(kmp_desc_t), "th_%d.th_info", gtid);
461 
462   __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
463                                sizeof(kmp_local_t), "th_%d.th_local", gtid);
464 
465   __kmp_print_storage_map_gtid(
466       gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
467       sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
468 
469   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
470                                &thr->th.th_bar[bs_plain_barrier + 1],
471                                sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
472                                gtid);
473 
474   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
475                                &thr->th.th_bar[bs_forkjoin_barrier + 1],
476                                sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
477                                gtid);
478 
479 #if KMP_FAST_REDUCTION_BARRIER
480   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
481                                &thr->th.th_bar[bs_reduction_barrier + 1],
482                                sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
483                                gtid);
484 #endif // KMP_FAST_REDUCTION_BARRIER
485 }
486 
487 /* Print out the storage map for the major kmp_team_t team data structures
488    that are allocated together. */
489 
490 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
491                                          int team_id, int num_thr) {
492   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
493   __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
494                                header, team_id);
495 
496   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
497                                &team->t.t_bar[bs_last_barrier],
498                                sizeof(kmp_balign_team_t) * bs_last_barrier,
499                                "%s_%d.t_bar", header, team_id);
500 
501   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
502                                &team->t.t_bar[bs_plain_barrier + 1],
503                                sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
504                                header, team_id);
505 
506   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
507                                &team->t.t_bar[bs_forkjoin_barrier + 1],
508                                sizeof(kmp_balign_team_t),
509                                "%s_%d.t_bar[forkjoin]", header, team_id);
510 
511 #if KMP_FAST_REDUCTION_BARRIER
512   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
513                                &team->t.t_bar[bs_reduction_barrier + 1],
514                                sizeof(kmp_balign_team_t),
515                                "%s_%d.t_bar[reduction]", header, team_id);
516 #endif // KMP_FAST_REDUCTION_BARRIER
517 
518   __kmp_print_storage_map_gtid(
519       -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
520       sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
521 
522   __kmp_print_storage_map_gtid(
523       -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
524       sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
525 
526   __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
527                                &team->t.t_disp_buffer[num_disp_buff],
528                                sizeof(dispatch_shared_info_t) * num_disp_buff,
529                                "%s_%d.t_disp_buffer", header, team_id);
530 
531   __kmp_print_storage_map_gtid(-1, &team->t.t_taskq, &team->t.t_copypriv_data,
532                                sizeof(kmp_taskq_t), "%s_%d.t_taskq", header,
533                                team_id);
534 }
535 
536 static void __kmp_init_allocator() {}
537 static void __kmp_fini_allocator() {}
538 
539 /* ------------------------------------------------------------------------ */
540 
541 #ifdef KMP_DYNAMIC_LIB
542 #if KMP_OS_WINDOWS
543 
544 static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
545   // TODO: Change to __kmp_break_bootstrap_lock().
546   __kmp_init_bootstrap_lock(lck); // make the lock released
547 }
548 
549 static void __kmp_reset_locks_on_process_detach(int gtid_req) {
550   int i;
551   int thread_count;
552 
  // PROCESS_DETACH is expected to be called by a thread that executes
  // ProcessExit() or FreeLibrary(). The OS terminates the other threads
  // (except the one calling ProcessExit or FreeLibrary), so it might seem safe
  // to access __kmp_threads[] without taking the forkjoin_lock. In fact,
  // however, some threads may still be alive here, although they are about to
  // be terminated. The threads in the array with ds_thread==0 are the most
  // suspicious, so it may not be safe to access __kmp_threads[] at all.
560 
561   // TODO: does it make sense to check __kmp_roots[] ?
562 
  // Check that no other live threads are still registered with the OMP
  // library.
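  // Spin, re-scanning the thread array, until no such thread is still alive.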
565   while (1) {
566     thread_count = 0;
567     for (i = 0; i < __kmp_threads_capacity; ++i) {
568       if (!__kmp_threads)
569         continue;
570       kmp_info_t *th = __kmp_threads[i];
571       if (th == NULL)
572         continue;
573       int gtid = th->th.th_info.ds.ds_gtid;
574       if (gtid == gtid_req)
575         continue;
576       if (gtid < 0)
577         continue;
578       DWORD exit_val;
579       int alive = __kmp_is_thread_alive(th, &exit_val);
580       if (alive) {
581         ++thread_count;
582       }
583     }
584     if (thread_count == 0)
585       break; // success
586   }
587 
588   // Assume that I'm alone. Now it might be safe to check and reset locks.
589   // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
590   __kmp_reset_lock(&__kmp_forkjoin_lock);
591 #ifdef KMP_DEBUG
592   __kmp_reset_lock(&__kmp_stdio_lock);
593 #endif // KMP_DEBUG
594 }
595 
596 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
597   //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
598 
599   switch (fdwReason) {
600 
601   case DLL_PROCESS_ATTACH:
602     KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
603 
604     return TRUE;
605 
606   case DLL_PROCESS_DETACH:
607     KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
608 
609     if (lpReserved != NULL) {
610       // lpReserved is used for telling the difference:
611       //   lpReserved == NULL when FreeLibrary() was called,
612       //   lpReserved != NULL when the process terminates.
613       // When FreeLibrary() is called, worker threads remain alive. So they will
614       // release the forkjoin lock by themselves. When the process terminates,
615       // worker threads disappear triggering the problem of unreleased forkjoin
616       // lock as described below.
617 
      // A worker thread can take the forkjoin lock. The problem arises if that
      // worker thread dies before it releases the forkjoin lock. The forkjoin
      // lock then remains taken, while the thread executing
      // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below tries
      // to take the forkjoin lock and always fails, so the application never
      // finishes [normally]. This scenario is possible if __kmpc_end() has not
      // been executed. This is not a corner case; it happens in common
      // situations:
      // - the main function was compiled by an alternative compiler;
      // - the main function was compiled by icl but without /Qopenmp
      //   (application with plugins);
      // - the application terminates by calling C exit(), Fortran CALL EXIT(),
      //   or Fortran STOP;
      // - a live foreign thread prevented __kmpc_end from doing cleanup.
632       //
633       // This is a hack to work around the problem.
634       // TODO: !!! figure out something better.
635       __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
636     }
637 
638     __kmp_internal_end_library(__kmp_gtid_get_specific());
639 
640     return TRUE;
641 
642   case DLL_THREAD_ATTACH:
643     KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
644 
    /* If we want to register new sibling threads every time, call
     * __kmp_get_gtid() here. */
647     return TRUE;
648 
649   case DLL_THREAD_DETACH:
650     KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
651 
652     __kmp_internal_end_thread(__kmp_gtid_get_specific());
653     return TRUE;
654   }
655 
656   return TRUE;
657 }
658 
659 #endif /* KMP_OS_WINDOWS */
660 #endif /* KMP_DYNAMIC_LIB */
661 
662 /* Change the library type to "status" and return the old type */
663 /* called from within initialization routines where __kmp_initz_lock is held */
664 int __kmp_change_library(int status) {
665   int old_status;
666 
667   old_status = __kmp_yield_init &
668                1; // check whether KMP_LIBRARY=throughput (even init count)
669 
670   if (status) {
671     __kmp_yield_init |= 1; // throughput => turnaround (odd init count)
672   } else {
673     __kmp_yield_init &= ~1; // turnaround => throughput (even init count)
674   }
675 
676   return old_status; // return previous setting of whether
677   // KMP_LIBRARY=throughput
678 }
679 
680 /* __kmp_parallel_deo -- Wait until it's our turn. */
681 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
682   int gtid = *gtid_ref;
683 #ifdef BUILD_PARALLEL_ORDERED
684   kmp_team_t *team = __kmp_team_from_gtid(gtid);
685 #endif /* BUILD_PARALLEL_ORDERED */
686 
687   if (__kmp_env_consistency_check) {
688     if (__kmp_threads[gtid]->th.th_root->r.r_active)
689 #if KMP_USE_DYNAMIC_LOCK
690       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
691 #else
692       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
693 #endif
694   }
695 #ifdef BUILD_PARALLEL_ORDERED
696   if (!team->t.t_serialized) {
697     KMP_MB();
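    /* Spin until the ordered counter equals this thread's tid, i.e. until it
       is our turn to enter the ordered section. */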
698     KMP_WAIT_YIELD(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid),
699                    KMP_EQ, NULL);
700     KMP_MB();
701   }
702 #endif /* BUILD_PARALLEL_ORDERED */
703 }
704 
705 /* __kmp_parallel_dxo -- Signal the next task. */
706 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
707   int gtid = *gtid_ref;
708 #ifdef BUILD_PARALLEL_ORDERED
709   int tid = __kmp_tid_from_gtid(gtid);
710   kmp_team_t *team = __kmp_team_from_gtid(gtid);
711 #endif /* BUILD_PARALLEL_ORDERED */
712 
713   if (__kmp_env_consistency_check) {
714     if (__kmp_threads[gtid]->th.th_root->r.r_active)
715       __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
716   }
717 #ifdef BUILD_PARALLEL_ORDERED
718   if (!team->t.t_serialized) {
719     KMP_MB(); /* Flush all pending memory write invalidates.  */
720 
721     /* use the tid of the next thread in this team */
722     /* TODO replace with general release procedure */
723     team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
724 
725     KMP_MB(); /* Flush all pending memory write invalidates.  */
726   }
727 #endif /* BUILD_PARALLEL_ORDERED */
728 }
729 
730 /* ------------------------------------------------------------------------ */
731 /* The BARRIER for a SINGLE process section is always explicit   */
732 
733 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
734   int status;
735   kmp_info_t *th;
736   kmp_team_t *team;
737 
738   if (!TCR_4(__kmp_init_parallel))
739     __kmp_parallel_initialize();
740 
741   th = __kmp_threads[gtid];
742   team = th->th.th_team;
743   status = 0;
744 
745   th->th.th_ident = id_ref;
746 
747   if (team->t.t_serialized) {
748     status = 1;
749   } else {
750     kmp_int32 old_this = th->th.th_local.this_construct;
751 
752     ++th->th.th_local.this_construct;
753     /* try to set team count to thread count--success means thread got the
754        single block */
755     /* TODO: Should this be acquire or release? */
756     if (team->t.t_construct == old_this) {
757       status = KMP_COMPARE_AND_STORE_ACQ32(&team->t.t_construct, old_this,
758                                            th->th.th_local.this_construct);
759     }
760 #if USE_ITT_BUILD
761     if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
762         KMP_MASTER_GTID(gtid) &&
763 #if OMP_40_ENABLED
764         th->th.th_teams_microtask == NULL &&
765 #endif
766         team->t.t_active_level ==
767             1) { // Only report metadata by master of active team at level 1
768       __kmp_itt_metadata_single(id_ref);
769     }
770 #endif /* USE_ITT_BUILD */
771   }
772 
773   if (__kmp_env_consistency_check) {
774     if (status && push_ws) {
775       __kmp_push_workshare(gtid, ct_psingle, id_ref);
776     } else {
777       __kmp_check_workshare(gtid, ct_psingle, id_ref);
778     }
779   }
780 #if USE_ITT_BUILD
781   if (status) {
782     __kmp_itt_single_start(gtid);
783   }
784 #endif /* USE_ITT_BUILD */
785   return status;
786 }
787 
788 void __kmp_exit_single(int gtid) {
789 #if USE_ITT_BUILD
790   __kmp_itt_single_end(gtid);
791 #endif /* USE_ITT_BUILD */
792   if (__kmp_env_consistency_check)
793     __kmp_pop_workshare(gtid, ct_psingle, NULL);
794 }
795 
/* Determine whether we can go parallel or must use a serialized parallel
 * region, and how many threads we can use.
 * set_nthreads is the number of threads requested for the team.
 * Returns 0 if we should serialize or only use one thread,
 * otherwise the number of threads to use.
 * The forkjoin lock is held by the caller. */
802 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
803                                  int master_tid, int set_nthreads
804 #if OMP_40_ENABLED
805                                  ,
806                                  int enter_teams
807 #endif /* OMP_40_ENABLED */
808                                  ) {
809   int capacity;
810   int new_nthreads;
811   KMP_DEBUG_ASSERT(__kmp_init_serial);
812   KMP_DEBUG_ASSERT(root && parent_team);
813 
814   // If dyn-var is set, dynamically adjust the number of desired threads,
815   // according to the method specified by dynamic_mode.
816   new_nthreads = set_nthreads;
817   if (!get__dynamic_2(parent_team, master_tid)) {
818     ;
819   }
820 #ifdef USE_LOAD_BALANCE
821   else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
822     new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
823     if (new_nthreads == 1) {
824       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
825                     "reservation to 1 thread\n",
826                     master_tid));
827       return 1;
828     }
829     if (new_nthreads < set_nthreads) {
830       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
831                     "reservation to %d threads\n",
832                     master_tid, new_nthreads));
833     }
834   }
835 #endif /* USE_LOAD_BALANCE */
836   else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
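    // Threads we may still use: available procs minus threads already running,
    // plus the threads this root already accounts for (1 if the root is
    // active, otherwise its hot team's size).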
837     new_nthreads = __kmp_avail_proc - __kmp_nth +
838                    (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
839     if (new_nthreads <= 1) {
840       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
841                     "reservation to 1 thread\n",
842                     master_tid));
843       return 1;
844     }
845     if (new_nthreads < set_nthreads) {
846       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
847                     "reservation to %d threads\n",
848                     master_tid, new_nthreads));
849     } else {
850       new_nthreads = set_nthreads;
851     }
852   } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
853     if (set_nthreads > 2) {
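      // Pick a pseudo-random team size in the range [1, set_nthreads].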
854       new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
855       new_nthreads = (new_nthreads % set_nthreads) + 1;
856       if (new_nthreads == 1) {
857         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
858                       "reservation to 1 thread\n",
859                       master_tid));
860         return 1;
861       }
862       if (new_nthreads < set_nthreads) {
863         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
864                       "reservation to %d threads\n",
865                       master_tid, new_nthreads));
866       }
867     }
868   } else {
869     KMP_ASSERT(0);
870   }
871 
872   // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
873   if (__kmp_nth + new_nthreads -
874           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
875       __kmp_max_nth) {
876     int tl_nthreads = __kmp_max_nth - __kmp_nth +
877                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
878     if (tl_nthreads <= 0) {
879       tl_nthreads = 1;
880     }
881 
882     // If dyn-var is false, emit a 1-time warning.
883     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
884       __kmp_reserve_warn = 1;
885       __kmp_msg(kmp_ms_warning,
886                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
887                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
888     }
889     if (tl_nthreads == 1) {
890       KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
891                     "reduced reservation to 1 thread\n",
892                     master_tid));
893       return 1;
894     }
895     KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
896                   "reservation to %d threads\n",
897                   master_tid, tl_nthreads));
898     new_nthreads = tl_nthreads;
899   }
900 
901   // Respect OMP_THREAD_LIMIT
902   if (root->r.r_cg_nthreads + new_nthreads -
903           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
904       __kmp_cg_max_nth) {
905     int tl_nthreads = __kmp_cg_max_nth - root->r.r_cg_nthreads +
906                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
907     if (tl_nthreads <= 0) {
908       tl_nthreads = 1;
909     }
910 
911     // If dyn-var is false, emit a 1-time warning.
912     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
913       __kmp_reserve_warn = 1;
914       __kmp_msg(kmp_ms_warning,
915                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
916                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
917     }
918     if (tl_nthreads == 1) {
919       KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
920                     "reduced reservation to 1 thread\n",
921                     master_tid));
922       return 1;
923     }
924     KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
925                   "reservation to %d threads\n",
926                   master_tid, tl_nthreads));
927     new_nthreads = tl_nthreads;
928   }
929 
930   // Check if the threads array is large enough, or needs expanding.
931   // See comment in __kmp_register_root() about the adjustment if
932   // __kmp_threads[0] == NULL.
933   capacity = __kmp_threads_capacity;
934   if (TCR_PTR(__kmp_threads[0]) == NULL) {
935     --capacity;
936   }
937   if (__kmp_nth + new_nthreads -
938           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
939       capacity) {
940     // Expand the threads array.
941     int slotsRequired = __kmp_nth + new_nthreads -
942                         (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
943                         capacity;
944     int slotsAdded = __kmp_expand_threads(slotsRequired, slotsRequired);
945     if (slotsAdded < slotsRequired) {
946       // The threads array was not expanded enough.
947       new_nthreads -= (slotsRequired - slotsAdded);
948       KMP_ASSERT(new_nthreads >= 1);
949 
950       // If dyn-var is false, emit a 1-time warning.
951       if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
952         __kmp_reserve_warn = 1;
953         if (__kmp_tp_cached) {
954           __kmp_msg(kmp_ms_warning,
955                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
956                     KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
957                     KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
958         } else {
959           __kmp_msg(kmp_ms_warning,
960                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
961                     KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
962         }
963       }
964     }
965   }
966 
967 #ifdef KMP_DEBUG
968   if (new_nthreads == 1) {
969     KC_TRACE(10,
970              ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
971               "dead roots and rechecking; requested %d threads\n",
972               __kmp_get_gtid(), set_nthreads));
973   } else {
974     KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
975                   " %d threads\n",
976                   __kmp_get_gtid(), new_nthreads, set_nthreads));
977   }
978 #endif // KMP_DEBUG
979   return new_nthreads;
980 }
981 
/* Allocate threads from the thread pool and assign them to the new team. We
   are assured that there are enough threads available, because we checked
   that earlier while holding the forkjoin lock. */
985 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
986                                     kmp_info_t *master_th, int master_gtid) {
987   int i;
988   int use_hot_team;
989 
990   KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
991   KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
992   KMP_MB();
993 
994   /* first, let's setup the master thread */
995   master_th->th.th_info.ds.ds_tid = 0;
996   master_th->th.th_team = team;
997   master_th->th.th_team_nproc = team->t.t_nproc;
998   master_th->th.th_team_master = master_th;
999   master_th->th.th_team_serialized = FALSE;
1000   master_th->th.th_dispatch = &team->t.t_dispatch[0];
1001 
1002 /* make sure we are not the optimized hot team */
1003 #if KMP_NESTED_HOT_TEAMS
1004   use_hot_team = 0;
1005   kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
1006   if (hot_teams) { // hot teams array is not allocated if
1007     // KMP_HOT_TEAMS_MAX_LEVEL=0
1008     int level = team->t.t_active_level - 1; // index in array of hot teams
1009     if (master_th->th.th_teams_microtask) { // are we inside the teams?
1010       if (master_th->th.th_teams_size.nteams > 1) {
1011         ++level; // level was not increased in teams construct for
1012         // team_of_masters
1013       }
1014       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1015           master_th->th.th_teams_level == team->t.t_level) {
1016         ++level; // level was not increased in teams construct for
1017         // team_of_workers before the parallel
1018       } // team->t.t_level will be increased inside parallel
1019     }
1020     if (level < __kmp_hot_teams_max_level) {
1021       if (hot_teams[level].hot_team) {
1022         // hot team has already been allocated for given level
1023         KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
1024         use_hot_team = 1; // the team is ready to use
1025       } else {
1026         use_hot_team = 0; // AC: threads are not allocated yet
1027         hot_teams[level].hot_team = team; // remember new hot team
1028         hot_teams[level].hot_team_nth = team->t.t_nproc;
1029       }
1030     } else {
1031       use_hot_team = 0;
1032     }
1033   }
1034 #else
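  // Without nested hot teams, the only reusable team is the root's hot team.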
1035   use_hot_team = team == root->r.r_hot_team;
1036 #endif
1037   if (!use_hot_team) {
1038 
1039     /* install the master thread */
1040     team->t.t_threads[0] = master_th;
1041     __kmp_initialize_info(master_th, team, 0, master_gtid);
1042 
1043     /* now, install the worker threads */
1044     for (i = 1; i < team->t.t_nproc; i++) {
1045 
1046       /* fork or reallocate a new thread and install it in team */
1047       kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1048       team->t.t_threads[i] = thr;
1049       KMP_DEBUG_ASSERT(thr);
1050       KMP_DEBUG_ASSERT(thr->th.th_team == team);
1051       /* align team and thread arrived states */
1052       KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1053                     "T#%d(%d:%d) join =%llu, plain=%llu\n",
1054                     __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1055                     __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1056                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1057                     team->t.t_bar[bs_plain_barrier].b_arrived));
1058 #if OMP_40_ENABLED
1059       thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1060       thr->th.th_teams_level = master_th->th.th_teams_level;
1061       thr->th.th_teams_size = master_th->th.th_teams_size;
1062 #endif
1063       { // Initialize threads' barrier data.
1064         int b;
1065         kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1066         for (b = 0; b < bs_last_barrier; ++b) {
1067           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1068           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1069 #if USE_DEBUGGER
1070           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1071 #endif
1072         }
1073       }
1074     }
1075 
1076 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1077     __kmp_partition_places(team);
1078 #endif
1079   }
1080 
1081   KMP_MB();
1082 }
1083 
1084 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1085 // Propagate any changes to the floating point control registers out to the team
1086 // We try to avoid unnecessary writes to the relevant cache line in the team
1087 // structure, so we don't make changes unless they are needed.
1088 inline static void propagateFPControl(kmp_team_t *team) {
1089   if (__kmp_inherit_fp_control) {
1090     kmp_int16 x87_fpu_control_word;
1091     kmp_uint32 mxcsr;
1092 
1093     // Get master values of FPU control flags (both X87 and vector)
1094     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1095     __kmp_store_mxcsr(&mxcsr);
1096     mxcsr &= KMP_X86_MXCSR_MASK;
1097 
1098     // There is no point looking at t_fp_control_saved here.
1099     // If it is TRUE, we still have to update the values if they are different
1100     // from those we now have. If it is FALSE we didn't save anything yet, but
1101     // our objective is the same. We have to ensure that the values in the team
1102     // are the same as those we have.
1103     // So, this code achieves what we need whether or not t_fp_control_saved is
1104     // true. By checking whether the value needs updating we avoid unnecessary
1105     // writes that would put the cache-line into a written state, causing all
1106     // threads in the team to have to read it again.
1107     KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1108     KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1109     // Although we don't use this value, other code in the runtime wants to know
1110     // whether it should restore them. So we must ensure it is correct.
1111     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1112   } else {
1113     // Similarly here. Don't write to this cache-line in the team structure
1114     // unless we have to.
1115     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1116   }
1117 }
1118 
1119 // Do the opposite, setting the hardware registers to the updated values from
1120 // the team.
1121 inline static void updateHWFPControl(kmp_team_t *team) {
1122   if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
    // Only reset the fp control regs if they have been changed in the team
    // structure by the parallel region that we are exiting.
1125     kmp_int16 x87_fpu_control_word;
1126     kmp_uint32 mxcsr;
1127     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1128     __kmp_store_mxcsr(&mxcsr);
1129     mxcsr &= KMP_X86_MXCSR_MASK;
1130 
1131     if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1132       __kmp_clear_x87_fpu_status_word();
1133       __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1134     }
1135 
1136     if (team->t.t_mxcsr != mxcsr) {
1137       __kmp_load_mxcsr(&team->t.t_mxcsr);
1138     }
1139   }
1140 }
1141 #else
1142 #define propagateFPControl(x) ((void)0)
1143 #define updateHWFPControl(x) ((void)0)
1144 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1145 
1146 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1147                                      int realloc); // forward declaration
1148 
/* Run a parallel region that has been serialized, so it runs only in a team
   consisting of the single master thread. */
1151 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1152   kmp_info_t *this_thr;
1153   kmp_team_t *serial_team;
1154 
1155   KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1156 
1157   /* Skip all this code for autopar serialized loops since it results in
1158      unacceptable overhead */
1159   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1160     return;
1161 
1162   if (!TCR_4(__kmp_init_parallel))
1163     __kmp_parallel_initialize();
1164 
1165   this_thr = __kmp_threads[global_tid];
1166   serial_team = this_thr->th.th_serial_team;
1167 
1168   /* utilize the serialized team held by this thread */
1169   KMP_DEBUG_ASSERT(serial_team);
1170   KMP_MB();
1171 
1172   if (__kmp_tasking_mode != tskm_immediate_exec) {
1173     KMP_DEBUG_ASSERT(
1174         this_thr->th.th_task_team ==
1175         this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1176     KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1177                      NULL);
1178     KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1179                   "team %p, new task_team = NULL\n",
1180                   global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1181     this_thr->th.th_task_team = NULL;
1182   }
1183 
1184 #if OMP_40_ENABLED
1185   kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1186   if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1187     proc_bind = proc_bind_false;
1188   } else if (proc_bind == proc_bind_default) {
1189     // No proc_bind clause was specified, so use the current value
1190     // of proc-bind-var for this parallel region.
1191     proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1192   }
1193   // Reset for next parallel region
1194   this_thr->th.th_set_proc_bind = proc_bind_default;
1195 #endif /* OMP_40_ENABLED */
1196 
1197 #if OMPT_SUPPORT
1198   ompt_data_t ompt_parallel_data;
1199   ompt_parallel_data.ptr = NULL;
1200   ompt_data_t *implicit_task_data;
1201   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1202   if (ompt_enabled.enabled &&
1203       this_thr->th.ompt_thread_info.state != omp_state_overhead) {
1204 
1205     ompt_task_info_t *parent_task_info;
1206     parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1207 
1208     parent_task_info->frame.enter_frame = OMPT_GET_FRAME_ADDRESS(1);
1209     if (ompt_enabled.ompt_callback_parallel_begin) {
1210       int team_size = 1;
1211 
1212       ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1213           &(parent_task_info->task_data), &(parent_task_info->frame),
1214           &ompt_parallel_data, team_size, ompt_invoker_program, codeptr);
1215     }
1216   }
1217 #endif // OMPT_SUPPORT
1218 
1219   if (this_thr->th.th_team != serial_team) {
1220     // Nested level will be an index in the nested nthreads array
1221     int level = this_thr->th.th_team->t.t_level;
1222 
1223     if (serial_team->t.t_serialized) {
1224       /* this serial team was already used
1225          TODO increase performance by making this locks more specific */
1226       kmp_team_t *new_team;
1227 
1228       __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1229 
1230       new_team = __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1231 #if OMPT_SUPPORT
1232                                      ompt_parallel_data,
1233 #endif
1234 #if OMP_40_ENABLED
1235                                      proc_bind,
1236 #endif
1237                                      &this_thr->th.th_current_task->td_icvs,
1238                                      0 USE_NESTED_HOT_ARG(NULL));
1239       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1240       KMP_ASSERT(new_team);
1241 
1242       /* setup new serialized team and install it */
1243       new_team->t.t_threads[0] = this_thr;
1244       new_team->t.t_parent = this_thr->th.th_team;
1245       serial_team = new_team;
1246       this_thr->th.th_serial_team = serial_team;
1247 
1248       KF_TRACE(
1249           10,
1250           ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1251            global_tid, serial_team));
1252 
1253       /* TODO the above breaks the requirement that if we run out of resources,
1254          then we can still guarantee that serialized teams are ok, since we may
1255          need to allocate a new one */
1256     } else {
1257       KF_TRACE(
1258           10,
1259           ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1260            global_tid, serial_team));
1261     }
1262 
1263     /* we have to initialize this serial team */
1264     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1265     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1266     KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1267     serial_team->t.t_ident = loc;
1268     serial_team->t.t_serialized = 1;
1269     serial_team->t.t_nproc = 1;
1270     serial_team->t.t_parent = this_thr->th.th_team;
1271     serial_team->t.t_sched = this_thr->th.th_team->t.t_sched;
1272     this_thr->th.th_team = serial_team;
1273     serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1274 
    KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1276                   this_thr->th.th_current_task));
1277     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1278     this_thr->th.th_current_task->td_flags.executing = 0;
1279 
1280     __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1281 
1282     /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1283        implicit task for each serialized task represented by
1284        team->t.t_serialized? */
1285     copy_icvs(&this_thr->th.th_current_task->td_icvs,
1286               &this_thr->th.th_current_task->td_parent->td_icvs);
1287 
1288     // Thread value exists in the nested nthreads array for the next nested
1289     // level
1290     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1291       this_thr->th.th_current_task->td_icvs.nproc =
1292           __kmp_nested_nth.nth[level + 1];
1293     }
1294 
1295 #if OMP_40_ENABLED
1296     if (__kmp_nested_proc_bind.used &&
1297         (level + 1 < __kmp_nested_proc_bind.used)) {
1298       this_thr->th.th_current_task->td_icvs.proc_bind =
1299           __kmp_nested_proc_bind.bind_types[level + 1];
1300     }
1301 #endif /* OMP_40_ENABLED */
1302 
1303 #if USE_DEBUGGER
1304     serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1305 #endif
1306     this_thr->th.th_info.ds.ds_tid = 0;
1307 
1308     /* set thread cache values */
1309     this_thr->th.th_team_nproc = 1;
1310     this_thr->th.th_team_master = this_thr;
1311     this_thr->th.th_team_serialized = 1;
1312 
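    // A serialized parallel region increases the nesting level but not the
    // active level.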
1313     serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1314     serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1315 
1316     propagateFPControl(serial_team);
1317 
1318     /* check if we need to allocate dispatch buffers stack */
1319     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1320     if (!serial_team->t.t_dispatch->th_disp_buffer) {
1321       serial_team->t.t_dispatch->th_disp_buffer =
1322           (dispatch_private_info_t *)__kmp_allocate(
1323               sizeof(dispatch_private_info_t));
1324     }
1325     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1326 
1327     KMP_MB();
1328 
1329   } else {
1330     /* this serialized team is already being used,
1331      * that's fine, just add another nested level */
1332     KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1333     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1334     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1335     ++serial_team->t.t_serialized;
1336     this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1337 
1338     // Nested level will be an index in the nested nthreads array
1339     int level = this_thr->th.th_team->t.t_level;
1340     // Thread value exists in the nested nthreads array for the next nested
1341     // level
1342     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1343       this_thr->th.th_current_task->td_icvs.nproc =
1344           __kmp_nested_nth.nth[level + 1];
1345     }
1346     serial_team->t.t_level++;
1347     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1348                   "of serial team %p to %d\n",
1349                   global_tid, serial_team, serial_team->t.t_level));
1350 
1351     /* allocate/push dispatch buffers stack */
1352     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1353     {
1354       dispatch_private_info_t *disp_buffer =
1355           (dispatch_private_info_t *)__kmp_allocate(
1356               sizeof(dispatch_private_info_t));
1357       disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1358       serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1359     }
1360     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1361 
1362     KMP_MB();
1363   }
1364 #if OMP_40_ENABLED
1365   KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1366 #endif
1367 
1368   if (__kmp_env_consistency_check)
1369     __kmp_push_parallel(global_tid, NULL);
1370 #if OMPT_SUPPORT
1371   serial_team->t.ompt_team_info.master_return_address = codeptr;
1372   if (ompt_enabled.enabled &&
1373       this_thr->th.ompt_thread_info.state != omp_state_overhead) {
1374     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = OMPT_GET_FRAME_ADDRESS(1);
1375 
1376     ompt_lw_taskteam_t lw_taskteam;
1377     __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1378                             &ompt_parallel_data, codeptr);
1379 
1380     __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
    // Don't use lw_taskteam after linking; its content was swapped.
1382 
1383     /* OMPT implicit task begin */
1384     implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
1385     if (ompt_enabled.ompt_callback_implicit_task) {
1386       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1387           ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1388           OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid));
1389     }
1390 
1391     /* OMPT state */
1392     this_thr->th.ompt_thread_info.state = omp_state_work_parallel;
1393     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = OMPT_GET_FRAME_ADDRESS(1);
1394   }
1395 #endif
1396 }
1397 
1398 /* Most of the work for a fork.
1399    Returns TRUE if we really went parallel, FALSE if serialized. */
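/* Roughly how this entry point is reached (illustrative sketch, assuming the
   usual compiler-generated path through __kmpc_fork_call in kmp_csupport.cpp):

     // user code
     #pragma omp parallel num_threads(4)
     { work(); }

   The body is outlined into a microtask and handed to the runtime, which ends
   up here to reserve threads, allocate or reuse a team, copy ICVs and
   arguments, and release the workers at the fork barrier. */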
1400 int __kmp_fork_call(ident_t *loc, int gtid,
1401                     enum fork_context_e call_context, // Intel, GNU, ...
1402                     kmp_int32 argc, microtask_t microtask, launch_t invoker,
1403 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1404 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1405                     va_list *ap
1406 #else
1407                     va_list ap
1408 #endif
1409                     ) {
1410   void **argv;
1411   int i;
1412   int master_tid;
1413   int master_this_cons;
1414   kmp_team_t *team;
1415   kmp_team_t *parent_team;
1416   kmp_info_t *master_th;
1417   kmp_root_t *root;
1418   int nthreads;
1419   int master_active;
1420   int master_set_numthreads;
1421   int level;
1422 #if OMP_40_ENABLED
1423   int active_level;
1424   int teams_level;
1425 #endif
1426 #if KMP_NESTED_HOT_TEAMS
1427   kmp_hot_team_ptr_t **p_hot_teams;
1428 #endif
1429   { // KMP_TIME_BLOCK
1430     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1431     KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1432 
1433     KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1434     if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1435       /* Some systems prefer the stack for the root thread(s) to start with
1436          some gap from the parent stack to prevent false sharing. */
1437       void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1438       /* These 2 lines below are so this does not get optimized out */
1439       if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1440         __kmp_stkpadding += (short)((kmp_int64)dummy);
1441     }
1442 
1443     /* initialize if needed */
1444     KMP_DEBUG_ASSERT(
1445         __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1446     if (!TCR_4(__kmp_init_parallel))
1447       __kmp_parallel_initialize();
1448 
1449     /* setup current data */
1450     master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1451     // shutdown
1452     parent_team = master_th->th.th_team;
1453     master_tid = master_th->th.th_info.ds.ds_tid;
1454     master_this_cons = master_th->th.th_local.this_construct;
1455     root = master_th->th.th_root;
1456     master_active = root->r.r_active;
1457     master_set_numthreads = master_th->th.th_set_nproc;
1458 
1459 #if OMPT_SUPPORT
1460     ompt_data_t ompt_parallel_data;
1461     ompt_parallel_data.ptr = NULL;
1462     ompt_data_t *parent_task_data;
1463     ompt_frame_t *ompt_frame;
1464     ompt_data_t *implicit_task_data;
1465     void *return_address = NULL;
1466 
1467     if (ompt_enabled.enabled) {
1468       __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1469                                     NULL, NULL);
1470       return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1471     }
1472 #endif
1473 
1474     // Nested level will be an index in the nested nthreads array
1475     level = parent_team->t.t_level;
1476     // used to launch non-serial teams even if nested is not allowed
1477     active_level = parent_team->t.t_active_level;
1478 #if OMP_40_ENABLED
1479     // needed to check nesting inside the teams
1480     teams_level = master_th->th.th_teams_level;
1481 #endif
1482 #if KMP_NESTED_HOT_TEAMS
1483     p_hot_teams = &master_th->th.th_hot_teams;
1484     if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1485       *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1486           sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1487       (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1488       // it is either the actual hot team or not needed (when active_level > 0)
1489       (*p_hot_teams)[0].hot_team_nth = 1;
1490     }
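    // Note (assumption about the settings layer, not taken from this file):
    // the hot-teams array above is only maintained when nested hot teams are
    // enabled, normally via the KMP_HOT_TEAMS_MAX_LEVEL and KMP_HOT_TEAMS_MODE
    // settings that feed __kmp_hot_teams_max_level and __kmp_hot_teams_mode.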
1491 #endif
1492 
1493 #if OMPT_SUPPORT
1494     if (ompt_enabled.enabled) {
1495       if (ompt_enabled.ompt_callback_parallel_begin) {
1496         int team_size = master_set_numthreads
1497                             ? master_set_numthreads
1498                             : get__nproc_2(parent_team, master_tid);
1499         ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1500             parent_task_data, ompt_frame, &ompt_parallel_data, team_size,
1501             OMPT_INVOKER(call_context), return_address);
1502       }
1503       master_th->th.ompt_thread_info.state = omp_state_overhead;
1504     }
1505 #endif
1506 
1507     master_th->th.th_ident = loc;
1508 
1509 #if OMP_40_ENABLED
1510     if (master_th->th.th_teams_microtask && ap &&
1511         microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1512       // AC: This is the start of a parallel nested inside a teams construct.
1513       // The team is actual (hot); all workers are ready at the fork barrier.
1514       // No lock needed to initialize the team a bit, then release the workers.
1515       parent_team->t.t_ident = loc;
1516       __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1517       parent_team->t.t_argc = argc;
1518       argv = (void **)parent_team->t.t_argv;
1519       for (i = argc - 1; i >= 0; --i)
1520 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1521 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1522         *argv++ = va_arg(*ap, void *);
1523 #else
1524         *argv++ = va_arg(ap, void *);
1525 #endif
1526       // Increment our nesting depth, but do not increase the serialization
1527       if (parent_team == master_th->th.th_serial_team) {
1528         // AC: we are in serialized parallel
1529         __kmpc_serialized_parallel(loc, gtid);
1530         KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1531         // AC: need this so that inquiry functions work
1532         // correctly; will restore at join time
1533         parent_team->t.t_serialized--;
1534 #if OMPT_SUPPORT
1535         void *dummy;
1536         void **exit_runtime_p;
1537 
1538         ompt_lw_taskteam_t lw_taskteam;
1539 
1540         if (ompt_enabled.enabled) {
1541           __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1542                                   &ompt_parallel_data, return_address);
1543           exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_frame);
1544 
1545           __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1546           // don't use lw_taskteam after linking. content was swapped
1547 
1548           /* OMPT implicit task begin */
1549           implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1550           if (ompt_enabled.ompt_callback_implicit_task) {
1551             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1552                 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1553                 implicit_task_data, 1, __kmp_tid_from_gtid(gtid));
1554           }
1555 
1556           /* OMPT state */
1557           master_th->th.ompt_thread_info.state = omp_state_work_parallel;
1558         } else {
1559           exit_runtime_p = &dummy;
1560         }
1561 #endif
1562 
1563         {
1564           KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1565           KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1566           __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1567 #if OMPT_SUPPORT
1568                                  ,
1569                                  exit_runtime_p
1570 #endif
1571                                  );
1572         }
1573 
1574 #if OMPT_SUPPORT
1575         *exit_runtime_p = NULL;
1576         if (ompt_enabled.enabled) {
1577           OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = NULL;
1578           if (ompt_enabled.ompt_callback_implicit_task) {
1579             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1580                 ompt_scope_end, NULL, implicit_task_data, 1,
1581                 __kmp_tid_from_gtid(gtid));
1582           }
1583           __ompt_lw_taskteam_unlink(master_th);
1584 
1585           if (ompt_enabled.ompt_callback_parallel_end) {
1586             ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1587                 OMPT_CUR_TEAM_DATA(master_th), OMPT_CUR_TASK_DATA(master_th),
1588                 OMPT_INVOKER(call_context), return_address);
1589           }
1590           master_th->th.ompt_thread_info.state = omp_state_overhead;
1591         }
1592 #endif
1593         return TRUE;
1594       }
1595 
1596       parent_team->t.t_pkfn = microtask;
1597       parent_team->t.t_invoke = invoker;
1598       KMP_TEST_THEN_INC32((kmp_int32 *)&root->r.r_in_parallel);
1599       parent_team->t.t_active_level++;
1600       parent_team->t.t_level++;
1601 
1602       /* Change number of threads in the team if requested */
1603       if (master_set_numthreads) { // The parallel has num_threads clause
1604         if (master_set_numthreads < master_th->th.th_teams_size.nth) {
1605           // AC: can only reduce the thread count dynamically, can't increase
1606           kmp_info_t **other_threads = parent_team->t.t_threads;
1607           parent_team->t.t_nproc = master_set_numthreads;
1608           for (i = 0; i < master_set_numthreads; ++i) {
1609             other_threads[i]->th.th_team_nproc = master_set_numthreads;
1610           }
1611           // Keep extra threads hot in the team for possible next parallels
1612         }
1613         master_th->th.th_set_nproc = 0;
1614       }
1615 
1616 #if USE_DEBUGGER
1617       if (__kmp_debugging) { // Let debugger override number of threads.
1618         int nth = __kmp_omp_num_threads(loc);
1619         if (nth > 0) { // 0 means debugger doesn't want to change num threads
1620           master_set_numthreads = nth;
1621         }
1622       }
1623 #endif
1624 
1625       KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1626                     "master_th=%p, gtid=%d\n",
1627                     root, parent_team, master_th, gtid));
1628       __kmp_internal_fork(loc, gtid, parent_team);
1629       KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1630                     "master_th=%p, gtid=%d\n",
1631                     root, parent_team, master_th, gtid));
1632 
1633       /* Invoke microtask for MASTER thread */
1634       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1635                     parent_team->t.t_id, parent_team->t.t_pkfn));
1636 
1637       {
1638         KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1639         KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1640         if (!parent_team->t.t_invoke(gtid)) {
1641           KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
1642         }
1643       }
1644       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1645                     parent_team->t.t_id, parent_team->t.t_pkfn));
1646       KMP_MB(); /* Flush all pending memory write invalidates.  */
1647 
1648       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1649 
1650       return TRUE;
1651     } // Parallel closely nested in teams construct
1652 #endif /* OMP_40_ENABLED */
1653 
1654 #if KMP_DEBUG
1655     if (__kmp_tasking_mode != tskm_immediate_exec) {
1656       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1657                        parent_team->t.t_task_team[master_th->th.th_task_state]);
1658     }
1659 #endif
1660 
1661     if (parent_team->t.t_active_level >=
1662         master_th->th.th_current_task->td_icvs.max_active_levels) {
1663       nthreads = 1;
1664     } else {
1665 #if OMP_40_ENABLED
1666       int enter_teams = ((ap == NULL && active_level == 0) ||
1667                          (ap && teams_level > 0 && teams_level == level));
1668 #endif
1669       nthreads =
1670           master_set_numthreads
1671               ? master_set_numthreads
1672               : get__nproc_2(
1673                     parent_team,
1674                     master_tid); // TODO: get nproc directly from current task
1675 
1676       // Check if we need to take the forkjoin lock (no need for a serialized
1677       // parallel outside of a teams construct). This code was moved here from
1678       // __kmp_reserve_threads() to speed up nested serialized parallels.
1679       if (nthreads > 1) {
1680         if ((!get__nested(master_th) && (root->r.r_in_parallel
1681 #if OMP_40_ENABLED
1682                                          && !enter_teams
1683 #endif /* OMP_40_ENABLED */
1684                                          )) ||
1685             (__kmp_library == library_serial)) {
1686           KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1687                         " threads\n",
1688                         gtid, nthreads));
1689           nthreads = 1;
1690         }
1691       }
1692       if (nthreads > 1) {
1693         /* determine how many new threads we can use */
1694         __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1695         nthreads = __kmp_reserve_threads(
1696             root, parent_team, master_tid, nthreads
1697 #if OMP_40_ENABLED
1698             /* AC: If we execute teams from parallel region (on host), then
1699                teams should be created but each can only have 1 thread if
1700                nesting is disabled. If teams called from serial region, then
1701                teams and their threads should be created regardless of the
1702                nesting setting. */
1703             ,
1704             enter_teams
1705 #endif /* OMP_40_ENABLED */
1706             );
1707         if (nthreads == 1) {
1708           // Free the lock for single-thread execution here; for multi-thread
1709           // execution it will be freed later, after the team of threads has
1710           // been created and initialized
1711           __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1712         }
1713       }
1714     }
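    // Illustrative consequence of the checks above (added comment): with
    // nesting disabled, an inner
    //   #pragma omp parallel
    // encountered while the root is already in a parallel region takes the
    // nthreads = 1 path and is executed as a serialized parallel region below.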
1715     KMP_DEBUG_ASSERT(nthreads > 0);
1716 
1717     // If we temporarily changed the set number of threads then restore it now
1718     master_th->th.th_set_nproc = 0;
1719 
1720     /* create a serialized parallel region? */
1721     if (nthreads == 1) {
1722 /* josh todo: hypothetical question: what do we do for OS X*? */
1723 #if KMP_OS_LINUX &&                                                            \
1724     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1725       void *args[argc];
1726 #else
1727       void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1728 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1729           KMP_ARCH_AARCH64) */
1730 
1731       KA_TRACE(20,
1732                ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1733 
1734       __kmpc_serialized_parallel(loc, gtid);
1735 
1736       if (call_context == fork_context_intel) {
1737         /* TODO this sucks, use the compiler itself to pass args! :) */
1738         master_th->th.th_serial_team->t.t_ident = loc;
1739 #if OMP_40_ENABLED
1740         if (!ap) {
1741           // revert change made in __kmpc_serialized_parallel()
1742           master_th->th.th_serial_team->t.t_level--;
1743 // Get args from parent team for teams construct
1744 
1745 #if OMPT_SUPPORT
1746           void *dummy;
1747           void **exit_runtime_p;
1748           ompt_task_info_t *task_info;
1749 
1750           ompt_lw_taskteam_t lw_taskteam;
1751 
1752           if (ompt_enabled.enabled) {
1753             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1754                                     &ompt_parallel_data, return_address);
1755 
1756             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1757             // don't use lw_taskteam after linking. content was swapped
1758 
1759             task_info = OMPT_CUR_TASK_INFO(master_th);
1760             exit_runtime_p = &(task_info->frame.exit_frame);
1761             if (ompt_enabled.ompt_callback_implicit_task) {
1762               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1763                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1764                   &(task_info->task_data), 1, __kmp_tid_from_gtid(gtid));
1765             }
1766 
1767             /* OMPT state */
1768             master_th->th.ompt_thread_info.state = omp_state_work_parallel;
1769           } else {
1770             exit_runtime_p = &dummy;
1771           }
1772 #endif
1773 
1774           {
1775             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1776             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1777             __kmp_invoke_microtask(microtask, gtid, 0, argc,
1778                                    parent_team->t.t_argv
1779 #if OMPT_SUPPORT
1780                                    ,
1781                                    exit_runtime_p
1782 #endif
1783                                    );
1784           }
1785 
1786 #if OMPT_SUPPORT
1787           if (ompt_enabled.enabled) {
1788             exit_runtime_p = NULL;
1789             if (ompt_enabled.ompt_callback_implicit_task) {
1790               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1791                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1792                   __kmp_tid_from_gtid(gtid));
1793             }
1794 
1795             __ompt_lw_taskteam_unlink(master_th);
1796             if (ompt_enabled.ompt_callback_parallel_end) {
1797               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1798                   OMPT_CUR_TEAM_DATA(master_th), parent_task_data,
1799                   OMPT_INVOKER(call_context), return_address);
1800             }
1801             master_th->th.ompt_thread_info.state = omp_state_overhead;
1802           }
1803 #endif
1804         } else if (microtask == (microtask_t)__kmp_teams_master) {
1805           KMP_DEBUG_ASSERT(master_th->th.th_team ==
1806                            master_th->th.th_serial_team);
1807           team = master_th->th.th_team;
1808           // team->t.t_pkfn = microtask;
1809           team->t.t_invoke = invoker;
1810           __kmp_alloc_argv_entries(argc, team, TRUE);
1811           team->t.t_argc = argc;
1812           argv = (void **)team->t.t_argv;
1813           if (ap) {
1814             for (i = argc - 1; i >= 0; --i)
1815 // TODO: revert workaround for Intel(R) 64 tracker #96
1816 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1817               *argv++ = va_arg(*ap, void *);
1818 #else
1819               *argv++ = va_arg(ap, void *);
1820 #endif
1821           } else {
1822             for (i = 0; i < argc; ++i)
1823               // Get args from parent team for teams construct
1824               argv[i] = parent_team->t.t_argv[i];
1825           }
1826           // AC: revert change made in __kmpc_serialized_parallel()
1827           //     because initial code in teams should have level=0
1828           team->t.t_level--;
1829           // AC: call special invoker for outer "parallel" of teams construct
1830           {
1831             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1832             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1833             invoker(gtid);
1834           }
1835         } else {
1836 #endif /* OMP_40_ENABLED */
1837           argv = args;
1838           for (i = argc - 1; i >= 0; --i)
1839 // TODO: revert workaround for Intel(R) 64 tracker #96
1840 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1841             *argv++ = va_arg(*ap, void *);
1842 #else
1843           *argv++ = va_arg(ap, void *);
1844 #endif
1845           KMP_MB();
1846 
1847 #if OMPT_SUPPORT
1848           void *dummy;
1849           void **exit_runtime_p;
1850           ompt_task_info_t *task_info;
1851 
1852           ompt_lw_taskteam_t lw_taskteam;
1853 
1854           if (ompt_enabled.enabled) {
1855             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1856                                     &ompt_parallel_data, return_address);
1857             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1858             // don't use lw_taskteam after linking. content was swapped
1859             task_info = OMPT_CUR_TASK_INFO(master_th);
1860             exit_runtime_p = &(task_info->frame.exit_frame);
1861 
1862             /* OMPT implicit task begin */
1863             implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1864             if (ompt_enabled.ompt_callback_implicit_task) {
1865               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1866                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1867                   implicit_task_data, 1, __kmp_tid_from_gtid(gtid));
1868             }
1869 
1870             /* OMPT state */
1871             master_th->th.ompt_thread_info.state = omp_state_work_parallel;
1872           } else {
1873             exit_runtime_p = &dummy;
1874           }
1875 #endif
1876 
1877           {
1878             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1879             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1880             __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1881 #if OMPT_SUPPORT
1882                                    ,
1883                                    exit_runtime_p
1884 #endif
1885                                    );
1886           }
1887 
1888 #if OMPT_SUPPORT
1889           if (ompt_enabled.enabled) {
1890             *exit_runtime_p = NULL;
1891             if (ompt_enabled.ompt_callback_implicit_task) {
1892               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1893                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1894                   __kmp_tid_from_gtid(gtid));
1895             }
1896 
1897             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1898             __ompt_lw_taskteam_unlink(master_th);
1899             if (ompt_enabled.ompt_callback_parallel_end) {
1900               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1901                   &ompt_parallel_data, parent_task_data,
1902                   OMPT_INVOKER(call_context), return_address);
1903             }
1904             master_th->th.ompt_thread_info.state = omp_state_overhead;
1905           }
1906 #endif
1907 #if OMP_40_ENABLED
1908         }
1909 #endif /* OMP_40_ENABLED */
1910       } else if (call_context == fork_context_gnu) {
1911 #if OMPT_SUPPORT
1912         ompt_lw_taskteam_t lwt;
1913         __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1914                                 return_address);
1915 
1916         lwt.ompt_task_info.frame.exit_frame = NULL;
1917         __ompt_lw_taskteam_link(&lwt, master_th, 1);
1918 // don't use lw_taskteam after linking. content was swapped
1919 #endif
1920 
1921         // we were called from GNU native code
1922         KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1923         return FALSE;
1924       } else {
1925         KMP_ASSERT2(call_context < fork_context_last,
1926                     "__kmp_fork_call: unknown fork_context parameter");
1927       }
1928 
1929       KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1930       KMP_MB();
1931       return FALSE;
1932     }
1933 
1934     // GEH: only modify the executing flag in the case when not serialized;
1935     //      the serialized case is handled in __kmpc_serialized_parallel
1936     KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1937                   "curtask=%p, curtask_max_aclevel=%d\n",
1938                   parent_team->t.t_active_level, master_th,
1939                   master_th->th.th_current_task,
1940                   master_th->th.th_current_task->td_icvs.max_active_levels));
1941     // TODO: GEH - cannot do this assertion because root thread not set up as
1942     // executing
1943     // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1944     master_th->th.th_current_task->td_flags.executing = 0;
1945 
1946 #if OMP_40_ENABLED
1947     if (!master_th->th.th_teams_microtask || level > teams_level)
1948 #endif /* OMP_40_ENABLED */
1949     {
1950       /* Increment our nested depth level */
1951       KMP_TEST_THEN_INC32((kmp_int32 *)&root->r.r_in_parallel);
1952     }
1953 
1954     // See if we need to make a copy of the ICVs.
1955     int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1956     if ((level + 1 < __kmp_nested_nth.used) &&
1957         (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1958       nthreads_icv = __kmp_nested_nth.nth[level + 1];
1959     } else {
1960       nthreads_icv = 0; // don't update
1961     }
1962 
1963 #if OMP_40_ENABLED
1964     // Figure out the proc_bind_policy for the new team.
1965     kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1966     kmp_proc_bind_t proc_bind_icv =
1967         proc_bind_default; // proc_bind_default means don't update
1968     if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1969       proc_bind = proc_bind_false;
1970     } else {
1971       if (proc_bind == proc_bind_default) {
1972         // No proc_bind clause specified; use current proc-bind-var for this
1973         // parallel region
1974         proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1975       }
1976       /* else: The proc_bind policy was specified explicitly on parallel clause.
1977          This overrides proc-bind-var for this parallel region, but does not
1978          change proc-bind-var. */
1979       // Figure the value of proc-bind-var for the child threads.
1980       if ((level + 1 < __kmp_nested_proc_bind.used) &&
1981           (__kmp_nested_proc_bind.bind_types[level + 1] !=
1982            master_th->th.th_current_task->td_icvs.proc_bind)) {
1983         proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1984       }
1985     }
1986 
1987     // Reset for next parallel region
1988     master_th->th.th_set_proc_bind = proc_bind_default;
1989 #endif /* OMP_40_ENABLED */
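    // Illustrative example of the proc_bind resolution above (assumed typical
    // usage, added for clarity):
    //   OMP_PROC_BIND=spread,close    // fills __kmp_nested_proc_bind per level
    //   #pragma omp parallel proc_bind(master)
    // A proc_bind clause sets th_set_proc_bind and overrides proc-bind-var for
    // this region only; otherwise the per-level value from the list is used.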
1990 
1991     if ((nthreads_icv > 0)
1992 #if OMP_40_ENABLED
1993         || (proc_bind_icv != proc_bind_default)
1994 #endif /* OMP_40_ENABLED */
1995             ) {
1996       kmp_internal_control_t new_icvs;
1997       copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
1998       new_icvs.next = NULL;
1999       if (nthreads_icv > 0) {
2000         new_icvs.nproc = nthreads_icv;
2001       }
2002 
2003 #if OMP_40_ENABLED
2004       if (proc_bind_icv != proc_bind_default) {
2005         new_icvs.proc_bind = proc_bind_icv;
2006       }
2007 #endif /* OMP_40_ENABLED */
2008 
2009       /* allocate a new parallel team */
2010       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2011       team = __kmp_allocate_team(root, nthreads, nthreads,
2012 #if OMPT_SUPPORT
2013                                  ompt_parallel_data,
2014 #endif
2015 #if OMP_40_ENABLED
2016                                  proc_bind,
2017 #endif
2018                                  &new_icvs, argc USE_NESTED_HOT_ARG(master_th));
2019     } else {
2020       /* allocate a new parallel team */
2021       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2022       team = __kmp_allocate_team(root, nthreads, nthreads,
2023 #if OMPT_SUPPORT
2024                                  ompt_parallel_data,
2025 #endif
2026 #if OMP_40_ENABLED
2027                                  proc_bind,
2028 #endif
2029                                  &master_th->th.th_current_task->td_icvs,
2030                                  argc USE_NESTED_HOT_ARG(master_th));
2031     }
2032     KF_TRACE(
2033         10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2034 
2035     /* setup the new team */
2036     KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2037     KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2038     KMP_CHECK_UPDATE(team->t.t_ident, loc);
2039     KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2040     KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2041 #if OMPT_SUPPORT
2042     KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2043                           return_address);
2044 #endif
2045     KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2046 // TODO: parent_team->t.t_level == INT_MAX ???
2047 #if OMP_40_ENABLED
2048     if (!master_th->th.th_teams_microtask || level > teams_level) {
2049 #endif /* OMP_40_ENABLED */
2050       int new_level = parent_team->t.t_level + 1;
2051       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2052       new_level = parent_team->t.t_active_level + 1;
2053       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2054 #if OMP_40_ENABLED
2055     } else {
2056       // AC: Do not increase parallel level at start of the teams construct
2057       int new_level = parent_team->t.t_level;
2058       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2059       new_level = parent_team->t.t_active_level;
2060       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2061     }
2062 #endif /* OMP_40_ENABLED */
2063     kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2064     if (team->t.t_sched.r_sched_type != new_sched.r_sched_type ||
2065         team->t.t_sched.chunk != new_sched.chunk)
2066       team->t.t_sched =
2067           new_sched; // set master's schedule as new run-time schedule
2068 
2069 #if OMP_40_ENABLED
2070     KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2071 #endif
2072 
2073     // Update the floating point rounding in the team if required.
2074     propagateFPControl(team);
2075 
2076     if (__kmp_tasking_mode != tskm_immediate_exec) {
2077       // Set master's task team to the team's task team. Unless this is a hot
2078       // team, it should be NULL.
2079       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2080                        parent_team->t.t_task_team[master_th->th.th_task_state]);
2081       KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
2082                     "%p, new task_team %p / team %p\n",
2083                     __kmp_gtid_from_thread(master_th),
2084                     master_th->th.th_task_team, parent_team,
2085                     team->t.t_task_team[master_th->th.th_task_state], team));
2086 
2087       if (active_level || master_th->th.th_task_team) {
2088         // Take a memo of master's task_state
2089         KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2090         if (master_th->th.th_task_state_top >=
2091             master_th->th.th_task_state_stack_sz) { // increase size
2092           kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2093           kmp_uint8 *old_stack, *new_stack;
2094           kmp_uint32 i;
2095           new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2096           for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2097             new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2098           }
2099           for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2100                ++i) { // zero-init rest of stack
2101             new_stack[i] = 0;
2102           }
2103           old_stack = master_th->th.th_task_state_memo_stack;
2104           master_th->th.th_task_state_memo_stack = new_stack;
2105           master_th->th.th_task_state_stack_sz = new_size;
2106           __kmp_free(old_stack);
2107         }
2108         // Store master's task_state on stack
2109         master_th->th
2110             .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2111             master_th->th.th_task_state;
2112         master_th->th.th_task_state_top++;
2113 #if KMP_NESTED_HOT_TEAMS
2114         if (team == master_th->th.th_hot_teams[active_level].hot_team) {
2115           // Restore master's nested state if nested hot team
2116           master_th->th.th_task_state =
2117               master_th->th
2118                   .th_task_state_memo_stack[master_th->th.th_task_state_top];
2119         } else {
2120 #endif
2121           master_th->th.th_task_state = 0;
2122 #if KMP_NESTED_HOT_TEAMS
2123         }
2124 #endif
2125       }
2126 #if !KMP_NESTED_HOT_TEAMS
2127       KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2128                        (team == root->r.r_hot_team));
2129 #endif
2130     }
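    // Summary of the bookkeeping above (descriptive comment): the master's
    // th_task_state is pushed onto th_task_state_memo_stack before entering
    // the new region (the stack doubles in size when full) and is popped again
    // in __kmp_join_call, which restores the value for the enclosing team.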
2131 
2132     KA_TRACE(
2133         20,
2134         ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2135          gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2136          team->t.t_nproc));
2137     KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2138                      (team->t.t_master_tid == 0 &&
2139                       (team->t.t_parent == root->r.r_root_team ||
2140                        team->t.t_parent->t.t_serialized)));
2141     KMP_MB();
2142 
2143     /* now, setup the arguments */
2144     argv = (void **)team->t.t_argv;
2145 #if OMP_40_ENABLED
2146     if (ap) {
2147 #endif /* OMP_40_ENABLED */
2148       for (i = argc - 1; i >= 0; --i) {
2149 // TODO: revert workaround for Intel(R) 64 tracker #96
2150 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
2151         void *new_argv = va_arg(*ap, void *);
2152 #else
2153       void *new_argv = va_arg(ap, void *);
2154 #endif
2155         KMP_CHECK_UPDATE(*argv, new_argv);
2156         argv++;
2157       }
2158 #if OMP_40_ENABLED
2159     } else {
2160       for (i = 0; i < argc; ++i) {
2161         // Get args from parent team for teams construct
2162         KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2163       }
2164     }
2165 #endif /* OMP_40_ENABLED */
2166 
2167     /* now actually fork the threads */
2168     KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2169     if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2170       root->r.r_active = TRUE;
2171 
2172     __kmp_fork_team_threads(root, team, master_th, gtid);
2173     __kmp_setup_icv_copy(team, nthreads,
2174                          &master_th->th.th_current_task->td_icvs, loc);
2175 
2176 #if OMPT_SUPPORT
2177     master_th->th.ompt_thread_info.state = omp_state_work_parallel;
2178 #endif
2179 
2180     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2181 
2182 #if USE_ITT_BUILD
2183     if (team->t.t_active_level == 1 // only report frames at level 1
2184 #if OMP_40_ENABLED
2185         && !master_th->th.th_teams_microtask // not in teams construct
2186 #endif /* OMP_40_ENABLED */
2187         ) {
2188 #if USE_ITT_NOTIFY
2189       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2190           (__kmp_forkjoin_frames_mode == 3 ||
2191            __kmp_forkjoin_frames_mode == 1)) {
2192         kmp_uint64 tmp_time = 0;
2193         if (__itt_get_timestamp_ptr)
2194           tmp_time = __itt_get_timestamp();
2195         // Internal fork - report frame begin
2196         master_th->th.th_frame_time = tmp_time;
2197         if (__kmp_forkjoin_frames_mode == 3)
2198           team->t.t_region_time = tmp_time;
2199       } else
2200 // only one notification scheme (either "submit" or "forking/joined", not both)
2201 #endif /* USE_ITT_NOTIFY */
2202           if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2203               __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2204         // Mark start of "parallel" region for VTune.
2205         __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2206       }
2207     }
2208 #endif /* USE_ITT_BUILD */
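    // Note on the frame reporting above (derived from the checks in this block
    // and in __kmp_join_call): mode 3 records both the frame and region
    // timestamps for a later __kmp_itt_frame_submit, mode 1 records only the
    // frame begin time, and mode 0 with __kmp_forkjoin_frames set falls back
    // to the region forking/joined notifications.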
2209 
2210     /* now go on and do the work */
2211     KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2212     KMP_MB();
2213     KF_TRACE(10,
2214              ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2215               root, team, master_th, gtid));
2216 
2217 #if USE_ITT_BUILD
2218     if (__itt_stack_caller_create_ptr) {
2219       team->t.t_stack_id =
2220           __kmp_itt_stack_caller_create(); // create new stack stitching id
2221       // before entering fork barrier
2222     }
2223 #endif /* USE_ITT_BUILD */
2224 
2225 #if OMP_40_ENABLED
2226     // AC: skip __kmp_internal_fork at the teams construct; let only the
2227     // master threads execute
2228     if (ap)
2229 #endif /* OMP_40_ENABLED */
2230     {
2231       __kmp_internal_fork(loc, gtid, team);
2232       KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2233                     "master_th=%p, gtid=%d\n",
2234                     root, team, master_th, gtid));
2235     }
2236 
2237     if (call_context == fork_context_gnu) {
2238       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2239       return TRUE;
2240     }
2241 
2242     /* Invoke microtask for MASTER thread */
2243     KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2244                   team->t.t_id, team->t.t_pkfn));
2245   } // END of timer KMP_fork_call block
2246 
2247   {
2248     KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
2249     KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
2250     if (!team->t.t_invoke(gtid)) {
2251       KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
2252     }
2253   }
2254   KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2255                 team->t.t_id, team->t.t_pkfn));
2256   KMP_MB(); /* Flush all pending memory write invalidates.  */
2257 
2258   KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2259 
2260 #if OMPT_SUPPORT
2261   if (ompt_enabled.enabled) {
2262     master_th->th.ompt_thread_info.state = omp_state_overhead;
2263   }
2264 #endif
2265 
2266   return TRUE;
2267 }
2268 
2269 #if OMPT_SUPPORT
2270 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2271                                             kmp_team_t *team) {
2272   // restore state outside the region
2273   thread->th.ompt_thread_info.state =
2274       ((team->t.t_serialized) ? omp_state_work_serial
2275                               : omp_state_work_parallel);
2276 }
2277 
2278 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2279                                    kmp_team_t *team, ompt_data_t *parallel_data,
2280                                    fork_context_e fork_context, void *codeptr) {
2281   ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2282   if (ompt_enabled.ompt_callback_parallel_end) {
2283     ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2284         parallel_data, &(task_info->task_data), OMPT_INVOKER(fork_context),
2285         codeptr);
2286   }
2287 
2288   task_info->frame.enter_frame = NULL;
2289   __kmp_join_restore_state(thread, team);
2290 }
2291 #endif
2292 
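/* Counterpart of __kmp_fork_call: tears down the parallel region the master is
   leaving. Roughly (illustrative, assuming the usual compiler codegen), for

     #pragma omp parallel
     { work(); }

   the runtime forks via __kmp_fork_call, the master runs the microtask, and
   this routine is then called to run the join barrier, report OMPT/ITT events,
   free or park the team, and restore the parent team's state and ICVs. */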
2293 void __kmp_join_call(ident_t *loc, int gtid
2294 #if OMPT_SUPPORT
2295                      ,
2296                      enum fork_context_e fork_context
2297 #endif
2298 #if OMP_40_ENABLED
2299                      ,
2300                      int exit_teams
2301 #endif /* OMP_40_ENABLED */
2302                      ) {
2303   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2304   kmp_team_t *team;
2305   kmp_team_t *parent_team;
2306   kmp_info_t *master_th;
2307   kmp_root_t *root;
2308   int master_active;
2309   int i;
2310 
2311   KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2312 
2313   /* setup current data */
2314   master_th = __kmp_threads[gtid];
2315   root = master_th->th.th_root;
2316   team = master_th->th.th_team;
2317   parent_team = team->t.t_parent;
2318 
2319   master_th->th.th_ident = loc;
2320 
2321 #if OMPT_SUPPORT
2322   if (ompt_enabled.enabled) {
2323     master_th->th.ompt_thread_info.state = omp_state_overhead;
2324   }
2325 #endif
2326 
2327 #if KMP_DEBUG
2328   if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2329     KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2330                   "th_task_team = %p\n",
2331                   __kmp_gtid_from_thread(master_th), team,
2332                   team->t.t_task_team[master_th->th.th_task_state],
2333                   master_th->th.th_task_team));
2334     KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2335                      team->t.t_task_team[master_th->th.th_task_state]);
2336   }
2337 #endif
2338 
2339   if (team->t.t_serialized) {
2340 #if OMP_40_ENABLED
2341     if (master_th->th.th_teams_microtask) {
2342       // We are in teams construct
2343       int level = team->t.t_level;
2344       int tlevel = master_th->th.th_teams_level;
2345       if (level == tlevel) {
2346         // AC: we haven't incremented it earlier at the start of the teams
2347         //     construct, so do it here, at the end of the teams construct
2348         team->t.t_level++;
2349       } else if (level == tlevel + 1) {
2350         // AC: we are exiting parallel inside teams, need to increment
2351         // serialization in order to restore it in the next call to
2352         // __kmpc_end_serialized_parallel
2353         team->t.t_serialized++;
2354       }
2355     }
2356 #endif /* OMP_40_ENABLED */
2357     __kmpc_end_serialized_parallel(loc, gtid);
2358 
2359 #if OMPT_SUPPORT
2360     if (ompt_enabled.enabled) {
2361       __kmp_join_restore_state(master_th, parent_team);
2362     }
2363 #endif
2364 
2365     return;
2366   }
2367 
2368   master_active = team->t.t_master_active;
2369 
2370 #if OMP_40_ENABLED
2371   if (!exit_teams)
2372 #endif /* OMP_40_ENABLED */
2373   {
2374     // AC: No barrier for internal teams at exit from the teams construct.
2375     //     But there is a barrier for the external team (the league).
2376     __kmp_internal_join(loc, gtid, team);
2377   }
2378 #if OMP_40_ENABLED
2379   else {
2380     master_th->th.th_task_state =
2381         0; // AC: no tasking in teams (outside of any parallel region)
2382   }
2383 #endif /* OMP_40_ENABLED */
2384 
2385   KMP_MB();
2386 
2387 #if OMPT_SUPPORT
2388   ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2389   void *codeptr = team->t.ompt_team_info.master_return_address;
2390 #endif
2391 
2392 #if USE_ITT_BUILD
2393   if (__itt_stack_caller_create_ptr) {
2394     __kmp_itt_stack_caller_destroy(
2395         (__itt_caller)team->t
2396             .t_stack_id); // destroy the stack stitching id after join barrier
2397   }
2398 
2399   // Mark end of "parallel" region for VTune.
2400   if (team->t.t_active_level == 1
2401 #if OMP_40_ENABLED
2402       && !master_th->th.th_teams_microtask /* not in teams construct */
2403 #endif /* OMP_40_ENABLED */
2404       ) {
2405     master_th->th.th_ident = loc;
2406     // only one notification scheme (either "submit" or "forking/joined", not
2407     // both)
2408     if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2409         __kmp_forkjoin_frames_mode == 3)
2410       __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2411                              master_th->th.th_frame_time, 0, loc,
2412                              master_th->th.th_team_nproc, 1);
2413     else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2414              !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2415       __kmp_itt_region_joined(gtid);
2416   } // active_level == 1
2417 #endif /* USE_ITT_BUILD */
2418 
2419 #if OMP_40_ENABLED
2420   if (master_th->th.th_teams_microtask && !exit_teams &&
2421       team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2422       team->t.t_level == master_th->th.th_teams_level + 1) {
2423     // AC: We need to leave the team structure intact at the end of a parallel
2424     // inside the teams construct, so that the same (hot) team works at the
2425     // next parallel; only adjust the nesting levels
2426 
2427     /* Decrement our nested depth level */
2428     team->t.t_level--;
2429     team->t.t_active_level--;
2430     KMP_TEST_THEN_DEC32((kmp_int32 *)&root->r.r_in_parallel);
2431 
2432     /* Restore number of threads in the team if needed */
2433     if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2434       int old_num = master_th->th.th_team_nproc;
2435       int new_num = master_th->th.th_teams_size.nth;
2436       kmp_info_t **other_threads = team->t.t_threads;
2437       team->t.t_nproc = new_num;
2438       for (i = 0; i < old_num; ++i) {
2439         other_threads[i]->th.th_team_nproc = new_num;
2440       }
2441       // Adjust the states of the unused threads of the team
2442       for (i = old_num; i < new_num; ++i) {
2443         // Re-initialize thread's barrier data.
2444         int b;
2445         kmp_balign_t *balign = other_threads[i]->th.th_bar;
2446         for (b = 0; b < bs_last_barrier; ++b) {
2447           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2448           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2449 #if USE_DEBUGGER
2450           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2451 #endif
2452         }
2453         if (__kmp_tasking_mode != tskm_immediate_exec) {
2454           // Synchronize thread's task state
2455           other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2456         }
2457       }
2458     }
2459 
2460 #if OMPT_SUPPORT
2461     if (ompt_enabled.enabled) {
2462       __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
2463                       codeptr);
2464     }
2465 #endif
2466 
2467     return;
2468   }
2469 #endif /* OMP_40_ENABLED */
2470 
2471   /* do cleanup and restore the parent team */
2472   master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2473   master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2474 
2475   master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2476 
2477   /* jc: The following lock has instructions with REL and ACQ semantics,
2478      separating the parallel user code called in this parallel region
2479      from the serial user code called after this function returns. */
2480   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2481 
2482 #if OMP_40_ENABLED
2483   if (!master_th->th.th_teams_microtask ||
2484       team->t.t_level > master_th->th.th_teams_level)
2485 #endif /* OMP_40_ENABLED */
2486   {
2487     /* Decrement our nested depth level */
2488     KMP_TEST_THEN_DEC32((kmp_int32 *)&root->r.r_in_parallel);
2489   }
2490   KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2491 
2492 #if OMPT_SUPPORT
2493   if (ompt_enabled.enabled) {
2494     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2495     if (ompt_enabled.ompt_callback_implicit_task) {
2496       int ompt_team_size = team->t.t_nproc;
2497       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2498           ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2499           __kmp_tid_from_gtid(gtid));
2500     }
2501 
2502     task_info->frame.exit_frame = NULL;
2503     task_info->task_data = ompt_data_none;
2504   }
2505 #endif
2506 
2507   KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2508                 master_th, team));
2509   __kmp_pop_current_task_from_thread(master_th);
2510 
2511 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
2512   // Restore master thread's partition.
2513   master_th->th.th_first_place = team->t.t_first_place;
2514   master_th->th.th_last_place = team->t.t_last_place;
2515 #endif /* OMP_40_ENABLED */
2516 
2517   updateHWFPControl(team);
2518 
2519   if (root->r.r_active != master_active)
2520     root->r.r_active = master_active;
2521 
2522   __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2523                             master_th)); // this will free worker threads
2524 
2525   /* This race was fun to find. Make sure the following stays in the critical
2526      region; otherwise assertions may fail occasionally, since the old team may
2527      be reallocated and the hierarchy would appear inconsistent. It is actually
2528      safe to run and won't cause any bugs, but it will cause those assertion
2529      failures. It's only one deref&assign, so keep it in the critical region. */
2530   master_th->th.th_team = parent_team;
2531   master_th->th.th_team_nproc = parent_team->t.t_nproc;
2532   master_th->th.th_team_master = parent_team->t.t_threads[0];
2533   master_th->th.th_team_serialized = parent_team->t.t_serialized;
2534 
2535   /* restore serialized team, if need be */
2536   if (parent_team->t.t_serialized &&
2537       parent_team != master_th->th.th_serial_team &&
2538       parent_team != root->r.r_root_team) {
2539     __kmp_free_team(root,
2540                     master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2541     master_th->th.th_serial_team = parent_team;
2542   }
2543 
2544   if (__kmp_tasking_mode != tskm_immediate_exec) {
2545     if (master_th->th.th_task_state_top >
2546         0) { // Restore task state from memo stack
2547       KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2548       // Remember master's state if we re-use this nested hot team
2549       master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2550           master_th->th.th_task_state;
2551       --master_th->th.th_task_state_top; // pop
2552       // Now restore state at this level
2553       master_th->th.th_task_state =
2554           master_th->th
2555               .th_task_state_memo_stack[master_th->th.th_task_state_top];
2556     }
2557     // Copy the task team from the parent team to the master thread
2558     master_th->th.th_task_team =
2559         parent_team->t.t_task_team[master_th->th.th_task_state];
2560     KA_TRACE(20,
2561              ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2562               __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2563               parent_team));
2564   }
2565 
2566   // TODO: GEH - cannot do this assertion because root thread not set up as
2567   // executing
2568   // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2569   master_th->th.th_current_task->td_flags.executing = 1;
2570 
2571   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2572 
2573 #if OMPT_SUPPORT
2574   if (ompt_enabled.enabled) {
2575     __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
2576                     codeptr);
2577   }
2578 #endif
2579 
2580   KMP_MB();
2581   KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2582 }
2583 
2584 /* Check whether we should push an internal control record onto the
2585    serial team stack.  If so, do it.  */
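/* Illustrative motivation (added, not part of the original comment): inside
   nested, serialized parallel regions the master keeps using its serial team,
   so e.g.

     #pragma omp parallel if(0)
     {
       #pragma omp parallel if(0)    // nested, serialized again
       { omp_set_num_threads(2); }   // must not leak into the outer levels
     }

   needs the outer ICVs saved first. The record is keyed by the current
   t_serialized depth and restored when the serialized region ends. */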
2586 void __kmp_save_internal_controls(kmp_info_t *thread) {
2587 
2588   if (thread->th.th_team != thread->th.th_serial_team) {
2589     return;
2590   }
2591   if (thread->th.th_team->t.t_serialized > 1) {
2592     int push = 0;
2593 
2594     if (thread->th.th_team->t.t_control_stack_top == NULL) {
2595       push = 1;
2596     } else {
2597       if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2598           thread->th.th_team->t.t_serialized) {
2599         push = 1;
2600       }
2601     }
2602     if (push) { /* push a record on the serial team's stack */
2603       kmp_internal_control_t *control =
2604           (kmp_internal_control_t *)__kmp_allocate(
2605               sizeof(kmp_internal_control_t));
2606 
2607       copy_icvs(control, &thread->th.th_current_task->td_icvs);
2608 
2609       control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2610 
2611       control->next = thread->th.th_team->t.t_control_stack_top;
2612       thread->th.th_team->t.t_control_stack_top = control;
2613     }
2614   }
2615 }
2616 
2617 /* Changes set_nproc */
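/* Typically reached from the user-level omp_set_num_threads() entry point (an
   assumption about the wrapper layer; see kmp_ftn_entry.h). Illustrative
   effect, given the hot-team trimming below:

     omp_set_num_threads(2);  // may shrink the idle hot team immediately
     #pragma omp parallel     // the next region then starts with 2 threads
*/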
2618 void __kmp_set_num_threads(int new_nth, int gtid) {
2619   kmp_info_t *thread;
2620   kmp_root_t *root;
2621 
2622   KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2623   KMP_DEBUG_ASSERT(__kmp_init_serial);
2624 
2625   if (new_nth < 1)
2626     new_nth = 1;
2627   else if (new_nth > __kmp_max_nth)
2628     new_nth = __kmp_max_nth;
2629 
2630   KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2631   thread = __kmp_threads[gtid];
2632 
2633   __kmp_save_internal_controls(thread);
2634 
2635   set__nproc(thread, new_nth);
2636 
2637   // If this omp_set_num_threads() call will cause the hot team size to be
2638   // reduced (in the absence of a num_threads clause), then reduce it now,
2639   // rather than waiting for the next parallel region.
2640   root = thread->th.th_root;
2641   if (__kmp_init_parallel && (!root->r.r_active) &&
2642       (root->r.r_hot_team->t.t_nproc > new_nth)
2643 #if KMP_NESTED_HOT_TEAMS
2644       && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2645 #endif
2646       ) {
2647     kmp_team_t *hot_team = root->r.r_hot_team;
2648     int f;
2649 
2650     __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2651 
2652     // Release the extra threads we don't need any more.
2653     for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2654       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2655       if (__kmp_tasking_mode != tskm_immediate_exec) {
2656         // When decreasing the team size, threads no longer in the team should
2657         // unreference the task team.
2658         hot_team->t.t_threads[f]->th.th_task_team = NULL;
2659       }
2660       __kmp_free_thread(hot_team->t.t_threads[f]);
2661       hot_team->t.t_threads[f] = NULL;
2662     }
2663     hot_team->t.t_nproc = new_nth;
2664 #if KMP_NESTED_HOT_TEAMS
2665     if (thread->th.th_hot_teams) {
2666       KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2667       thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2668     }
2669 #endif
2670 
2671     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2672 
2673     // Update the t_nproc field in the threads that are still active.
2674     for (f = 0; f < new_nth; f++) {
2675       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2676       hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2677     }
2678     // Special flag to mark that the size was changed by omp_set_num_threads()
2679     hot_team->t.t_size_changed = -1;
2680   }
2681 }
2682 
2683 /* Changes max_active_levels */
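/* Illustrative use (assumed typical mapping from the user API):
   omp_set_max_active_levels(2) lands here and updates the max-active-levels
   ICV; __kmp_fork_call then serializes any region whose parent team already
   has t_active_level >= 2 (the nthreads = 1 path above). */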
2684 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2685   kmp_info_t *thread;
2686 
2687   KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2688                 "%d = (%d)\n",
2689                 gtid, max_active_levels));
2690   KMP_DEBUG_ASSERT(__kmp_init_serial);
2691 
2692   // validate max_active_levels
2693   if (max_active_levels < 0) {
2694     KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2695     // We ignore this call if the user has specified a negative value.
2696     // The current setting won't be changed. The last valid setting will be
2697     // used. A warning will be issued (if warnings are allowed as controlled by
2698     // the KMP_WARNINGS env var).
2699     KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2700                   "max_active_levels for thread %d = (%d)\n",
2701                   gtid, max_active_levels));
2702     return;
2703   }
2704   if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2705     // it's OK: the max_active_levels value is within the valid range
2706     // [0; KMP_MAX_ACTIVE_LEVELS_LIMIT]
2707     // We allow a zero value. (implementation defined behavior)
2708   } else {
2709     KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2710                 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2711     max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2712     // Current upper limit is MAX_INT. (implementation defined behavior)
2713     // If the input exceeds the upper limit, we correct the input to be the
2714     // upper limit. (implementation defined behavior)
2715     // Actually, the flow should never get here while we use the MAX_INT limit.
2716   }
2717   KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2718                 "max_active_levels for thread %d = (%d)\n",
2719                 gtid, max_active_levels));
2720 
2721   thread = __kmp_threads[gtid];
2722 
2723   __kmp_save_internal_controls(thread);
2724 
2725   set__max_active_levels(thread, max_active_levels);
2726 }
2727 
2728 /* Gets max_active_levels */
2729 int __kmp_get_max_active_levels(int gtid) {
2730   kmp_info_t *thread;
2731 
2732   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2733   KMP_DEBUG_ASSERT(__kmp_init_serial);
2734 
2735   thread = __kmp_threads[gtid];
2736   KMP_DEBUG_ASSERT(thread->th.th_current_task);
2737   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2738                 "curtask_maxaclevel=%d\n",
2739                 gtid, thread->th.th_current_task,
2740                 thread->th.th_current_task->td_icvs.max_active_levels));
2741   return thread->th.th_current_task->td_icvs.max_active_levels;
2742 }
2743 
2744 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
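/* Illustrative use (assumed typical mapping from the user API):

     omp_set_schedule(omp_sched_dynamic, 4);  // kind 2 in the table below
     #pragma omp for schedule(runtime)        // now runs dynamic with chunk 4

   The kind is translated through __kmp_sch_map and stored in td_icvs.sched,
   which schedule(runtime) loops then consult. */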
2745 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2746   kmp_info_t *thread;
2747   //    kmp_team_t *team;
2748 
2749   KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2750                 gtid, (int)kind, chunk));
2751   KMP_DEBUG_ASSERT(__kmp_init_serial);
2752 
2753   // Check if the kind parameter is valid, correct if needed.
2754   // Valid parameters should fit in one of two intervals - standard or extended:
2755   //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2756   // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
2757   if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2758       (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2759     // TODO: Hint needs attention in case we change the default schedule.
2760     __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2761               KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2762               __kmp_msg_null);
2763     kind = kmp_sched_default;
2764     chunk = 0; // ignore chunk value in case of bad kind
2765   }
2766 
2767   thread = __kmp_threads[gtid];
2768 
2769   __kmp_save_internal_controls(thread);
2770 
2771   if (kind < kmp_sched_upper_std) {
2772     if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
      // differentiate static chunked vs. unchunked: an invalid chunk value
      // indicates the unchunked schedule (which is the default)
2775       thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2776     } else {
2777       thread->th.th_current_task->td_icvs.sched.r_sched_type =
2778           __kmp_sch_map[kind - kmp_sched_lower - 1];
2779     }
2780   } else {
2781     //    __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2782     //    kmp_sched_lower - 2 ];
2783     thread->th.th_current_task->td_icvs.sched.r_sched_type =
2784         __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2785                       kmp_sched_lower - 2];
2786   }
2787   if (kind == kmp_sched_auto || chunk < 1) {
2788     // ignore parameter chunk for schedule auto
2789     thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2790   } else {
2791     thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2792   }
2793 }
2794 
2795 /* Gets def_sched_var ICV values */
2796 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2797   kmp_info_t *thread;
2798   enum sched_type th_type;
2799 
2800   KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2801   KMP_DEBUG_ASSERT(__kmp_init_serial);
2802 
2803   thread = __kmp_threads[gtid];
2804 
2805   th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2806 
2807   switch (th_type) {
2808   case kmp_sch_static:
2809   case kmp_sch_static_greedy:
2810   case kmp_sch_static_balanced:
2811     *kind = kmp_sched_static;
    *chunk = 0; // chunk was not set; indicate this with a zero value
2813     return;
2814   case kmp_sch_static_chunked:
2815     *kind = kmp_sched_static;
2816     break;
2817   case kmp_sch_dynamic_chunked:
2818     *kind = kmp_sched_dynamic;
2819     break;
2820   case kmp_sch_guided_chunked:
2821   case kmp_sch_guided_iterative_chunked:
2822   case kmp_sch_guided_analytical_chunked:
2823     *kind = kmp_sched_guided;
2824     break;
2825   case kmp_sch_auto:
2826     *kind = kmp_sched_auto;
2827     break;
2828   case kmp_sch_trapezoidal:
2829     *kind = kmp_sched_trapezoidal;
2830     break;
2831 #if KMP_STATIC_STEAL_ENABLED
2832   case kmp_sch_static_steal:
2833     *kind = kmp_sched_static_steal;
2834     break;
2835 #endif
2836   default:
2837     KMP_FATAL(UnknownSchedulingType, th_type);
2838   }
2839 
2840   *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2841 }
2842 
2843 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2844 
2845   int ii, dd;
2846   kmp_team_t *team;
2847   kmp_info_t *thr;
2848 
2849   KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2850   KMP_DEBUG_ASSERT(__kmp_init_serial);
2851 
2852   // validate level
2853   if (level == 0)
2854     return 0;
2855   if (level < 0)
2856     return -1;
2857   thr = __kmp_threads[gtid];
2858   team = thr->th.th_team;
2859   ii = team->t.t_level;
2860   if (level > ii)
2861     return -1;
2862 
2863 #if OMP_40_ENABLED
2864   if (thr->th.th_teams_microtask) {
2865     // AC: we are in teams region where multiple nested teams have same level
2866     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2867     if (level <=
2868         tlevel) { // otherwise usual algorithm works (will not touch the teams)
2869       KMP_DEBUG_ASSERT(ii >= tlevel);
2870       // AC: As we need to pass by the teams league, we need to artificially
2871       // increase ii
2872       if (ii == tlevel) {
2873         ii += 2; // three teams have same level
2874       } else {
2875         ii++; // two teams have same level
2876       }
2877     }
2878   }
2879 #endif
2880 
2881   if (ii == level)
2882     return __kmp_tid_from_gtid(gtid);
2883 
2884   dd = team->t.t_serialized;
2885   level++;
2886   while (ii > level) {
2887     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2888     }
2889     if ((team->t.t_serialized) && (!dd)) {
2890       team = team->t.t_parent;
2891       continue;
2892     }
2893     if (ii > level) {
2894       team = team->t.t_parent;
2895       dd = team->t.t_serialized;
2896       ii--;
2897     }
2898   }
2899 
2900   return (dd > 1) ? (0) : (team->t.t_master_tid);
2901 }
2902 
2903 int __kmp_get_team_size(int gtid, int level) {
2904 
2905   int ii, dd;
2906   kmp_team_t *team;
2907   kmp_info_t *thr;
2908 
2909   KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2910   KMP_DEBUG_ASSERT(__kmp_init_serial);
2911 
2912   // validate level
2913   if (level == 0)
2914     return 1;
2915   if (level < 0)
2916     return -1;
2917   thr = __kmp_threads[gtid];
2918   team = thr->th.th_team;
2919   ii = team->t.t_level;
2920   if (level > ii)
2921     return -1;
2922 
2923 #if OMP_40_ENABLED
2924   if (thr->th.th_teams_microtask) {
2925     // AC: we are in teams region where multiple nested teams have same level
2926     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2927     if (level <=
2928         tlevel) { // otherwise usual algorithm works (will not touch the teams)
2929       KMP_DEBUG_ASSERT(ii >= tlevel);
2930       // AC: As we need to pass by the teams league, we need to artificially
2931       // increase ii
2932       if (ii == tlevel) {
2933         ii += 2; // three teams have same level
2934       } else {
2935         ii++; // two teams have same level
2936       }
2937     }
2938   }
2939 #endif
2940 
2941   while (ii > level) {
2942     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2943     }
2944     if (team->t.t_serialized && (!dd)) {
2945       team = team->t.t_parent;
2946       continue;
2947     }
2948     if (ii > level) {
2949       team = team->t.t_parent;
2950       ii--;
2951     }
2952   }
2953 
2954   return team->t.t_nproc;
2955 }
2956 
2957 kmp_r_sched_t __kmp_get_schedule_global() {
  // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
  // independently, so the updated schedule can be obtained here.
2961 
2962   kmp_r_sched_t r_sched;
2963 
2964   // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
2965   // __kmp_guided. __kmp_sched should keep original value, so that user can set
2966   // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
2967   // different roots (even in OMP 2.5)
2968   if (__kmp_sched == kmp_sch_static) {
2969     r_sched.r_sched_type = __kmp_static; // replace STATIC with more detailed
2970     // schedule (balanced or greedy)
2971   } else if (__kmp_sched == kmp_sch_guided_chunked) {
2972     r_sched.r_sched_type = __kmp_guided; // replace GUIDED with more detailed
2973     // schedule (iterative or analytical)
2974   } else {
2975     r_sched.r_sched_type =
2976         __kmp_sched; // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2977   }
2978 
2979   if (__kmp_chunk < KMP_DEFAULT_CHUNK) { // __kmp_chunk may be wrong here (if it
2980     // was not ever set)
2981     r_sched.chunk = KMP_DEFAULT_CHUNK;
2982   } else {
2983     r_sched.chunk = __kmp_chunk;
2984   }
2985 
2986   return r_sched;
2987 }
2988 
/* Allocate (realloc == FALSE) or reallocate (realloc == TRUE) at least argc
   *t_argv entries for the requested team. */
2991 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
2992 
2993   KMP_DEBUG_ASSERT(team);
2994   if (!realloc || argc > team->t.t_max_argc) {
2995 
2996     KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
2997                    "current entries=%d\n",
2998                    team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
2999     /* if previously allocated heap space for args, free them */
3000     if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3001       __kmp_free((void *)team->t.t_argv);
3002 
3003     if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3004       /* use unused space in the cache line for arguments */
3005       team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3006       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3007                      "argv entries\n",
3008                      team->t.t_id, team->t.t_max_argc));
3009       team->t.t_argv = &team->t.t_inline_argv[0];
3010       if (__kmp_storage_map) {
3011         __kmp_print_storage_map_gtid(
3012             -1, &team->t.t_inline_argv[0],
3013             &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3014             (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3015             team->t.t_id);
3016       }
3017     } else {
3018       /* allocate space for arguments in the heap */
3019       team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3020                                ? KMP_MIN_MALLOC_ARGV_ENTRIES
3021                                : 2 * argc;
3022       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3023                      "argv entries\n",
3024                      team->t.t_id, team->t.t_max_argc));
3025       team->t.t_argv =
3026           (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3027       if (__kmp_storage_map) {
3028         __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3029                                      &team->t.t_argv[team->t.t_max_argc],
3030                                      sizeof(void *) * team->t.t_max_argc,
3031                                      "team_%d.t_argv", team->t.t_id);
3032       }
3033     }
3034   }
3035 }
3036 
3037 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3038   int i;
3039   int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3040   team->t.t_threads =
3041       (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3042   team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3043       sizeof(dispatch_shared_info_t) * num_disp_buff);
3044   team->t.t_dispatch =
3045       (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3046   team->t.t_implicit_task_taskdata =
3047       (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3048   team->t.t_max_nproc = max_nth;
3049 
3050   /* setup dispatch buffers */
3051   for (i = 0; i < num_disp_buff; ++i) {
3052     team->t.t_disp_buffer[i].buffer_index = i;
3053 #if OMP_45_ENABLED
3054     team->t.t_disp_buffer[i].doacross_buf_idx = i;
3055 #endif
3056   }
3057 }
3058 
3059 static void __kmp_free_team_arrays(kmp_team_t *team) {
3060   /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3061   int i;
3062   for (i = 0; i < team->t.t_max_nproc; ++i) {
3063     if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3064       __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3065       team->t.t_dispatch[i].th_disp_buffer = NULL;
3066     }
3067   }
3068   __kmp_free(team->t.t_threads);
3069   __kmp_free(team->t.t_disp_buffer);
3070   __kmp_free(team->t.t_dispatch);
3071   __kmp_free(team->t.t_implicit_task_taskdata);
3072   team->t.t_threads = NULL;
3073   team->t.t_disp_buffer = NULL;
3074   team->t.t_dispatch = NULL;
  team->t.t_implicit_task_taskdata = NULL;
3076 }
3077 
3078 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3079   kmp_info_t **oldThreads = team->t.t_threads;
3080 
3081   __kmp_free(team->t.t_disp_buffer);
3082   __kmp_free(team->t.t_dispatch);
3083   __kmp_free(team->t.t_implicit_task_taskdata);
3084   __kmp_allocate_team_arrays(team, max_nth);
3085 
3086   KMP_MEMCPY(team->t.t_threads, oldThreads,
3087              team->t.t_nproc * sizeof(kmp_info_t *));
3088 
3089   __kmp_free(oldThreads);
3090 }
3091 
3092 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3093 
3094   kmp_r_sched_t r_sched =
3095       __kmp_get_schedule_global(); // get current state of scheduling globals
3096 
3097 #if OMP_40_ENABLED
3098   KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3099 #endif /* OMP_40_ENABLED */
3100 
3101   kmp_internal_control_t g_icvs = {
3102     0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3103     (kmp_int8)__kmp_dflt_nested, // int nested; //internal control
3104     // for nested parallelism (per thread)
3105     (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3106     // adjustment of threads (per thread)
3107     (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3108     // whether blocktime is explicitly set
3109     __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3110 #if KMP_USE_MONITOR
3111     __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3112 // intervals
3113 #endif
3114     __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3115     // next parallel region (per thread)
3116     // (use a max ub on value if __kmp_parallel_initialize not called yet)
3117     __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3118     // for max_active_levels
3119     r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3120 // {sched,chunk} pair
3121 #if OMP_40_ENABLED
3122     __kmp_nested_proc_bind.bind_types[0],
3123     __kmp_default_device,
3124 #endif /* OMP_40_ENABLED */
3125     NULL // struct kmp_internal_control *next;
3126   };
3127 
3128   return g_icvs;
3129 }
3130 
3131 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3132 
3133   kmp_internal_control_t gx_icvs;
3134   gx_icvs.serial_nesting_level =
      0; // probably = team->t.t_serialized, like in save_internal_controls
3136   copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3137   gx_icvs.next = NULL;
3138 
3139   return gx_icvs;
3140 }
3141 
3142 static void __kmp_initialize_root(kmp_root_t *root) {
3143   int f;
3144   kmp_team_t *root_team;
3145   kmp_team_t *hot_team;
3146   int hot_team_max_nth;
3147   kmp_r_sched_t r_sched =
3148       __kmp_get_schedule_global(); // get current state of scheduling globals
3149   kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3150   KMP_DEBUG_ASSERT(root);
3151   KMP_ASSERT(!root->r.r_begin);
3152 
3153   /* setup the root state structure */
3154   __kmp_init_lock(&root->r.r_begin_lock);
3155   root->r.r_begin = FALSE;
3156   root->r.r_active = FALSE;
3157   root->r.r_in_parallel = 0;
3158   root->r.r_blocktime = __kmp_dflt_blocktime;
3159   root->r.r_nested = __kmp_dflt_nested;
3160   root->r.r_cg_nthreads = 1;
3161 
3162   /* setup the root team for this task */
3163   /* allocate the root team structure */
3164   KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3165 
3166   root_team =
3167       __kmp_allocate_team(root,
3168                           1, // new_nproc
3169                           1, // max_nproc
3170 #if OMPT_SUPPORT
3171                           ompt_data_none, // root parallel id
3172 #endif
3173 #if OMP_40_ENABLED
3174                           __kmp_nested_proc_bind.bind_types[0],
3175 #endif
3176                           &r_icvs,
3177                           0 // argc
3178                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3179                           );
3180 #if USE_DEBUGGER
3181   // Non-NULL value should be assigned to make the debugger display the root
3182   // team.
3183   TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3184 #endif
3185 
3186   KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3187 
3188   root->r.r_root_team = root_team;
3189   root_team->t.t_control_stack_top = NULL;
3190 
3191   /* initialize root team */
3192   root_team->t.t_threads[0] = NULL;
3193   root_team->t.t_nproc = 1;
3194   root_team->t.t_serialized = 1;
3195   // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3196   root_team->t.t_sched.r_sched_type = r_sched.r_sched_type;
3197   root_team->t.t_sched.chunk = r_sched.chunk;
3198   KA_TRACE(
3199       20,
3200       ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3201        root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3202 
3203   /* setup the  hot team for this task */
3204   /* allocate the hot team structure */
3205   KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3206 
3207   hot_team =
3208       __kmp_allocate_team(root,
3209                           1, // new_nproc
3210                           __kmp_dflt_team_nth_ub * 2, // max_nproc
3211 #if OMPT_SUPPORT
3212                           ompt_data_none, // root parallel id
3213 #endif
3214 #if OMP_40_ENABLED
3215                           __kmp_nested_proc_bind.bind_types[0],
3216 #endif
3217                           &r_icvs,
3218                           0 // argc
3219                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3220                           );
3221   KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3222 
3223   root->r.r_hot_team = hot_team;
3224   root_team->t.t_control_stack_top = NULL;
3225 
3226   /* first-time initialization */
3227   hot_team->t.t_parent = root_team;
3228 
3229   /* initialize hot team */
3230   hot_team_max_nth = hot_team->t.t_max_nproc;
3231   for (f = 0; f < hot_team_max_nth; ++f) {
3232     hot_team->t.t_threads[f] = NULL;
3233   }
3234   hot_team->t.t_nproc = 1;
3235   // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3236   hot_team->t.t_sched.r_sched_type = r_sched.r_sched_type;
3237   hot_team->t.t_sched.chunk = r_sched.chunk;
3238   hot_team->t.t_size_changed = 0;
3239 }
3240 
3241 #ifdef KMP_DEBUG
3242 
3243 typedef struct kmp_team_list_item {
3244   kmp_team_p const *entry;
3245   struct kmp_team_list_item *next;
3246 } kmp_team_list_item_t;
3247 typedef kmp_team_list_item_t *kmp_team_list_t;
3248 
3249 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3250     kmp_team_list_t list, // List of teams.
3251     kmp_team_p const *team // Team to add.
3252     ) {
3253 
3254   // List must terminate with item where both entry and next are NULL.
3255   // Team is added to the list only once.
3256   // List is sorted in ascending order by team id.
3257   // Team id is *not* a key.
3258 
3259   kmp_team_list_t l;
3260 
3261   KMP_DEBUG_ASSERT(list != NULL);
3262   if (team == NULL) {
3263     return;
3264   }
3265 
3266   __kmp_print_structure_team_accum(list, team->t.t_parent);
3267   __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3268 
3269   // Search list for the team.
3270   l = list;
3271   while (l->next != NULL && l->entry != team) {
3272     l = l->next;
3273   }
3274   if (l->next != NULL) {
3275     return; // Team has been added before, exit.
3276   }
3277 
3278   // Team is not found. Search list again for insertion point.
3279   l = list;
3280   while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3281     l = l->next;
3282   }
3283 
3284   // Insert team.
3285   {
3286     kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3287         sizeof(kmp_team_list_item_t));
3288     *item = *l;
3289     l->entry = team;
3290     l->next = item;
3291   }
3292 }
3293 
3294 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team
3295 
3296                                        ) {
3297   __kmp_printf("%s", title);
3298   if (team != NULL) {
3299     __kmp_printf("%2x %p\n", team->t.t_id, team);
3300   } else {
3301     __kmp_printf(" - (nil)\n");
3302   }
3303 }
3304 
3305 static void __kmp_print_structure_thread(char const *title,
3306                                          kmp_info_p const *thread) {
3307   __kmp_printf("%s", title);
3308   if (thread != NULL) {
3309     __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3310   } else {
3311     __kmp_printf(" - (nil)\n");
3312   }
3313 }
3314 
3315 void __kmp_print_structure(void) {
3316 
3317   kmp_team_list_t list;
3318 
3319   // Initialize list of teams.
3320   list =
3321       (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3322   list->entry = NULL;
3323   list->next = NULL;
3324 
3325   __kmp_printf("\n------------------------------\nGlobal Thread "
3326                "Table\n------------------------------\n");
3327   {
3328     int gtid;
3329     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3330       __kmp_printf("%2d", gtid);
3331       if (__kmp_threads != NULL) {
3332         __kmp_printf(" %p", __kmp_threads[gtid]);
3333       }
3334       if (__kmp_root != NULL) {
3335         __kmp_printf(" %p", __kmp_root[gtid]);
3336       }
3337       __kmp_printf("\n");
3338     }
3339   }
3340 
3341   // Print out __kmp_threads array.
3342   __kmp_printf("\n------------------------------\nThreads\n--------------------"
3343                "----------\n");
3344   if (__kmp_threads != NULL) {
3345     int gtid;
3346     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3347       kmp_info_t const *thread = __kmp_threads[gtid];
3348       if (thread != NULL) {
3349         __kmp_printf("GTID %2d %p:\n", gtid, thread);
3350         __kmp_printf("    Our Root:        %p\n", thread->th.th_root);
3351         __kmp_print_structure_team("    Our Team:     ", thread->th.th_team);
3352         __kmp_print_structure_team("    Serial Team:  ",
3353                                    thread->th.th_serial_team);
3354         __kmp_printf("    Threads:      %2d\n", thread->th.th_team_nproc);
3355         __kmp_print_structure_thread("    Master:       ",
3356                                      thread->th.th_team_master);
3357         __kmp_printf("    Serialized?:  %2d\n", thread->th.th_team_serialized);
3358         __kmp_printf("    Set NProc:    %2d\n", thread->th.th_set_nproc);
3359 #if OMP_40_ENABLED
3360         __kmp_printf("    Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3361 #endif
3362         __kmp_print_structure_thread("    Next in pool: ",
3363                                      thread->th.th_next_pool);
3364         __kmp_printf("\n");
3365         __kmp_print_structure_team_accum(list, thread->th.th_team);
3366         __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3367       }
3368     }
3369   } else {
3370     __kmp_printf("Threads array is not allocated.\n");
3371   }
3372 
3373   // Print out __kmp_root array.
3374   __kmp_printf("\n------------------------------\nUbers\n----------------------"
3375                "--------\n");
3376   if (__kmp_root != NULL) {
3377     int gtid;
3378     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3379       kmp_root_t const *root = __kmp_root[gtid];
3380       if (root != NULL) {
3381         __kmp_printf("GTID %2d %p:\n", gtid, root);
3382         __kmp_print_structure_team("    Root Team:    ", root->r.r_root_team);
3383         __kmp_print_structure_team("    Hot Team:     ", root->r.r_hot_team);
3384         __kmp_print_structure_thread("    Uber Thread:  ",
3385                                      root->r.r_uber_thread);
3386         __kmp_printf("    Active?:      %2d\n", root->r.r_active);
3387         __kmp_printf("    Nested?:      %2d\n", root->r.r_nested);
3388         __kmp_printf("    In Parallel:  %2d\n", root->r.r_in_parallel);
3389         __kmp_printf("\n");
3390         __kmp_print_structure_team_accum(list, root->r.r_root_team);
3391         __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3392       }
3393     }
3394   } else {
3395     __kmp_printf("Ubers array is not allocated.\n");
3396   }
3397 
3398   __kmp_printf("\n------------------------------\nTeams\n----------------------"
3399                "--------\n");
3400   while (list->next != NULL) {
3401     kmp_team_p const *team = list->entry;
3402     int i;
3403     __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3404     __kmp_print_structure_team("    Parent Team:      ", team->t.t_parent);
3405     __kmp_printf("    Master TID:       %2d\n", team->t.t_master_tid);
3406     __kmp_printf("    Max threads:      %2d\n", team->t.t_max_nproc);
3407     __kmp_printf("    Levels of serial: %2d\n", team->t.t_serialized);
3408     __kmp_printf("    Number threads:   %2d\n", team->t.t_nproc);
3409     for (i = 0; i < team->t.t_nproc; ++i) {
3410       __kmp_printf("    Thread %2d:      ", i);
3411       __kmp_print_structure_thread("", team->t.t_threads[i]);
3412     }
3413     __kmp_print_structure_team("    Next in pool:     ", team->t.t_next_pool);
3414     __kmp_printf("\n");
3415     list = list->next;
3416   }
3417 
3418   // Print out __kmp_thread_pool and __kmp_team_pool.
3419   __kmp_printf("\n------------------------------\nPools\n----------------------"
3420                "--------\n");
3421   __kmp_print_structure_thread("Thread pool:          ",
3422                                CCAST(kmp_info_t *, __kmp_thread_pool));
3423   __kmp_print_structure_team("Team pool:            ",
3424                              CCAST(kmp_team_t *, __kmp_team_pool));
3425   __kmp_printf("\n");
3426 
3427   // Free team list.
3428   while (list != NULL) {
3429     kmp_team_list_item_t *item = list;
3430     list = list->next;
3431     KMP_INTERNAL_FREE(item);
3432   }
3433 }
3434 
3435 #endif
3436 
3437 //---------------------------------------------------------------------------
3438 //  Stuff for per-thread fast random number generator
3439 //  Table of primes
3440 static const unsigned __kmp_primes[] = {
3441     0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3442     0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3443     0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3444     0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3445     0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3446     0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3447     0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3448     0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3449     0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3450     0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3451     0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3452 
3453 //---------------------------------------------------------------------------
3454 //  __kmp_get_random: Get a random number using a linear congruential method.
3455 unsigned short __kmp_get_random(kmp_info_t *thread) {
3456   unsigned x = thread->th.th_x;
3457   unsigned short r = x >> 16;
3458 
3459   thread->th.th_x = x * thread->th.th_a + 1;
3460 
3461   KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3462                 thread->th.th_info.ds.ds_tid, r));
3463 
3464   return r;
3465 }
3466 //--------------------------------------------------------
3467 // __kmp_init_random: Initialize a random number generator
3468 void __kmp_init_random(kmp_info_t *thread) {
3469   unsigned seed = thread->th.th_info.ds.ds_tid;
3470 
3471   thread->th.th_a =
3472       __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3473   thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3474   KA_TRACE(30,
3475            ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3476 }
3477 
3478 #if KMP_OS_WINDOWS
3479 /* reclaim array entries for root threads that are already dead, returns number
3480  * reclaimed */
3481 static int __kmp_reclaim_dead_roots(void) {
3482   int i, r = 0;
3483 
3484   for (i = 0; i < __kmp_threads_capacity; ++i) {
3485     if (KMP_UBER_GTID(i) &&
3486         !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3487         !__kmp_root[i]
3488              ->r.r_active) { // AC: reclaim only roots died in non-active state
3489       r += __kmp_unregister_root_other_thread(i);
3490     }
3491   }
3492   return r;
3493 }
3494 #endif
3495 
3496 /* This function attempts to create free entries in __kmp_threads and
3497    __kmp_root, and returns the number of free entries generated.
3498 
3499    For Windows* OS static library, the first mechanism used is to reclaim array
3500    entries for root threads that are already dead.
3501 
   On all platforms, expansion is attempted on the arrays __kmp_threads and
3503    __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3504    capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3505    threadprivate cache array has been created. Synchronization with
3506    __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3507 
3508    After any dead root reclamation, if the clipping value allows array expansion
3509    to result in the generation of a total of nWish free slots, the function does
3510    that expansion. If not, but the clipping value allows array expansion to
3511    result in the generation of a total of nNeed free slots, the function does
3512    that expansion. Otherwise, nothing is done beyond the possible initial root
3513    thread reclamation. However, if nNeed is zero, a best-effort attempt is made
3514    to fulfil nWish as far as possible, i.e. the function will attempt to create
3515    as many free slots as possible up to nWish.
3516 
3517    If any argument is negative, the behavior is undefined. */
3518 static int __kmp_expand_threads(int nWish, int nNeed) {
3519   int added = 0;
3520   int old_tp_cached;
3521   int __kmp_actual_max_nth;
3522 
3523   if (nNeed > nWish) /* normalize the arguments */
3524     nWish = nNeed;
3525 #if KMP_OS_WINDOWS && !defined KMP_DYNAMIC_LIB
3526   /* only for Windows static library */
3527   /* reclaim array entries for root threads that are already dead */
3528   added = __kmp_reclaim_dead_roots();
3529 
3530   if (nNeed) {
3531     nNeed -= added;
3532     if (nNeed < 0)
3533       nNeed = 0;
3534   }
3535   if (nWish) {
3536     nWish -= added;
3537     if (nWish < 0)
3538       nWish = 0;
3539   }
3540 #endif
3541   if (nWish <= 0)
3542     return added;
3543 
3544   while (1) {
3545     int nTarget;
3546     int minimumRequiredCapacity;
3547     int newCapacity;
3548     kmp_info_t **newThreads;
3549     kmp_root_t **newRoot;
3550 
3551     // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3552     // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3553     // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3554     // > __kmp_max_nth in one of two ways:
3555     //
3556     // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
    //    may not be reused by another thread, so we may need to increase
3558     //    __kmp_threads_capacity to __kmp_max_nth + 1.
3559     //
3560     // 2) New foreign root(s) are encountered.  We always register new foreign
3561     //    roots. This may cause a smaller # of threads to be allocated at
3562     //    subsequent parallel regions, but the worker threads hang around (and
3563     //    eventually go to sleep) and need slots in the __kmp_threads[] array.
3564     //
3565     // Anyway, that is the reason for moving the check to see if
3566     // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3567     // instead of having it performed here. -BB
3568     old_tp_cached = __kmp_tp_cached;
3569     __kmp_actual_max_nth =
3570         old_tp_cached ? __kmp_tp_capacity : __kmp_sys_max_nth;
3571     KMP_DEBUG_ASSERT(__kmp_actual_max_nth >= __kmp_threads_capacity);
3572 
3573     /* compute expansion headroom to check if we can expand and whether to aim
3574        for nWish or nNeed */
3575     nTarget = nWish;
3576     if (__kmp_actual_max_nth - __kmp_threads_capacity < nTarget) {
3577       /* can't fulfil nWish, so try nNeed */
3578       if (nNeed) {
3579         nTarget = nNeed;
3580         if (__kmp_actual_max_nth - __kmp_threads_capacity < nTarget) {
3581           /* possible expansion too small -- give up */
3582           break;
3583         }
3584       } else {
3585         /* best-effort */
3586         nTarget = __kmp_actual_max_nth - __kmp_threads_capacity;
3587         if (!nTarget) {
          /* can't expand at all -- give up */
3589           break;
3590         }
3591       }
3592     }
3593     minimumRequiredCapacity = __kmp_threads_capacity + nTarget;
3594 
3595     newCapacity = __kmp_threads_capacity;
3596     do {
3597       newCapacity = newCapacity <= (__kmp_actual_max_nth >> 1)
3598                         ? (newCapacity << 1)
3599                         : __kmp_actual_max_nth;
3600     } while (newCapacity < minimumRequiredCapacity);
3601     newThreads = (kmp_info_t **)__kmp_allocate(
3602         (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity +
3603         CACHE_LINE);
3604     newRoot = (kmp_root_t **)((char *)newThreads +
3605                               sizeof(kmp_info_t *) * newCapacity);
3606     KMP_MEMCPY(newThreads, __kmp_threads,
3607                __kmp_threads_capacity * sizeof(kmp_info_t *));
3608     KMP_MEMCPY(newRoot, __kmp_root,
3609                __kmp_threads_capacity * sizeof(kmp_root_t *));
3610     memset(newThreads + __kmp_threads_capacity, 0,
3611            (newCapacity - __kmp_threads_capacity) * sizeof(kmp_info_t *));
3612     memset(newRoot + __kmp_threads_capacity, 0,
3613            (newCapacity - __kmp_threads_capacity) * sizeof(kmp_root_t *));
3614 
3615     if (!old_tp_cached && __kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3616       /* __kmp_tp_cached has changed, i.e. __kmpc_threadprivate_cached has
3617          allocated a threadprivate cache while we were allocating the expanded
3618          array, and our new capacity is larger than the threadprivate cache
3619          capacity, so we should deallocate the expanded arrays and try again.
3620          This is the first check of a double-check pair. */
3621       __kmp_free(newThreads);
3622       continue; /* start over and try again */
3623     }
3624     __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3625     if (!old_tp_cached && __kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3626       /* Same check as above, but this time with the lock so we can be sure if
3627          we can succeed. */
3628       __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3629       __kmp_free(newThreads);
3630       continue; /* start over and try again */
3631     } else {
3632       /* success */
3633       // __kmp_free( __kmp_threads ); // ATT: It leads to crash. Need to be
3634       // investigated.
3635       *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3636       *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3637       added += newCapacity - __kmp_threads_capacity;
3638       *(volatile int *)&__kmp_threads_capacity = newCapacity;
3639       __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3640       break; /* succeeded, so we can exit the loop */
3641     }
3642   }
3643   return added;
3644 }
3645 
3646 /* Register the current thread as a root thread and obtain our gtid. We must
3647    have the __kmp_initz_lock held at this point. Argument TRUE only if are the
3648    thread that calls from __kmp_do_serial_initialize() */
3649 int __kmp_register_root(int initial_thread) {
3650   kmp_info_t *root_thread;
3651   kmp_root_t *root;
3652   int gtid;
3653   int capacity;
3654   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3655   KA_TRACE(20, ("__kmp_register_root: entered\n"));
3656   KMP_MB();
3657 
3658   /* 2007-03-02:
     If the initial thread has not invoked the OpenMP RTL yet, and this thread
     is not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity"
     condition does not work as expected -- it may return false (meaning there
     is at least one empty slot in the __kmp_threads array), but it is
     possible that the only free slot is #0, which is reserved for the initial
     thread and so cannot be used for this one. The following code works
     around this bug.

     However, the right solution seems to be not to reserve slot #0 for the
     initial thread, because:
     (1) there is no magic in slot #0,
     (2) we cannot detect the initial thread reliably (the first thread that
         performs serial initialization may not be the real initial thread).
3671   */
3672   capacity = __kmp_threads_capacity;
3673   if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3674     --capacity;
3675   }
3676 
3677   /* see if there are too many threads */
3678   if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1, 1)) {
3679     if (__kmp_tp_cached) {
3680       __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3681                   KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3682                   KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3683     } else {
3684       __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3685                   __kmp_msg_null);
3686     }
3687   }
3688 
3689   /* find an available thread slot */
3690   /* Don't reassign the zero slot since we need that to only be used by initial
3691      thread */
3692   for (gtid = (initial_thread ? 0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL;
3693        gtid++)
3694     ;
3695   KA_TRACE(1,
3696            ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3697   KMP_ASSERT(gtid < __kmp_threads_capacity);
3698 
3699   /* update global accounting */
3700   __kmp_all_nth++;
3701   TCW_4(__kmp_nth, __kmp_nth + 1);
3702 
3703   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3704   // numbers of procs, and method #2 (keyed API call) for higher numbers.
3705   if (__kmp_adjust_gtid_mode) {
3706     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3707       if (TCR_4(__kmp_gtid_mode) != 2) {
3708         TCW_4(__kmp_gtid_mode, 2);
3709       }
3710     } else {
3711       if (TCR_4(__kmp_gtid_mode) != 1) {
3712         TCW_4(__kmp_gtid_mode, 1);
3713       }
3714     }
3715   }
3716 
3717 #ifdef KMP_ADJUST_BLOCKTIME
3718   /* Adjust blocktime to zero if necessary            */
3719   /* Middle initialization might not have occurred yet */
3720   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3721     if (__kmp_nth > __kmp_avail_proc) {
3722       __kmp_zero_bt = TRUE;
3723     }
3724   }
3725 #endif /* KMP_ADJUST_BLOCKTIME */
3726 
3727   /* setup this new hierarchy */
3728   if (!(root = __kmp_root[gtid])) {
3729     root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3730     KMP_DEBUG_ASSERT(!root->r.r_root_team);
3731   }
3732 
3733 #if KMP_STATS_ENABLED
3734   // Initialize stats as soon as possible (right after gtid assignment).
3735   __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3736   KMP_START_EXPLICIT_TIMER(OMP_worker_thread_life);
3737   KMP_SET_THREAD_STATE(SERIAL_REGION);
3738   KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3739 #endif
3740   __kmp_initialize_root(root);
3741 
3742   /* setup new root thread structure */
3743   if (root->r.r_uber_thread) {
3744     root_thread = root->r.r_uber_thread;
3745   } else {
3746     root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3747     if (__kmp_storage_map) {
3748       __kmp_print_thread_storage_map(root_thread, gtid);
3749     }
3750     root_thread->th.th_info.ds.ds_gtid = gtid;
3751 #if OMPT_SUPPORT
3752     root_thread->th.ompt_thread_info.thread_data.ptr = NULL;
3753 #endif
3754     root_thread->th.th_root = root;
3755     if (__kmp_env_consistency_check) {
3756       root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3757     }
3758 #if USE_FAST_MEMORY
3759     __kmp_initialize_fast_memory(root_thread);
3760 #endif /* USE_FAST_MEMORY */
3761 
3762 #if KMP_USE_BGET
3763     KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3764     __kmp_initialize_bget(root_thread);
3765 #endif
3766     __kmp_init_random(root_thread); // Initialize random number generator
3767   }
3768 
3769   /* setup the serial team held in reserve by the root thread */
3770   if (!root_thread->th.th_serial_team) {
3771     kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3772     KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3773     root_thread->th.th_serial_team =
3774         __kmp_allocate_team(root, 1, 1,
3775 #if OMPT_SUPPORT
3776                             ompt_data_none, // root parallel id
3777 #endif
3778 #if OMP_40_ENABLED
3779                             proc_bind_default,
3780 #endif
3781                             &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3782   }
3783   KMP_ASSERT(root_thread->th.th_serial_team);
3784   KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3785                 root_thread->th.th_serial_team));
3786 
3787   /* drop root_thread into place */
3788   TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3789 
3790   root->r.r_root_team->t.t_threads[0] = root_thread;
3791   root->r.r_hot_team->t.t_threads[0] = root_thread;
3792   root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3793   // AC: the team created in reserve, not for execution (it is unused for now).
3794   root_thread->th.th_serial_team->t.t_serialized = 0;
3795   root->r.r_uber_thread = root_thread;
3796 
3797   /* initialize the thread, get it ready to go */
3798   __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3799   TCW_4(__kmp_init_gtid, TRUE);
3800 
3801   /* prepare the master thread for get_gtid() */
3802   __kmp_gtid_set_specific(gtid);
3803 
3804 #if USE_ITT_BUILD
3805   __kmp_itt_thread_name(gtid);
3806 #endif /* USE_ITT_BUILD */
3807 
3808 #ifdef KMP_TDATA_GTID
3809   __kmp_gtid = gtid;
3810 #endif
3811   __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3812   KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3813 
3814   KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3815                 "plain=%u\n",
3816                 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3817                 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3818                 KMP_INIT_BARRIER_STATE));
3819   { // Initialize barrier data.
3820     int b;
3821     for (b = 0; b < bs_last_barrier; ++b) {
3822       root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3823 #if USE_DEBUGGER
3824       root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3825 #endif
3826     }
3827   }
3828   KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3829                    KMP_INIT_BARRIER_STATE);
3830 
3831 #if KMP_AFFINITY_SUPPORTED
3832 #if OMP_40_ENABLED
3833   root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3834   root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3835   root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3836   root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3837 #endif
3838 
3839   if (TCR_4(__kmp_init_middle)) {
3840     __kmp_affinity_set_init_mask(gtid, TRUE);
3841   }
3842 #endif /* KMP_AFFINITY_SUPPORTED */
3843 
3844   __kmp_root_counter++;
3845 
3846 #if OMPT_SUPPORT
3847   if (!initial_thread && ompt_enabled.enabled) {
3848 
3849     ompt_thread_t *root_thread = ompt_get_thread();
3850 
3851     ompt_set_thread_state(root_thread, omp_state_overhead);
3852 
3853     if (ompt_enabled.ompt_callback_thread_begin) {
3854       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3855           ompt_thread_initial, __ompt_get_thread_data_internal());
3856     }
3857     ompt_data_t *task_data;
3858     __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
3859     if (ompt_enabled.ompt_callback_task_create) {
3860       ompt_callbacks.ompt_callback(ompt_callback_task_create)(
3861           NULL, NULL, task_data, ompt_task_initial, 0, NULL);
3862       // initial task has nothing to return to
3863     }
3864 
3865     ompt_set_thread_state(root_thread, omp_state_work_serial);
3866   }
3867 #endif
3868 
3869   KMP_MB();
3870   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3871 
3872   return gtid;
3873 }
3874 
3875 #if KMP_NESTED_HOT_TEAMS
3876 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3877                                 const int max_level) {
3878   int i, n, nth;
3879   kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3880   if (!hot_teams || !hot_teams[level].hot_team) {
3881     return 0;
3882   }
3883   KMP_DEBUG_ASSERT(level < max_level);
3884   kmp_team_t *team = hot_teams[level].hot_team;
3885   nth = hot_teams[level].hot_team_nth;
3886   n = nth - 1; // master is not freed
3887   if (level < max_level - 1) {
3888     for (i = 0; i < nth; ++i) {
3889       kmp_info_t *th = team->t.t_threads[i];
3890       n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3891       if (i > 0 && th->th.th_hot_teams) {
3892         __kmp_free(th->th.th_hot_teams);
3893         th->th.th_hot_teams = NULL;
3894       }
3895     }
3896   }
3897   __kmp_free_team(root, team, NULL);
3898   return n;
3899 }
3900 #endif
3901 
// Resets a root thread and clears its root and hot teams.
3903 // Returns the number of __kmp_threads entries directly and indirectly freed.
3904 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3905   kmp_team_t *root_team = root->r.r_root_team;
3906   kmp_team_t *hot_team = root->r.r_hot_team;
3907   int n = hot_team->t.t_nproc;
3908   int i;
3909 
3910   KMP_DEBUG_ASSERT(!root->r.r_active);
3911 
3912   root->r.r_root_team = NULL;
3913   root->r.r_hot_team = NULL;
3914   // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3915   // before call to __kmp_free_team().
3916   __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3917 #if KMP_NESTED_HOT_TEAMS
3918   if (__kmp_hot_teams_max_level >
3919       0) { // need to free nested hot teams and their threads if any
3920     for (i = 0; i < hot_team->t.t_nproc; ++i) {
3921       kmp_info_t *th = hot_team->t.t_threads[i];
3922       if (__kmp_hot_teams_max_level > 1) {
3923         n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3924       }
3925       if (th->th.th_hot_teams) {
3926         __kmp_free(th->th.th_hot_teams);
3927         th->th.th_hot_teams = NULL;
3928       }
3929     }
3930   }
3931 #endif
3932   __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3933 
3934   // Before we can reap the thread, we need to make certain that all other
3935   // threads in the teams that had this root as ancestor have stopped trying to
3936   // steal tasks.
3937   if (__kmp_tasking_mode != tskm_immediate_exec) {
3938     __kmp_wait_to_unref_task_teams();
3939   }
3940 
3941 #if KMP_OS_WINDOWS
3942   /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3943   KA_TRACE(
3944       10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3945            "\n",
3946            (LPVOID) & (root->r.r_uber_thread->th),
3947            root->r.r_uber_thread->th.th_info.ds.ds_thread));
3948   __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3949 #endif /* KMP_OS_WINDOWS */
3950 
3951 #if OMPT_SUPPORT
3952   if (ompt_enabled.ompt_callback_thread_end) {
3953     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3954         &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3955   }
3956 #endif
3957 
3958   TCW_4(__kmp_nth,
3959         __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3960   root->r.r_cg_nthreads--;
3961 
3962   __kmp_reap_thread(root->r.r_uber_thread, 1);
3963 
  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
  // it instead of freeing it.
3966   root->r.r_uber_thread = NULL;
3967   /* mark root as no longer in use */
3968   root->r.r_begin = FALSE;
3969 
3970   return n;
3971 }
3972 
3973 void __kmp_unregister_root_current_thread(int gtid) {
3974   KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
3975   /* this lock should be ok, since unregister_root_current_thread is never
3976      called during an abort, only during a normal close. furthermore, if you
3977      have the forkjoin lock, you should never try to get the initz lock */
3978   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3979   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
3980     KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
3981                   "exiting T#%d\n",
3982                   gtid));
3983     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3984     return;
3985   }
3986   kmp_root_t *root = __kmp_root[gtid];
3987 
3988   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3989   KMP_ASSERT(KMP_UBER_GTID(gtid));
3990   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3991   KMP_ASSERT(root->r.r_active == FALSE);
3992 
3993   KMP_MB();
3994 
3995 #if OMP_45_ENABLED
3996   kmp_info_t *thread = __kmp_threads[gtid];
3997   kmp_team_t *team = thread->th.th_team;
3998   kmp_task_team_t *task_team = thread->th.th_task_team;
3999 
4000   // we need to wait for the proxy tasks before finishing the thread
4001   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
4002 #if OMPT_SUPPORT
4003     // the runtime is shutting down so we won't report any events
4004     thread->th.ompt_thread_info.state = omp_state_undefined;
4005 #endif
4006     __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4007   }
4008 #endif
4009 
4010   __kmp_reset_root(gtid, root);
4011 
4012   /* free up this thread slot */
4013   __kmp_gtid_set_specific(KMP_GTID_DNE);
4014 #ifdef KMP_TDATA_GTID
4015   __kmp_gtid = KMP_GTID_DNE;
4016 #endif
4017 
4018   KMP_MB();
4019   KC_TRACE(10,
4020            ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4021 
4022   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4023 }
4024 
4025 #if KMP_OS_WINDOWS
4026 /* __kmp_forkjoin_lock must be already held
4027    Unregisters a root thread that is not the current thread.  Returns the number
4028    of __kmp_threads entries freed as a result. */
4029 static int __kmp_unregister_root_other_thread(int gtid) {
4030   kmp_root_t *root = __kmp_root[gtid];
4031   int r;
4032 
4033   KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4034   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4035   KMP_ASSERT(KMP_UBER_GTID(gtid));
4036   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4037   KMP_ASSERT(root->r.r_active == FALSE);
4038 
4039   r = __kmp_reset_root(gtid, root);
4040   KC_TRACE(10,
4041            ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4042   return r;
4043 }
4044 #endif
4045 
4046 #if KMP_DEBUG
4047 void __kmp_task_info() {
4048 
4049   kmp_int32 gtid = __kmp_entry_gtid();
4050   kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4051   kmp_info_t *this_thr = __kmp_threads[gtid];
4052   kmp_team_t *steam = this_thr->th.th_serial_team;
4053   kmp_team_t *team = this_thr->th.th_team;
4054 
4055   __kmp_printf("__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p curtask=%p "
4056                "ptask=%p\n",
4057                gtid, tid, this_thr, team, this_thr->th.th_current_task,
4058                team->t.t_implicit_task_taskdata[tid].td_parent);
4059 }
4060 #endif // KMP_DEBUG
4061 
4062 /* TODO optimize with one big memclr, take out what isn't needed, split
4063    responsibility to workers as much as possible, and delay initialization of
4064    features as much as possible  */
4065 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4066                                   int tid, int gtid) {
4067   /* this_thr->th.th_info.ds.ds_gtid is setup in
4068      kmp_allocate_thread/create_worker.
4069      this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4070   kmp_info_t *master = team->t.t_threads[0];
4071   KMP_DEBUG_ASSERT(this_thr != NULL);
4072   KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4073   KMP_DEBUG_ASSERT(team);
4074   KMP_DEBUG_ASSERT(team->t.t_threads);
4075   KMP_DEBUG_ASSERT(team->t.t_dispatch);
4076   KMP_DEBUG_ASSERT(master);
4077   KMP_DEBUG_ASSERT(master->th.th_root);
4078 
4079   KMP_MB();
4080 
4081   TCW_SYNC_PTR(this_thr->th.th_team, team);
4082 
4083   this_thr->th.th_info.ds.ds_tid = tid;
4084   this_thr->th.th_set_nproc = 0;
4085   if (__kmp_tasking_mode != tskm_immediate_exec)
4086     // When tasking is possible, threads are not safe to reap until they are
4087     // done tasking; this will be set when tasking code is exited in wait
4088     this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4089   else // no tasking --> always safe to reap
4090     this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4091 #if OMP_40_ENABLED
4092   this_thr->th.th_set_proc_bind = proc_bind_default;
4093 #if KMP_AFFINITY_SUPPORTED
4094   this_thr->th.th_new_place = this_thr->th.th_current_place;
4095 #endif
4096 #endif
4097   this_thr->th.th_root = master->th.th_root;
4098 
4099   /* setup the thread's cache of the team structure */
4100   this_thr->th.th_team_nproc = team->t.t_nproc;
4101   this_thr->th.th_team_master = master;
4102   this_thr->th.th_team_serialized = team->t.t_serialized;
4103   TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4104 
4105   KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4106 
4107   KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4108                 tid, gtid, this_thr, this_thr->th.th_current_task));
4109 
4110   __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4111                            team, tid, TRUE);
4112 
4113   KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4114                 tid, gtid, this_thr, this_thr->th.th_current_task));
4115   // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4116   // __kmp_initialize_team()?
4117 
4118   /* TODO no worksharing in speculative threads */
4119   this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4120 
4121   this_thr->th.th_local.this_construct = 0;
4122 
4123   if (!this_thr->th.th_pri_common) {
4124     this_thr->th.th_pri_common =
4125         (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4126     if (__kmp_storage_map) {
4127       __kmp_print_storage_map_gtid(
4128           gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4129           sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4130     }
4131     this_thr->th.th_pri_head = NULL;
4132   }
4133 
4134   /* Initialize dynamic dispatch */
4135   {
4136     volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4137     // Use team max_nproc since this will never change for the team.
4138     size_t disp_size =
4139         sizeof(dispatch_private_info_t) *
4140         (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4141     KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4142                   team->t.t_max_nproc));
4143     KMP_ASSERT(dispatch);
4144     KMP_DEBUG_ASSERT(team->t.t_dispatch);
4145     KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4146 
4147     dispatch->th_disp_index = 0;
4148 #if OMP_45_ENABLED
4149     dispatch->th_doacross_buf_idx = 0;
4150 #endif
4151     if (!dispatch->th_disp_buffer) {
4152       dispatch->th_disp_buffer =
4153           (dispatch_private_info_t *)__kmp_allocate(disp_size);
4154 
4155       if (__kmp_storage_map) {
4156         __kmp_print_storage_map_gtid(
4157             gtid, &dispatch->th_disp_buffer[0],
4158             &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4159                                           ? 1
4160                                           : __kmp_dispatch_num_buffers],
4161             disp_size, "th_%d.th_dispatch.th_disp_buffer "
4162                        "(team_%d.t_dispatch[%d].th_disp_buffer)",
4163             gtid, team->t.t_id, gtid);
4164       }
4165     } else {
4166       memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4167     }
4168 
4169     dispatch->th_dispatch_pr_current = 0;
4170     dispatch->th_dispatch_sh_current = 0;
4171 
4172     dispatch->th_deo_fcn = 0; /* ORDERED     */
4173     dispatch->th_dxo_fcn = 0; /* END ORDERED */
4174   }
4175 
4176   this_thr->th.th_next_pool = NULL;
4177 
4178   if (!this_thr->th.th_task_state_memo_stack) {
4179     size_t i;
4180     this_thr->th.th_task_state_memo_stack =
4181         (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4182     this_thr->th.th_task_state_top = 0;
4183     this_thr->th.th_task_state_stack_sz = 4;
4184     for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4185          ++i) // zero init the stack
4186       this_thr->th.th_task_state_memo_stack[i] = 0;
4187   }
4188 
4189   KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4190   KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4191 
4192   KMP_MB();
4193 }
4194 
4195 /* allocate a new thread for the requesting team. this is only called from
4196    within a forkjoin critical section. we will first try to get an available
   thread from the thread pool. if none is available, we will fork a new one,
   assuming we are able to create one; this should be assured, since the
   caller is expected to have checked for available capacity first. */
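/* Illustrative sketch only (not part of the runtime): assuming the caller
   already holds the forkjoin critical section and has verified capacity, the
   expected call pattern is roughly:

     kmp_info_t *th = __kmp_allocate_thread(root, team, tid);
     KMP_DEBUG_ASSERT(th != NULL); // creation is assumed to succeed here
*/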
4200 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4201                                   int new_tid) {
4202   kmp_team_t *serial_team;
4203   kmp_info_t *new_thr;
4204   int new_gtid;
4205 
4206   KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4207   KMP_DEBUG_ASSERT(root && team);
4208 #if !KMP_NESTED_HOT_TEAMS
4209   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4210 #endif
4211   KMP_MB();
4212 
4213   /* first, try to get one from the thread pool */
4214   if (__kmp_thread_pool) {
4215 
4216     new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4217     __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4218     if (new_thr == __kmp_thread_pool_insert_pt) {
4219       __kmp_thread_pool_insert_pt = NULL;
4220     }
4221     TCW_4(new_thr->th.th_in_pool, FALSE);
4222     // Don't touch th_active_in_pool or th_active.
4223     // The worker thread adjusts those flags as it sleeps/awakens.
4224     __kmp_thread_pool_nth--;
4225 
4226     KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4227                   __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4228     KMP_ASSERT(!new_thr->th.th_team);
4229     KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4230     KMP_DEBUG_ASSERT(__kmp_thread_pool_nth >= 0);
4231 
4232     /* setup the thread structure */
4233     __kmp_initialize_info(new_thr, team, new_tid,
4234                           new_thr->th.th_info.ds.ds_gtid);
4235     KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4236 
4237     TCW_4(__kmp_nth, __kmp_nth + 1);
4238     root->r.r_cg_nthreads++;
4239 
4240     new_thr->th.th_task_state = 0;
4241     new_thr->th.th_task_state_top = 0;
4242     new_thr->th.th_task_state_stack_sz = 4;
4243 
4244 #ifdef KMP_ADJUST_BLOCKTIME
4245     /* Adjust blocktime back to zero if necessary */
4246     /* Middle initialization might not have occurred yet */
4247     if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4248       if (__kmp_nth > __kmp_avail_proc) {
4249         __kmp_zero_bt = TRUE;
4250       }
4251     }
4252 #endif /* KMP_ADJUST_BLOCKTIME */
4253 
4254 #if KMP_DEBUG
4255     // If thread entered pool via __kmp_free_thread, wait_flag should !=
4256     // KMP_BARRIER_PARENT_FLAG.
4257     int b;
4258     kmp_balign_t *balign = new_thr->th.th_bar;
4259     for (b = 0; b < bs_last_barrier; ++b)
4260       KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4261 #endif
4262 
4263     KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4264                   __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4265 
4266     KMP_MB();
4267     return new_thr;
4268   }
4269 
  /* no, we'll fork a new one */
4271   KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4272   KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4273 
4274 #if KMP_USE_MONITOR
4275   // If this is the first worker thread the RTL is creating, then also
4276   // launch the monitor thread.  We try to do this as early as possible.
4277   if (!TCR_4(__kmp_init_monitor)) {
4278     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4279     if (!TCR_4(__kmp_init_monitor)) {
4280       KF_TRACE(10, ("before __kmp_create_monitor\n"));
4281       TCW_4(__kmp_init_monitor, 1);
4282       __kmp_create_monitor(&__kmp_monitor);
4283       KF_TRACE(10, ("after __kmp_create_monitor\n"));
4284 #if KMP_OS_WINDOWS
4285       // AC: wait until monitor has started. This is a fix for CQ232808.
4286       // The reason is that if the library is loaded/unloaded in a loop with
      // small (parallel) work in between, then there is a high probability
      // that the monitor thread starts after the library shutdown. At shutdown
      // it is too late to cope with the problem, because when the master is in
      // DllMain (process detach) the monitor has no chance to start (it is
      // blocked), and the master has no means to inform the monitor that the
      // library has gone, because all the memory the monitor can access is
      // about to be released/reset.
4294       while (TCR_4(__kmp_init_monitor) < 2) {
4295         KMP_YIELD(TRUE);
4296       }
4297       KF_TRACE(10, ("after monitor thread has started\n"));
4298 #endif
4299     }
4300     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4301   }
4302 #endif
4303 
4304   KMP_MB();
4305   for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) {
4306     KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4307   }
4308 
4309   /* allocate space for it. */
4310   new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4311 
4312   TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4313 
4314   if (__kmp_storage_map) {
4315     __kmp_print_thread_storage_map(new_thr, new_gtid);
4316   }
4317 
4318   // add the reserve serialized team, initialized from the team's master thread
4319   {
4320     kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4321     KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4322     new_thr->th.th_serial_team = serial_team =
4323         (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4324 #if OMPT_SUPPORT
4325                                           ompt_data_none, // root parallel id
4326 #endif
4327 #if OMP_40_ENABLED
4328                                           proc_bind_default,
4329 #endif
4330                                           &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
4331   }
4332   KMP_ASSERT(serial_team);
  serial_team->t.t_serialized = 0; // AC: the team is created in reserve, not
  // for execution (it is unused for now).
4335   serial_team->t.t_threads[0] = new_thr;
4336   KF_TRACE(10,
4337            ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4338             new_thr));
4339 
4340   /* setup the thread structures */
4341   __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4342 
4343 #if USE_FAST_MEMORY
4344   __kmp_initialize_fast_memory(new_thr);
4345 #endif /* USE_FAST_MEMORY */
4346 
4347 #if KMP_USE_BGET
4348   KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4349   __kmp_initialize_bget(new_thr);
4350 #endif
4351 
4352   __kmp_init_random(new_thr); // Initialize random number generator
4353 
4354   /* Initialize these only once when thread is grabbed for a team allocation */
4355   KA_TRACE(20,
4356            ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4357             __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4358 
4359   int b;
4360   kmp_balign_t *balign = new_thr->th.th_bar;
4361   for (b = 0; b < bs_last_barrier; ++b) {
4362     balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4363     balign[b].bb.team = NULL;
4364     balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4365     balign[b].bb.use_oncore_barrier = 0;
4366   }
4367 
4368   new_thr->th.th_spin_here = FALSE;
4369   new_thr->th.th_next_waiting = 0;
4370 
4371 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4372   new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4373   new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4374   new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4375   new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4376 #endif
4377 
4378   TCW_4(new_thr->th.th_in_pool, FALSE);
4379   new_thr->th.th_active_in_pool = FALSE;
4380   TCW_4(new_thr->th.th_active, TRUE);
4381 
4382   /* adjust the global counters */
4383   __kmp_all_nth++;
4384   __kmp_nth++;
4385 
4386   root->r.r_cg_nthreads++;
4387 
4388   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4389   // numbers of procs, and method #2 (keyed API call) for higher numbers.
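  // Illustrative example (hypothetical threshold): if __kmp_tls_gtid_min were
  // 20, the runtime would keep gtid mode 1 (stack-pointer search) while fewer
  // than 20 threads exist, and switch to mode 2 (keyed TLS lookup) once
  // __kmp_all_nth reaches 20.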
4390   if (__kmp_adjust_gtid_mode) {
4391     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4392       if (TCR_4(__kmp_gtid_mode) != 2) {
4393         TCW_4(__kmp_gtid_mode, 2);
4394       }
4395     } else {
4396       if (TCR_4(__kmp_gtid_mode) != 1) {
4397         TCW_4(__kmp_gtid_mode, 1);
4398       }
4399     }
4400   }
4401 
4402 #ifdef KMP_ADJUST_BLOCKTIME
4403   /* Adjust blocktime back to zero if necessary       */
4404   /* Middle initialization might not have occurred yet */
4405   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4406     if (__kmp_nth > __kmp_avail_proc) {
4407       __kmp_zero_bt = TRUE;
4408     }
4409   }
4410 #endif /* KMP_ADJUST_BLOCKTIME */
4411 
4412   /* actually fork it and create the new worker thread */
4413   KF_TRACE(
4414       10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4415   __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4416   KF_TRACE(10,
4417            ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4418 
4419   KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4420                 new_gtid));
4421   KMP_MB();
4422   return new_thr;
4423 }
4424 
4425 /* Reinitialize team for reuse.
   The hot team code calls this routine at every fork barrier, so the EPCC
   barrier tests are extremely sensitive to changes in it, esp. writes to the
   team struct, which cause a cache invalidation in all threads.
4429    IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4430 static void __kmp_reinitialize_team(kmp_team_t *team,
4431                                     kmp_internal_control_t *new_icvs,
4432                                     ident_t *loc) {
4433   KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4434                 team->t.t_threads[0], team));
4435   KMP_DEBUG_ASSERT(team && new_icvs);
4436   KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4437   KMP_CHECK_UPDATE(team->t.t_ident, loc);
4438 
4439   KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4440   // Copy ICVs to the master thread's implicit taskdata
4441   __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4442   copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4443 
4444   KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4445                 team->t.t_threads[0], team));
4446 }
4447 
4448 /* Initialize the team data structure.
4449    This assumes the t_threads and t_max_nproc are already set.
4450    Also, we don't touch the arguments */
4451 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4452                                   kmp_internal_control_t *new_icvs,
4453                                   ident_t *loc) {
4454   KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4455 
4456   /* verify */
4457   KMP_DEBUG_ASSERT(team);
4458   KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4459   KMP_DEBUG_ASSERT(team->t.t_threads);
4460   KMP_MB();
4461 
4462   team->t.t_master_tid = 0; /* not needed */
4463   /* team->t.t_master_bar;        not needed */
4464   team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4465   team->t.t_nproc = new_nproc;
4466 
4467   /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
4468   team->t.t_next_pool = NULL;
4469   /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4470    * up hot team */
4471 
4472   TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4473   team->t.t_invoke = NULL; /* not needed */
4474 
4475   // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
4476   team->t.t_sched = new_icvs->sched;
4477 
4478 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4479   team->t.t_fp_control_saved = FALSE; /* not needed */
4480   team->t.t_x87_fpu_control_word = 0; /* not needed */
4481   team->t.t_mxcsr = 0; /* not needed */
4482 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4483 
4484   team->t.t_construct = 0;
4485 
4486   team->t.t_ordered.dt.t_value = 0;
4487   team->t.t_master_active = FALSE;
4488 
4489   memset(&team->t.t_taskq, '\0', sizeof(kmp_taskq_t));
4490 
4491 #ifdef KMP_DEBUG
4492   team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4493 #endif
4494   team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4495 
4496   team->t.t_control_stack_top = NULL;
4497 
4498   __kmp_reinitialize_team(team, new_icvs, loc);
4499 
4500   KMP_MB();
4501   KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4502 }
4503 
4504 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4505 /* Sets full mask for thread and returns old mask, no changes to structures. */
4506 static void
4507 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4508   if (KMP_AFFINITY_CAPABLE()) {
4509     int status;
4510     if (old_mask != NULL) {
4511       status = __kmp_get_system_affinity(old_mask, TRUE);
4512       int error = errno;
4513       if (status != 0) {
4514         __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4515                     __kmp_msg_null);
4516       }
4517     }
4518     __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4519   }
4520 }
4521 #endif
4522 
4523 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4524 
// __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
// It calculates the worker and master threads' partitions based upon the
// parent thread's partition, and binds each worker to a place in its
// partition. The master thread's partition should already include its current
// binding.
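// Illustrative example (hypothetical numbers): suppose the master's partition
// is places [0,3] (4 places) and the master is bound to place 1.
//   proc_bind_master: every worker gets th_new_place = 1 (the master's place).
//   proc_bind_close, 4 threads: workers land on places 2, 3 and 0, wrapping
//     within the partition so consecutive threads sit on neighboring places.
//   proc_bind_spread: threads are additionally given disjoint sub-partitions
//     so that nested teams can spread out further.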
4529 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
  // Copy the master thread's place partition to the team struct
4531   kmp_info_t *master_th = team->t.t_threads[0];
4532   KMP_DEBUG_ASSERT(master_th != NULL);
4533   kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4534   int first_place = master_th->th.th_first_place;
4535   int last_place = master_th->th.th_last_place;
4536   int masters_place = master_th->th.th_current_place;
4537   team->t.t_first_place = first_place;
4538   team->t.t_last_place = last_place;
4539 
4540   KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4541                 "bound to place %d partition = [%d,%d]\n",
4542                 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4543                 team->t.t_id, masters_place, first_place, last_place));
4544 
4545   switch (proc_bind) {
4546 
4547   case proc_bind_default:
    // serial teams might have the proc_bind policy set to proc_bind_default.
    // It doesn't matter, as we don't rebind the master thread for any
    // proc_bind policy
4550     KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4551     break;
4552 
4553   case proc_bind_master: {
4554     int f;
4555     int n_th = team->t.t_nproc;
4556     for (f = 1; f < n_th; f++) {
4557       kmp_info_t *th = team->t.t_threads[f];
4558       KMP_DEBUG_ASSERT(th != NULL);
4559       th->th.th_first_place = first_place;
4560       th->th.th_last_place = last_place;
4561       th->th.th_new_place = masters_place;
4562 
4563       KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
4564                      "partition = [%d,%d]\n",
4565                      __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4566                      f, masters_place, first_place, last_place));
4567     }
4568   } break;
4569 
4570   case proc_bind_close: {
4571     int f;
4572     int n_th = team->t.t_nproc;
4573     int n_places;
4574     if (first_place <= last_place) {
4575       n_places = last_place - first_place + 1;
4576     } else {
4577       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4578     }
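    // Example (hypothetical numbers): with __kmp_affinity_num_masks == 8,
    // first_place == 6 and last_place == 2, the partition wraps around, so
    // n_places = 8 - 6 + 2 + 1 = 5 (places 6, 7, 0, 1, 2).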
4579     if (n_th <= n_places) {
4580       int place = masters_place;
4581       for (f = 1; f < n_th; f++) {
4582         kmp_info_t *th = team->t.t_threads[f];
4583         KMP_DEBUG_ASSERT(th != NULL);
4584 
4585         if (place == last_place) {
4586           place = first_place;
4587         } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4588           place = 0;
4589         } else {
4590           place++;
4591         }
4592         th->th.th_first_place = first_place;
4593         th->th.th_last_place = last_place;
4594         th->th.th_new_place = place;
4595 
4596         KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4597                        "partition = [%d,%d]\n",
4598                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4599                        team->t.t_id, f, place, first_place, last_place));
4600       }
4601     } else {
4602       int S, rem, gap, s_count;
4603       S = n_th / n_places;
4604       s_count = 0;
4605       rem = n_th - (S * n_places);
4606       gap = rem > 0 ? n_places / rem : n_places;
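      // Worked example (hypothetical numbers): n_th = 10, n_places = 4 gives
      // S = 2, rem = 2, gap = 2; the loop below then places 3, 2, 3, 2
      // threads on consecutive places starting at the master's place,
      // spreading the remainder every gap-th place.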
4607       int place = masters_place;
4608       int gap_ct = gap;
4609       for (f = 0; f < n_th; f++) {
4610         kmp_info_t *th = team->t.t_threads[f];
4611         KMP_DEBUG_ASSERT(th != NULL);
4612 
4613         th->th.th_first_place = first_place;
4614         th->th.th_last_place = last_place;
4615         th->th.th_new_place = place;
4616         s_count++;
4617 
4618         if ((s_count == S) && rem && (gap_ct == gap)) {
4619           // do nothing, add an extra thread to place on next iteration
4620         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4621           // we added an extra thread to this place; move to next place
4622           if (place == last_place) {
4623             place = first_place;
4624           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4625             place = 0;
4626           } else {
4627             place++;
4628           }
4629           s_count = 0;
4630           gap_ct = 1;
4631           rem--;
4632         } else if (s_count == S) { // place full; don't add extra
4633           if (place == last_place) {
4634             place = first_place;
4635           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4636             place = 0;
4637           } else {
4638             place++;
4639           }
4640           gap_ct++;
4641           s_count = 0;
4642         }
4643 
4644         KA_TRACE(100,
4645                  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4646                   "partition = [%d,%d]\n",
4647                   __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4648                   th->th.th_new_place, first_place, last_place));
4649       }
4650       KMP_DEBUG_ASSERT(place == masters_place);
4651     }
4652   } break;
4653 
4654   case proc_bind_spread: {
4655     int f;
4656     int n_th = team->t.t_nproc;
4657     int n_places;
4658     int thidx;
4659     if (first_place <= last_place) {
4660       n_places = last_place - first_place + 1;
4661     } else {
4662       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4663     }
4664     if (n_th <= n_places) {
4665       int place = -1;
4666 
4667       if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4668         int S = n_places / n_th;
4669         int s_count, rem, gap, gap_ct;
4670 
4671         place = masters_place;
4672         rem = n_places - n_th * S;
4673         gap = rem ? n_th / rem : 1;
4674         gap_ct = gap;
4675         thidx = n_th;
4676         if (update_master_only == 1)
4677           thidx = 1;
4678         for (f = 0; f < thidx; f++) {
4679           kmp_info_t *th = team->t.t_threads[f];
4680           KMP_DEBUG_ASSERT(th != NULL);
4681 
4682           th->th.th_first_place = place;
4683           th->th.th_new_place = place;
4684           s_count = 1;
4685           while (s_count < S) {
4686             if (place == last_place) {
4687               place = first_place;
4688             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4689               place = 0;
4690             } else {
4691               place++;
4692             }
4693             s_count++;
4694           }
4695           if (rem && (gap_ct == gap)) {
4696             if (place == last_place) {
4697               place = first_place;
4698             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4699               place = 0;
4700             } else {
4701               place++;
4702             }
4703             rem--;
4704             gap_ct = 0;
4705           }
4706           th->th.th_last_place = place;
4707           gap_ct++;
4708 
4709           if (place == last_place) {
4710             place = first_place;
4711           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4712             place = 0;
4713           } else {
4714             place++;
4715           }
4716 
4717           KA_TRACE(100,
4718                    ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4719                     "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4720                     __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4721                     f, th->th.th_new_place, th->th.th_first_place,
4722                     th->th.th_last_place, __kmp_affinity_num_masks));
4723         }
4724       } else {
        /* Given a uniform space of available computation places, we can
           create T partitions of round(P/T) size and put a thread into the
           first place of each partition. */
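        /* Worked example (hypothetical numbers): n_places = 8, n_th = 4 and
           the master on place 0 give spacing = (8 + 1) / 4 = 2.25, producing
           partitions [0,1], [2,3], [4,5] and [6,7], with each thread bound to
           the first place of its partition. */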
4728         double current = static_cast<double>(masters_place);
4729         double spacing =
4730             (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
4731         int first, last;
4732         kmp_info_t *th;
4733 
4734         thidx = n_th + 1;
4735         if (update_master_only == 1)
4736           thidx = 1;
4737         for (f = 0; f < thidx; f++) {
4738           first = static_cast<int>(current);
4739           last = static_cast<int>(current + spacing) - 1;
4740           KMP_DEBUG_ASSERT(last >= first);
4741           if (first >= n_places) {
4742             if (masters_place) {
4743               first -= n_places;
4744               last -= n_places;
4745               if (first == (masters_place + 1)) {
4746                 KMP_DEBUG_ASSERT(f == n_th);
4747                 first--;
4748               }
4749               if (last == masters_place) {
4750                 KMP_DEBUG_ASSERT(f == (n_th - 1));
4751                 last--;
4752               }
4753             } else {
4754               KMP_DEBUG_ASSERT(f == n_th);
4755               first = 0;
4756               last = 0;
4757             }
4758           }
4759           if (last >= n_places) {
4760             last = (n_places - 1);
4761           }
4762           place = first;
4763           current += spacing;
4764           if (f < n_th) {
4765             KMP_DEBUG_ASSERT(0 <= first);
4766             KMP_DEBUG_ASSERT(n_places > first);
4767             KMP_DEBUG_ASSERT(0 <= last);
4768             KMP_DEBUG_ASSERT(n_places > last);
4769             KMP_DEBUG_ASSERT(last_place >= first_place);
4770             th = team->t.t_threads[f];
4771             KMP_DEBUG_ASSERT(th);
4772             th->th.th_first_place = first;
4773             th->th.th_new_place = place;
4774             th->th.th_last_place = last;
4775 
4776             KA_TRACE(100,
4777                      ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4778                       "partition = [%d,%d], spacing = %.4f\n",
4779                       __kmp_gtid_from_thread(team->t.t_threads[f]),
4780                       team->t.t_id, f, th->th.th_new_place,
4781                       th->th.th_first_place, th->th.th_last_place, spacing));
4782           }
4783         }
4784       }
4785       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4786     } else {
4787       int S, rem, gap, s_count;
4788       S = n_th / n_places;
4789       s_count = 0;
4790       rem = n_th - (S * n_places);
4791       gap = rem > 0 ? n_places / rem : n_places;
4792       int place = masters_place;
4793       int gap_ct = gap;
4794       thidx = n_th;
4795       if (update_master_only == 1)
4796         thidx = 1;
4797       for (f = 0; f < thidx; f++) {
4798         kmp_info_t *th = team->t.t_threads[f];
4799         KMP_DEBUG_ASSERT(th != NULL);
4800 
4801         th->th.th_first_place = place;
4802         th->th.th_last_place = place;
4803         th->th.th_new_place = place;
4804         s_count++;
4805 
4806         if ((s_count == S) && rem && (gap_ct == gap)) {
4807           // do nothing, add an extra thread to place on next iteration
4808         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4809           // we added an extra thread to this place; move on to next place
4810           if (place == last_place) {
4811             place = first_place;
4812           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4813             place = 0;
4814           } else {
4815             place++;
4816           }
4817           s_count = 0;
4818           gap_ct = 1;
4819           rem--;
4820         } else if (s_count == S) { // place is full; don't add extra thread
4821           if (place == last_place) {
4822             place = first_place;
4823           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4824             place = 0;
4825           } else {
4826             place++;
4827           }
4828           gap_ct++;
4829           s_count = 0;
4830         }
4831 
4832         KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4833                        "partition = [%d,%d]\n",
4834                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4835                        team->t.t_id, f, th->th.th_new_place,
4836                        th->th.th_first_place, th->th.th_last_place));
4837       }
4838       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4839     }
4840   } break;
4841 
4842   default:
4843     break;
4844   }
4845 
4846   KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4847 }
4848 
4849 #endif /* OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED */
4850 
4851 /* allocate a new team data structure to use.  take one off of the free pool if
4852    available */
4853 kmp_team_t *
4854 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4855 #if OMPT_SUPPORT
4856                     ompt_data_t ompt_parallel_data,
4857 #endif
4858 #if OMP_40_ENABLED
4859                     kmp_proc_bind_t new_proc_bind,
4860 #endif
4861                     kmp_internal_control_t *new_icvs,
4862                     int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4863   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4864   int f;
4865   kmp_team_t *team;
4866   int use_hot_team = !root->r.r_active;
4867   int level = 0;
4868 
4869   KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4870   KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4871   KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4872   KMP_MB();
4873 
4874 #if KMP_NESTED_HOT_TEAMS
4875   kmp_hot_team_ptr_t *hot_teams;
4876   if (master) {
4877     team = master->th.th_team;
4878     level = team->t.t_active_level;
4879     if (master->th.th_teams_microtask) { // in teams construct?
4880       if (master->th.th_teams_size.nteams > 1 &&
4881           ( // #teams > 1
4882               team->t.t_pkfn ==
4883                   (microtask_t)__kmp_teams_master || // inner fork of the teams
4884               master->th.th_teams_level <
4885                   team->t.t_level)) { // or nested parallel inside the teams
        ++level; // do not increment if #teams==1, or for the outer fork of
        // the teams; increment otherwise
4888       }
4889     }
4890     hot_teams = master->th.th_hot_teams;
4891     if (level < __kmp_hot_teams_max_level && hot_teams &&
4892         hot_teams[level]
4893             .hot_team) { // hot team has already been allocated for given level
4894       use_hot_team = 1;
4895     } else {
4896       use_hot_team = 0;
4897     }
4898   }
4899 #endif
4900   // Optimization to use a "hot" team
4901   if (use_hot_team && new_nproc > 1) {
4902     KMP_DEBUG_ASSERT(new_nproc == max_nproc);
4903 #if KMP_NESTED_HOT_TEAMS
4904     team = hot_teams[level].hot_team;
4905 #else
4906     team = root->r.r_hot_team;
4907 #endif
4908 #if KMP_DEBUG
4909     if (__kmp_tasking_mode != tskm_immediate_exec) {
4910       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
4911                     "task_team[1] = %p before reinit\n",
4912                     team->t.t_task_team[0], team->t.t_task_team[1]));
4913     }
4914 #endif
4915 
4916     // Has the number of threads changed?
4917     /* Let's assume the most common case is that the number of threads is
4918        unchanged, and put that case first. */
4919     if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
4920       KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
4921       // This case can mean that omp_set_num_threads() was called and the hot
4922       // team size was already reduced, so we check the special flag
4923       if (team->t.t_size_changed == -1) {
4924         team->t.t_size_changed = 1;
4925       } else {
4926         KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
4927       }
4928 
4929       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4930       kmp_r_sched_t new_sched = new_icvs->sched;
4931       if (team->t.t_sched.r_sched_type != new_sched.r_sched_type ||
4932           team->t.t_sched.chunk != new_sched.chunk)
4933         team->t.t_sched =
4934             new_sched; // set master's schedule as new run-time schedule
4935 
4936       __kmp_reinitialize_team(team, new_icvs,
4937                               root->r.r_uber_thread->th.th_ident);
4938 
4939       KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
4940                     team->t.t_threads[0], team));
4941       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
4942 
4943 #if OMP_40_ENABLED
4944 #if KMP_AFFINITY_SUPPORTED
4945       if ((team->t.t_size_changed == 0) &&
4946           (team->t.t_proc_bind == new_proc_bind)) {
4947         if (new_proc_bind == proc_bind_spread) {
4948           __kmp_partition_places(
4949               team, 1); // add flag to update only master for spread
4950         }
4951         KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
4952                        "proc_bind = %d, partition = [%d,%d]\n",
4953                        team->t.t_id, new_proc_bind, team->t.t_first_place,
4954                        team->t.t_last_place));
4955       } else {
4956         KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4957         __kmp_partition_places(team);
4958       }
4959 #else
4960       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4961 #endif /* KMP_AFFINITY_SUPPORTED */
4962 #endif /* OMP_40_ENABLED */
4963     } else if (team->t.t_nproc > new_nproc) {
4964       KA_TRACE(20,
4965                ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
4966                 new_nproc));
4967 
4968       team->t.t_size_changed = 1;
4969 #if KMP_NESTED_HOT_TEAMS
4970       if (__kmp_hot_teams_mode == 0) {
        // AC: the saved number of threads should match the team's value in
        // this mode; it can be bigger in mode 1, when the hot team has
        // threads in reserve
4973         KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
4974         hot_teams[level].hot_team_nth = new_nproc;
4975 #endif // KMP_NESTED_HOT_TEAMS
4976         /* release the extra threads we don't need any more */
4977         for (f = new_nproc; f < team->t.t_nproc; f++) {
4978           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
4979           if (__kmp_tasking_mode != tskm_immediate_exec) {
4980             // When decreasing team size, threads no longer in the team should
4981             // unref task team.
4982             team->t.t_threads[f]->th.th_task_team = NULL;
4983           }
4984           __kmp_free_thread(team->t.t_threads[f]);
4985           team->t.t_threads[f] = NULL;
4986         }
4987 #if KMP_NESTED_HOT_TEAMS
4988       } // (__kmp_hot_teams_mode == 0)
4989       else {
4990         // When keeping extra threads in team, switch threads to wait on own
4991         // b_go flag
4992         for (f = new_nproc; f < team->t.t_nproc; ++f) {
4993           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
4994           kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
4995           for (int b = 0; b < bs_last_barrier; ++b) {
4996             if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
4997               balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
4998             }
4999             KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5000           }
5001         }
5002       }
5003 #endif // KMP_NESTED_HOT_TEAMS
5004       team->t.t_nproc = new_nproc;
5005       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5006       if (team->t.t_sched.r_sched_type != new_icvs->sched.r_sched_type ||
5007           team->t.t_sched.chunk != new_icvs->sched.chunk)
5008         team->t.t_sched = new_icvs->sched;
5009       __kmp_reinitialize_team(team, new_icvs,
5010                               root->r.r_uber_thread->th.th_ident);
5011 
5012       /* update the remaining threads */
5013       for (f = 0; f < new_nproc; ++f) {
5014         team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5015       }
5016       // restore the current task state of the master thread: should be the
5017       // implicit task
5018       KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5019                     team->t.t_threads[0], team));
5020 
5021       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5022 
5023 #ifdef KMP_DEBUG
5024       for (f = 0; f < team->t.t_nproc; f++) {
5025         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5026                          team->t.t_threads[f]->th.th_team_nproc ==
5027                              team->t.t_nproc);
5028       }
5029 #endif
5030 
5031 #if OMP_40_ENABLED
5032       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5033 #if KMP_AFFINITY_SUPPORTED
5034       __kmp_partition_places(team);
5035 #endif
5036 #endif
5037     } else { // team->t.t_nproc < new_nproc
5038 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5039       kmp_affin_mask_t *old_mask;
5040       if (KMP_AFFINITY_CAPABLE()) {
5041         KMP_CPU_ALLOC(old_mask);
5042       }
5043 #endif
5044 
5045       KA_TRACE(20,
5046                ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5047                 new_nproc));
5048 
5049       team->t.t_size_changed = 1;
5050 
5051 #if KMP_NESTED_HOT_TEAMS
5052       int avail_threads = hot_teams[level].hot_team_nth;
5053       if (new_nproc < avail_threads)
5054         avail_threads = new_nproc;
5055       kmp_info_t **other_threads = team->t.t_threads;
5056       for (f = team->t.t_nproc; f < avail_threads; ++f) {
5057         // Adjust barrier data of reserved threads (if any) of the team
5058         // Other data will be set in __kmp_initialize_info() below.
5059         int b;
5060         kmp_balign_t *balign = other_threads[f]->th.th_bar;
5061         for (b = 0; b < bs_last_barrier; ++b) {
5062           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5063           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5064 #if USE_DEBUGGER
5065           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5066 #endif
5067         }
5068       }
5069       if (hot_teams[level].hot_team_nth >= new_nproc) {
        // we have all needed threads in reserve, no need to allocate any;
        // this is only possible in mode 1, as mode 0 cannot have reserved
        // threads
5072         KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5073         team->t.t_nproc = new_nproc; // just get reserved threads involved
5074       } else {
5075         // we may have some threads in reserve, but not enough
5076         team->t.t_nproc =
5077             hot_teams[level]
5078                 .hot_team_nth; // get reserved threads involved if any
5079         hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5080 #endif // KMP_NESTED_HOT_TEAMS
5081         if (team->t.t_max_nproc < new_nproc) {
5082           /* reallocate larger arrays */
5083           __kmp_reallocate_team_arrays(team, new_nproc);
5084           __kmp_reinitialize_team(team, new_icvs, NULL);
5085         }
5086 
5087 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
        /* Temporarily set the full mask for the master thread before creating
           the workers. The reason is that workers inherit the affinity from
           the master, so if a lot of workers are created quickly on a single
           core, they don't get a chance to set their own affinity for a long
           time. */
5092         __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5093 #endif
5094 
5095         /* allocate new threads for the hot team */
5096         for (f = team->t.t_nproc; f < new_nproc; f++) {
5097           kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5098           KMP_DEBUG_ASSERT(new_worker);
5099           team->t.t_threads[f] = new_worker;
5100 
5101           KA_TRACE(20,
                   ("__kmp_allocate_team: team %d init T#%d(%d:%d) arrived: "
5103                     "join=%llu, plain=%llu\n",
5104                     team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5105                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5106                     team->t.t_bar[bs_plain_barrier].b_arrived));
5107 
5108           { // Initialize barrier data for new threads.
5109             int b;
5110             kmp_balign_t *balign = new_worker->th.th_bar;
5111             for (b = 0; b < bs_last_barrier; ++b) {
5112               balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5113               KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5114                                KMP_BARRIER_PARENT_FLAG);
5115 #if USE_DEBUGGER
5116               balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5117 #endif
5118             }
5119           }
5120         }
5121 
5122 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5123         if (KMP_AFFINITY_CAPABLE()) {
5124           /* Restore initial master thread's affinity mask */
5125           __kmp_set_system_affinity(old_mask, TRUE);
5126           KMP_CPU_FREE(old_mask);
5127         }
5128 #endif
5129 #if KMP_NESTED_HOT_TEAMS
5130       } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5131 #endif // KMP_NESTED_HOT_TEAMS
      /* make sure everyone is synchronized */
5133       int old_nproc = team->t.t_nproc; // save old value and use to update only
5134       // new threads below
5135       __kmp_initialize_team(team, new_nproc, new_icvs,
5136                             root->r.r_uber_thread->th.th_ident);
5137 
5138       /* reinitialize the threads */
5139       KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5140       for (f = 0; f < team->t.t_nproc; ++f)
5141         __kmp_initialize_info(team->t.t_threads[f], team, f,
5142                               __kmp_gtid_from_tid(f, team));
5143       if (level) { // set th_task_state for new threads in nested hot team
5144         // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5145         // only need to set the th_task_state for the new threads. th_task_state
5146         // for master thread will not be accurate until after this in
5147         // __kmp_fork_call(), so we look to the master's memo_stack to get the
5148         // correct value.
5149         for (f = old_nproc; f < team->t.t_nproc; ++f)
5150           team->t.t_threads[f]->th.th_task_state =
5151               team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5152       } else { // set th_task_state for new threads in non-nested hot team
5153         int old_state =
5154             team->t.t_threads[0]->th.th_task_state; // copy master's state
5155         for (f = old_nproc; f < team->t.t_nproc; ++f)
5156           team->t.t_threads[f]->th.th_task_state = old_state;
5157       }
5158 
5159 #ifdef KMP_DEBUG
5160       for (f = 0; f < team->t.t_nproc; ++f) {
5161         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5162                          team->t.t_threads[f]->th.th_team_nproc ==
5163                              team->t.t_nproc);
5164       }
5165 #endif
5166 
5167 #if OMP_40_ENABLED
5168       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5169 #if KMP_AFFINITY_SUPPORTED
5170       __kmp_partition_places(team);
5171 #endif
5172 #endif
5173     } // Check changes in number of threads
5174 
5175 #if OMP_40_ENABLED
5176     kmp_info_t *master = team->t.t_threads[0];
5177     if (master->th.th_teams_microtask) {
5178       for (f = 1; f < new_nproc; ++f) {
5179         // propagate teams construct specific info to workers
5180         kmp_info_t *thr = team->t.t_threads[f];
5181         thr->th.th_teams_microtask = master->th.th_teams_microtask;
5182         thr->th.th_teams_level = master->th.th_teams_level;
5183         thr->th.th_teams_size = master->th.th_teams_size;
5184       }
5185     }
5186 #endif /* OMP_40_ENABLED */
5187 #if KMP_NESTED_HOT_TEAMS
5188     if (level) {
5189       // Sync barrier state for nested hot teams, not needed for outermost hot
5190       // team.
5191       for (f = 1; f < new_nproc; ++f) {
5192         kmp_info_t *thr = team->t.t_threads[f];
5193         int b;
5194         kmp_balign_t *balign = thr->th.th_bar;
5195         for (b = 0; b < bs_last_barrier; ++b) {
5196           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5197           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5198 #if USE_DEBUGGER
5199           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5200 #endif
5201         }
5202       }
5203     }
5204 #endif // KMP_NESTED_HOT_TEAMS
5205 
5206     /* reallocate space for arguments if necessary */
5207     __kmp_alloc_argv_entries(argc, team, TRUE);
5208     KMP_CHECK_UPDATE(team->t.t_argc, argc);
5209     // The hot team re-uses the previous task team,
5210     // if untouched during the previous release->gather phase.
5211 
5212     KF_TRACE(10, (" hot_team = %p\n", team));
5213 
5214 #if KMP_DEBUG
5215     if (__kmp_tasking_mode != tskm_immediate_exec) {
5216       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5217                     "task_team[1] = %p after reinit\n",
5218                     team->t.t_task_team[0], team->t.t_task_team[1]));
5219     }
5220 #endif
5221 
5222 #if OMPT_SUPPORT
5223     __ompt_team_assign_id(team, ompt_parallel_data);
5224 #endif
5225 
5226     KMP_MB();
5227 
5228     return team;
5229   }
5230 
5231   /* next, let's try to take one from the team pool */
5232   KMP_MB();
5233   for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5234     /* TODO: consider resizing undersized teams instead of reaping them, now
5235        that we have a resizing mechanism */
5236     if (team->t.t_max_nproc >= max_nproc) {
5237       /* take this team from the team pool */
5238       __kmp_team_pool = team->t.t_next_pool;
5239 
5240       /* setup the team for fresh use */
5241       __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5242 
5243       KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5244                     "task_team[1] %p to NULL\n",
5245                     &team->t.t_task_team[0], &team->t.t_task_team[1]));
5246       team->t.t_task_team[0] = NULL;
5247       team->t.t_task_team[1] = NULL;
5248 
5249       /* reallocate space for arguments if necessary */
5250       __kmp_alloc_argv_entries(argc, team, TRUE);
5251       KMP_CHECK_UPDATE(team->t.t_argc, argc);
5252 
5253       KA_TRACE(
5254           20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5255                team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5256       { // Initialize barrier data.
5257         int b;
5258         for (b = 0; b < bs_last_barrier; ++b) {
5259           team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5260 #if USE_DEBUGGER
5261           team->t.t_bar[b].b_master_arrived = 0;
5262           team->t.t_bar[b].b_team_arrived = 0;
5263 #endif
5264         }
5265       }
5266 
5267 #if OMP_40_ENABLED
5268       team->t.t_proc_bind = new_proc_bind;
5269 #endif
5270 
5271       KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5272                     team->t.t_id));
5273 
5274 #if OMPT_SUPPORT
5275       __ompt_team_assign_id(team, ompt_parallel_data);
5276 #endif
5277 
5278       KMP_MB();
5279 
5280       return team;
5281     }
5282 
5283     /* reap team if it is too small, then loop back and check the next one */
    // not sure if this is wise, but this will be redone during the hot-teams
    // rewrite.
5286     /* TODO: Use technique to find the right size hot-team, don't reap them */
5287     team = __kmp_reap_team(team);
5288     __kmp_team_pool = team;
5289   }
5290 
5291   /* nothing available in the pool, no matter, make a new team! */
5292   KMP_MB();
5293   team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5294 
5295   /* and set it up */
5296   team->t.t_max_nproc = max_nproc;
  /* NOTE well: for some reason, allocating one big buffer and dividing it up
     seems to really hurt performance on the P4, so let's not use this */
5299   __kmp_allocate_team_arrays(team, max_nproc);
5300 
5301   KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5302   __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5303 
5304   KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5305                 "%p to NULL\n",
5306                 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5307   team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5308   // memory, no need to duplicate
5309   team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5310   // memory, no need to duplicate
5311 
5312   if (__kmp_storage_map) {
5313     __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5314   }
5315 
5316   /* allocate space for arguments */
5317   __kmp_alloc_argv_entries(argc, team, FALSE);
5318   team->t.t_argc = argc;
5319 
5320   KA_TRACE(20,
5321            ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5322             team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5323   { // Initialize barrier data.
5324     int b;
5325     for (b = 0; b < bs_last_barrier; ++b) {
5326       team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5327 #if USE_DEBUGGER
5328       team->t.t_bar[b].b_master_arrived = 0;
5329       team->t.t_bar[b].b_team_arrived = 0;
5330 #endif
5331     }
5332   }
5333 
5334 #if OMP_40_ENABLED
5335   team->t.t_proc_bind = new_proc_bind;
5336 #endif
5337 
5338 #if OMPT_SUPPORT
5339   __ompt_team_assign_id(team, ompt_parallel_data);
5340   team->t.ompt_serialized_team_info = NULL;
5341 #endif
5342 
5343   KMP_MB();
5344 
5345   KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5346                 team->t.t_id));
5347 
5348   return team;
5349 }
5350 
5351 /* TODO implement hot-teams at all levels */
5352 /* TODO implement lazy thread release on demand (disband request) */
5353 
5354 /* free the team.  return it to the team pool.  release all the threads
5355  * associated with it */
5356 void __kmp_free_team(kmp_root_t *root,
5357                      kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5358   int f;
5359   KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5360                 team->t.t_id));
5361 
5362   /* verify state */
5363   KMP_DEBUG_ASSERT(root);
5364   KMP_DEBUG_ASSERT(team);
5365   KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5366   KMP_DEBUG_ASSERT(team->t.t_threads);
5367 
5368   int use_hot_team = team == root->r.r_hot_team;
5369 #if KMP_NESTED_HOT_TEAMS
5370   int level;
5371   kmp_hot_team_ptr_t *hot_teams;
5372   if (master) {
5373     level = team->t.t_active_level - 1;
5374     if (master->th.th_teams_microtask) { // in teams construct?
5375       if (master->th.th_teams_size.nteams > 1) {
5376         ++level; // level was not increased in teams construct for
5377         // team_of_masters
5378       }
5379       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5380           master->th.th_teams_level == team->t.t_level) {
5381         ++level; // level was not increased in teams construct for
5382         // team_of_workers before the parallel
5383       } // team->t.t_level will be increased inside parallel
5384     }
5385     hot_teams = master->th.th_hot_teams;
5386     if (level < __kmp_hot_teams_max_level) {
5387       KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5388       use_hot_team = 1;
5389     }
5390   }
5391 #endif // KMP_NESTED_HOT_TEAMS
5392 
5393   /* team is done working */
5394   TCW_SYNC_PTR(team->t.t_pkfn,
5395                NULL); // Important for Debugging Support Library.
5396   team->t.t_copyin_counter = 0; // init counter for possible reuse
5397   // Do not reset pointer to parent team to NULL for hot teams.
5398 
5399   /* if we are non-hot team, release our threads */
5400   if (!use_hot_team) {
5401     if (__kmp_tasking_mode != tskm_immediate_exec) {
5402       // Wait for threads to reach reapable state
5403       for (f = 1; f < team->t.t_nproc; ++f) {
5404         KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5405         kmp_info_t *th = team->t.t_threads[f];
5406         volatile kmp_uint32 *state = &th->th.th_reap_state;
5407         while (*state != KMP_SAFE_TO_REAP) {
5408 #if KMP_OS_WINDOWS
5409           // On Windows a thread can be killed at any time, check this
5410           DWORD ecode;
5411           if (!__kmp_is_thread_alive(th, &ecode)) {
5412             *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5413             break;
5414           }
5415 #endif
5416           // first check if thread is sleeping
5417           kmp_flag_64 fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5418           if (fl.is_sleeping())
5419             fl.resume(__kmp_gtid_from_thread(th));
5420           KMP_CPU_PAUSE();
5421         }
5422       }
5423 
5424       // Delete task teams
5425       int tt_idx;
5426       for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5427         kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5428         if (task_team != NULL) {
5429           for (f = 0; f < team->t.t_nproc;
5430                ++f) { // Have all threads unref task teams
5431             team->t.t_threads[f]->th.th_task_team = NULL;
5432           }
5433           KA_TRACE(
5434               20,
5435               ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5436                __kmp_get_gtid(), task_team, team->t.t_id));
5437 #if KMP_NESTED_HOT_TEAMS
5438           __kmp_free_task_team(master, task_team);
5439 #endif
5440           team->t.t_task_team[tt_idx] = NULL;
5441         }
5442       }
5443     }
5444 
5445     // Reset pointer to parent team only for non-hot teams.
5446     team->t.t_parent = NULL;
5447     team->t.t_level = 0;
5448     team->t.t_active_level = 0;
5449 
5450     /* free the worker threads */
5451     for (f = 1; f < team->t.t_nproc; ++f) {
5452       KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5453       __kmp_free_thread(team->t.t_threads[f]);
5454       team->t.t_threads[f] = NULL;
5455     }
5456 
5457     /* put the team back in the team pool */
5458     /* TODO limit size of team pool, call reap_team if pool too large */
5459     team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5460     __kmp_team_pool = (volatile kmp_team_t *)team;
5461   }
5462 
5463   KMP_MB();
5464 }
5465 
5466 /* reap the team.  destroy it, reclaim all its resources and free its memory */
5467 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5468   kmp_team_t *next_pool = team->t.t_next_pool;
5469 
5470   KMP_DEBUG_ASSERT(team);
5471   KMP_DEBUG_ASSERT(team->t.t_dispatch);
5472   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5473   KMP_DEBUG_ASSERT(team->t.t_threads);
5474   KMP_DEBUG_ASSERT(team->t.t_argv);
5475 
5476   /* TODO clean the threads that are a part of this? */
5477 
5478   /* free stuff */
5479   __kmp_free_team_arrays(team);
5480   if (team->t.t_argv != &team->t.t_inline_argv[0])
5481     __kmp_free((void *)team->t.t_argv);
5482   __kmp_free(team);
5483 
5484   KMP_MB();
5485   return next_pool;
5486 }
5487 
5488 // Free the thread.  Don't reap it, just place it on the pool of available
5489 // threads.
5490 //
5491 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5492 // binding for the affinity mechanism to be useful.
5493 //
5494 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
// However, we want to avoid the potential performance problem of always
5496 // scanning through the list to find the correct point at which to insert
5497 // the thread (potential N**2 behavior).  To do this we keep track of the
5498 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5499 // With single-level parallelism, threads will always be added to the tail
5500 // of the list, kept track of by __kmp_thread_pool_insert_pt.  With nested
5501 // parallelism, all bets are off and we may need to scan through the entire
5502 // free list.
5503 //
5504 // This change also has a potentially large performance benefit, for some
5505 // applications.  Previously, as threads were freed from the hot team, they
5506 // would be placed back on the free list in inverse order.  If the hot team
// grew back to its original size, then the freed threads would be placed
5508 // back on the hot team in reverse order.  This could cause bad cache
5509 // locality problems on programs where the size of the hot team regularly
5510 // grew and shrunk.
5511 //
// Now, for single-level parallelism, the OMP tid is always == gtid.
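//
// Illustrative example (hypothetical gtids): if the pool currently holds
// gtids 2 -> 3 -> 7 and we free gtid 5, the scan below starts at
// __kmp_thread_pool_insert_pt (or at the list head if that hint already
// points past gtid 5), links 5 in between 3 and 7, and leaves the insert
// point at 5 so that a subsequent free of gtid 6 starts scanning there.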
5513 void __kmp_free_thread(kmp_info_t *this_th) {
5514   int gtid;
5515   kmp_info_t **scan;
5516   kmp_root_t *root = this_th->th.th_root;
5517 
5518   KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5519                 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5520 
5521   KMP_DEBUG_ASSERT(this_th);
5522 
  // When moving a thread to the pool, switch it to wait on its own b_go flag,
  // and reset its barrier team pointers to NULL (uninitialized).
5525   int b;
5526   kmp_balign_t *balign = this_th->th.th_bar;
5527   for (b = 0; b < bs_last_barrier; ++b) {
5528     if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5529       balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5530     balign[b].bb.team = NULL;
5531     balign[b].bb.leaf_kids = 0;
5532   }
5533   this_th->th.th_task_state = 0;
5534 
5535   /* put thread back on the free pool */
5536   TCW_PTR(this_th->th.th_team, NULL);
5537   TCW_PTR(this_th->th.th_root, NULL);
5538   TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5539 
5540   // If the __kmp_thread_pool_insert_pt is already past the new insert
5541   // point, then we need to re-scan the entire list.
5542   gtid = this_th->th.th_info.ds.ds_gtid;
5543   if (__kmp_thread_pool_insert_pt != NULL) {
5544     KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5545     if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5546       __kmp_thread_pool_insert_pt = NULL;
5547     }
5548   }
5549 
5550   // Scan down the list to find the place to insert the thread.
5551   // scan is the address of a link in the list, possibly the address of
5552   // __kmp_thread_pool itself.
5553   //
  // In the absence of nested parallelism, the for loop will have 0 iterations.
5555   if (__kmp_thread_pool_insert_pt != NULL) {
5556     scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5557   } else {
5558     scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5559   }
5560   for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5561        scan = &((*scan)->th.th_next_pool))
5562     ;
5563 
5564   // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5565   // to its address.
5566   TCW_PTR(this_th->th.th_next_pool, *scan);
5567   __kmp_thread_pool_insert_pt = *scan = this_th;
5568   KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5569                    (this_th->th.th_info.ds.ds_gtid <
5570                     this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5571   TCW_4(this_th->th.th_in_pool, TRUE);
5572   __kmp_thread_pool_nth++;
5573 
5574   TCW_4(__kmp_nth, __kmp_nth - 1);
5575   root->r.r_cg_nthreads--;
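  // (the two decrements above remove this thread from the global count of
  //  in-use threads and from its root's contention-group count)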
5576 
5577 #ifdef KMP_ADJUST_BLOCKTIME
5578   /* Adjust blocktime back to user setting or default if necessary */
5579   /* Middle initialization might never have occurred                */
5580   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5581     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5582     if (__kmp_nth <= __kmp_avail_proc) {
5583       __kmp_zero_bt = FALSE;
5584     }
5585   }
5586 #endif /* KMP_ADJUST_BLOCKTIME */
5587 
5588   KMP_MB();
5589 }
5590 
5591 /* ------------------------------------------------------------------------ */
5592 
5593 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5594   int gtid = this_thr->th.th_info.ds.ds_gtid;
5595   /*    void                 *stack_data;*/
5596   kmp_team_t *(*volatile pteam);
5597 
5598   KMP_MB();
5599   KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5600 
5601   if (__kmp_env_consistency_check) {
5602     this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5603   }
5604 
5605 #if OMPT_SUPPORT
5606   ompt_data_t *thread_data;
5607   if (ompt_enabled.enabled) {
5608     thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5609     thread_data->ptr = NULL;
5610 
5611     this_thr->th.ompt_thread_info.state = omp_state_overhead;
5612     this_thr->th.ompt_thread_info.wait_id = 0;
5613     this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5614     if (ompt_enabled.ompt_callback_thread_begin) {
5615       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5616           ompt_thread_worker, thread_data);
5617     }
5618   }
5619 #endif
5620 
5621 #if OMPT_SUPPORT
5622   if (ompt_enabled.enabled) {
5623     this_thr->th.ompt_thread_info.state = omp_state_idle;
5624   }
5625 #endif
5626   /* This is the place where threads wait for work */
5627   while (!TCR_4(__kmp_global.g.g_done)) {
5628     KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5629     KMP_MB();
5630 
5631     /* wait for work to do */
5632     KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5633 
5634     /* No tid yet since not part of a team */
5635     __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5636 
5637 #if OMPT_SUPPORT
5638     if (ompt_enabled.enabled) {
5639       this_thr->th.ompt_thread_info.state = omp_state_overhead;
5640     }
5641 #endif
5642 
5643     pteam = (kmp_team_t * (*))(&this_thr->th.th_team);
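    // (pteam aliases th_team so that TCR_SYNC_PTR(*pteam) below re-reads the
    //  team pointer published by the master thread at fork time)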
5644 
5645     /* have we been allocated? */
5646     if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5647       /* we were just woken up, so run our new task */
5648       if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5649         int rc;
5650         KA_TRACE(20,
5651                  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5652                   gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5653                   (*pteam)->t.t_pkfn));
5654 
5655         updateHWFPControl(*pteam);
5656 
5657 #if OMPT_SUPPORT
5658         if (ompt_enabled.enabled) {
5659           this_thr->th.ompt_thread_info.state = omp_state_work_parallel;
5660         }
5661 #endif
5662 
5663         {
5664           KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
5665           KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
5666           rc = (*pteam)->t.t_invoke(gtid);
5667         }
5668         KMP_ASSERT(rc);
5669 
5670         KMP_MB();
5671         KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5672                       gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5673                       (*pteam)->t.t_pkfn));
5674       }
5675 #if OMPT_SUPPORT
5676       if (ompt_enabled.enabled) {
5677         /* no frame set while outside task */
5678         __ompt_get_task_info_object(0)->frame.exit_frame = NULL;
5679 
5680         this_thr->th.ompt_thread_info.state = omp_state_overhead;
5681         this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
5682       }
5683 #endif
5684       /* join barrier after parallel region */
5685       __kmp_join_barrier(gtid);
5686     }
5687   }
5688   TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5689 
5690 #if OMPT_SUPPORT
5691   if (ompt_enabled.ompt_callback_thread_end) {
5692     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5693   }
5694 #endif
5695 
5696   this_thr->th.th_task_team = NULL;
5697   /* run the destructors for the threadprivate data for this thread */
5698   __kmp_common_destroy_gtid(gtid);
5699 
5700   KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5701   KMP_MB();
5702   return this_thr;
5703 }
5704 
5705 /* ------------------------------------------------------------------------ */
5706 
5707 void __kmp_internal_end_dest(void *specific_gtid) {
5708 #if KMP_COMPILER_ICC
5709 #pragma warning(push)
5710 #pragma warning(disable : 810) // conversion from "void *" to "int" may lose
5711 // significant bits
5712 #endif
5713   // Make sure no significant bits are lost
5714   int gtid = (kmp_intptr_t)specific_gtid - 1;
5715 #if KMP_COMPILER_ICC
5716 #pragma warning(pop)
5717 #endif
5718 
5719   KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
  /* NOTE: the gtid is stored as gtid+1 in thread-local storage because 0 is
   * reserved for the nothing-stored case */
5722 
5723   /* josh: One reason for setting the gtid specific data even when it is being
5724      destroyed by pthread is to allow gtid lookup through thread specific data
5725      (__kmp_gtid_get_specific).  Some of the code, especially stat code,
5726      that gets executed in the call to __kmp_internal_end_thread, actually
5727      gets the gtid through the thread specific data.  Setting it here seems
5728      rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread
5729      to run smoothly.
5730      todo: get rid of this after we remove the dependence on
5731      __kmp_gtid_get_specific  */
5732   if (gtid >= 0 && KMP_UBER_GTID(gtid))
5733     __kmp_gtid_set_specific(gtid);
5734 #ifdef KMP_TDATA_GTID
5735   __kmp_gtid = gtid;
5736 #endif
5737   __kmp_internal_end_thread(gtid);
5738 }
5739 
5740 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5741 
// 2009-09-08 (lev): It looks like the destructor does not work. In simple
// test cases destructors work perfectly, but in a real libomp.so I have no
// evidence it is ever called. However, the -fini linker option in makefile.mk
// works fine.
5745 
5746 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5747   __kmp_internal_end_atexit();
5748 }
5749 
5750 void __kmp_internal_end_fini(void) { __kmp_internal_end_atexit(); }
5751 
5752 #endif
5753 
5754 /* [Windows] josh: when the atexit handler is called, there may still be more
5755    than one thread alive */
5756 void __kmp_internal_end_atexit(void) {
5757   KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5758   /* [Windows]
5759      josh: ideally, we want to completely shutdown the library in this atexit
5760      handler, but stat code that depends on thread specific data for gtid fails
5761      because that data becomes unavailable at some point during the shutdown, so
5762      we call __kmp_internal_end_thread instead. We should eventually remove the
5763      dependency on __kmp_get_specific_gtid in the stat code and use
5764      __kmp_internal_end_library to cleanly shutdown the library.
5765 
5766      // TODO: Can some of this comment about GVS be removed?
5767      I suspect that the offending stat code is executed when the calling thread
5768      tries to clean up a dead root thread's data structures, resulting in GVS
5769      code trying to close the GVS structures for that thread, but since the stat
5770      code uses __kmp_get_specific_gtid to get the gtid with the assumption that
     the calling thread is cleaning up itself instead of another thread, it
     gets confused. This happens because allowing a thread to unregister and
     clean up another thread is a recent modification for addressing an issue.
5774      Based on the current design (20050722), a thread may end up
5775      trying to unregister another thread only if thread death does not trigger
5776      the calling of __kmp_internal_end_thread.  For Linux* OS, there is the
5777      thread specific data destructor function to detect thread death. For
5778      Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
5779      is nothing.  Thus, the workaround is applicable only for Windows static
5780      stat library. */
5781   __kmp_internal_end_library(-1);
5782 #if KMP_OS_WINDOWS
5783   __kmp_close_console();
5784 #endif
5785 }
5786 
5787 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5788   // It is assumed __kmp_forkjoin_lock is acquired.
5789 
5790   int gtid;
5791 
5792   KMP_DEBUG_ASSERT(thread != NULL);
5793 
5794   gtid = thread->th.th_info.ds.ds_gtid;
5795 
5796   if (!is_root) {
5797 
5798     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5799       /* Assume the threads are at the fork barrier here */
5800       KA_TRACE(
5801           20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5802                gtid));
5803       /* Need release fence here to prevent seg faults for tree forkjoin barrier
5804        * (GEH) */
5805       ANNOTATE_HAPPENS_BEFORE(thread);
5806       kmp_flag_64 flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
5807       __kmp_release_64(&flag);
5808     }
5809 
5810     // Terminate OS thread.
5811     __kmp_reap_worker(thread);
5812 
5813     // The thread was killed asynchronously.  If it was actively
5814     // spinning in the thread pool, decrement the global count.
5815     //
5816     // There is a small timing hole here - if the worker thread was just waking
    // up after sleeping in the pool, had reset its th_active_in_pool flag but
5818     // not decremented the global counter __kmp_thread_pool_active_nth yet, then
5819     // the global counter might not get updated.
5820     //
5821     // Currently, this can only happen as the library is unloaded,
5822     // so there are no harmful side effects.
5823     if (thread->th.th_active_in_pool) {
5824       thread->th.th_active_in_pool = FALSE;
5825       KMP_TEST_THEN_DEC32(&__kmp_thread_pool_active_nth);
5826       KMP_DEBUG_ASSERT(TCR_4(__kmp_thread_pool_active_nth) >= 0);
5827     }
5828 
5829     // Decrement # of [worker] threads in the pool.
5830     KMP_DEBUG_ASSERT(__kmp_thread_pool_nth > 0);
5831     --__kmp_thread_pool_nth;
5832   }
5833 
5834   __kmp_free_implicit_task(thread);
5835 
5836 // Free the fast memory for tasking
5837 #if USE_FAST_MEMORY
5838   __kmp_free_fast_memory(thread);
5839 #endif /* USE_FAST_MEMORY */
5840 
5841   __kmp_suspend_uninitialize_thread(thread);
5842 
5843   KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
5844   TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5845 
5846   --__kmp_all_nth;
5847 // __kmp_nth was decremented when thread is added to the pool.
5848 
5849 #ifdef KMP_ADJUST_BLOCKTIME
5850   /* Adjust blocktime back to user setting or default if necessary */
5851   /* Middle initialization might never have occurred                */
5852   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5853     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5854     if (__kmp_nth <= __kmp_avail_proc) {
5855       __kmp_zero_bt = FALSE;
5856     }
5857   }
5858 #endif /* KMP_ADJUST_BLOCKTIME */
5859 
5860   /* free the memory being used */
5861   if (__kmp_env_consistency_check) {
5862     if (thread->th.th_cons) {
5863       __kmp_free_cons_stack(thread->th.th_cons);
5864       thread->th.th_cons = NULL;
5865     }
5866   }
5867 
5868   if (thread->th.th_pri_common != NULL) {
5869     __kmp_free(thread->th.th_pri_common);
5870     thread->th.th_pri_common = NULL;
5871   }
5872 
5873   if (thread->th.th_task_state_memo_stack != NULL) {
5874     __kmp_free(thread->th.th_task_state_memo_stack);
5875     thread->th.th_task_state_memo_stack = NULL;
5876   }
5877 
5878 #if KMP_USE_BGET
5879   if (thread->th.th_local.bget_data != NULL) {
5880     __kmp_finalize_bget(thread);
5881   }
5882 #endif
5883 
5884 #if KMP_AFFINITY_SUPPORTED
5885   if (thread->th.th_affin_mask != NULL) {
5886     KMP_CPU_FREE(thread->th.th_affin_mask);
5887     thread->th.th_affin_mask = NULL;
5888   }
5889 #endif /* KMP_AFFINITY_SUPPORTED */
5890 
5891   __kmp_reap_team(thread->th.th_serial_team);
5892   thread->th.th_serial_team = NULL;
5893   __kmp_free(thread);
5894 
5895   KMP_MB();
5896 
5897 } // __kmp_reap_thread
5898 
5899 static void __kmp_internal_end(void) {
5900   int i;
5901 
5902   /* First, unregister the library */
5903   __kmp_unregister_library();
5904 
5905 #if KMP_OS_WINDOWS
5906   /* In Win static library, we can't tell when a root actually dies, so we
5907      reclaim the data structures for any root threads that have died but not
5908      unregistered themselves, in order to shut down cleanly.
5909      In Win dynamic library we also can't tell when a thread dies.  */
5910   __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
5911 // dead roots
5912 #endif
5913 
5914   for (i = 0; i < __kmp_threads_capacity; i++)
5915     if (__kmp_root[i])
5916       if (__kmp_root[i]->r.r_active)
5917         break;
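  // At this point i indexes the first root that is still active, or equals
  // __kmp_threads_capacity if no active root was found.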
5918   KMP_MB(); /* Flush all pending memory write invalidates.  */
5919   TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
5920 
5921   if (i < __kmp_threads_capacity) {
5922 #if KMP_USE_MONITOR
5923     // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
5924     KMP_MB(); /* Flush all pending memory write invalidates.  */
5925 
5926     // Need to check that monitor was initialized before reaping it. If we are
    // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
5928     // __kmp_monitor will appear to contain valid data, but it is only valid in
5929     // the parent process, not the child.
5930     // New behavior (201008): instead of keying off of the flag
5931     // __kmp_init_parallel, the monitor thread creation is keyed off
5932     // of the new flag __kmp_init_monitor.
5933     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
5934     if (TCR_4(__kmp_init_monitor)) {
5935       __kmp_reap_monitor(&__kmp_monitor);
5936       TCW_4(__kmp_init_monitor, 0);
5937     }
5938     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
5939     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
5940 #endif // KMP_USE_MONITOR
5941   } else {
5942 /* TODO move this to cleanup code */
5943 #ifdef KMP_DEBUG
5944     /* make sure that everything has properly ended */
5945     for (i = 0; i < __kmp_threads_capacity; i++) {
5946       if (__kmp_root[i]) {
5947         //                    KMP_ASSERT( ! KMP_UBER_GTID( i ) );         // AC:
5948         //                    there can be uber threads alive here
5949         KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
5950       }
5951     }
5952 #endif
5953 
5954     KMP_MB();
5955 
5956     // Reap the worker threads.
5957     // This is valid for now, but be careful if threads are reaped sooner.
    while (__kmp_thread_pool != NULL) { // Loop through all threads in the pool.
5959       // Get the next thread from the pool.
5960       kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
5961       __kmp_thread_pool = thread->th.th_next_pool;
5962       // Reap it.
5963       KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
5964       thread->th.th_next_pool = NULL;
5965       thread->th.th_in_pool = FALSE;
5966       __kmp_reap_thread(thread, 0);
5967     }
5968     __kmp_thread_pool_insert_pt = NULL;
5969 
5970     // Reap teams.
    while (__kmp_team_pool != NULL) { // Loop through all teams in the pool.
5972       // Get the next team from the pool.
5973       kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
5974       __kmp_team_pool = team->t.t_next_pool;
5975       // Reap it.
5976       team->t.t_next_pool = NULL;
5977       __kmp_reap_team(team);
5978     }
5979 
5980     __kmp_reap_task_teams();
5981 
5982     for (i = 0; i < __kmp_threads_capacity; ++i) {
5983       // TBD: Add some checking...
5984       // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
5985     }
5986 
5987     /* Make sure all threadprivate destructors get run by joining with all
5988        worker threads before resetting this flag */
5989     TCW_SYNC_4(__kmp_init_common, FALSE);
5990 
5991     KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
5992     KMP_MB();
5993 
5994 #if KMP_USE_MONITOR
5995     // See note above: One of the possible fixes for CQ138434 / CQ140126
5996     //
5997     // FIXME: push both code fragments down and CSE them?
5998     // push them into __kmp_cleanup() ?
5999     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6000     if (TCR_4(__kmp_init_monitor)) {
6001       __kmp_reap_monitor(&__kmp_monitor);
6002       TCW_4(__kmp_init_monitor, 0);
6003     }
6004     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6005     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6006 #endif
6007   } /* else !__kmp_global.t_active */
6008   TCW_4(__kmp_init_gtid, FALSE);
6009   KMP_MB(); /* Flush all pending memory write invalidates.  */
6010 
6011   __kmp_cleanup();
6012 #if OMPT_SUPPORT
6013   ompt_fini();
6014 #endif
6015 }
6016 
6017 void __kmp_internal_end_library(int gtid_req) {
6018   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6019   /* this shouldn't be a race condition because __kmp_internal_end() is the
6020      only place to clear __kmp_serial_init */
6021   /* we'll check this later too, after we get the lock */
6022   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
  // redundant, because the next check will work in any case.
6024   if (__kmp_global.g.g_abort) {
6025     KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6026     /* TODO abort? */
6027     return;
6028   }
6029   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6030     KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6031     return;
6032   }
6033 
6034   KMP_MB(); /* Flush all pending memory write invalidates.  */
6035 
6036   /* find out who we are and what we should do */
6037   {
6038     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6039     KA_TRACE(
6040         10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req));
6041     if (gtid == KMP_GTID_SHUTDOWN) {
6042       KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6043                     "already shutdown\n"));
6044       return;
6045     } else if (gtid == KMP_GTID_MONITOR) {
6046       KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6047                     "registered, or system shutdown\n"));
6048       return;
6049     } else if (gtid == KMP_GTID_DNE) {
6050       KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6051                     "shutdown\n"));
6052       /* we don't know who we are, but we may still shutdown the library */
6053     } else if (KMP_UBER_GTID(gtid)) {
6054       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6055       if (__kmp_root[gtid]->r.r_active) {
6056         __kmp_global.g.g_abort = -1;
6057         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6058         KA_TRACE(10,
6059                  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6060                   gtid));
6061         return;
6062       } else {
6063         KA_TRACE(
6064             10,
6065             ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6066         __kmp_unregister_root_current_thread(gtid);
6067       }
6068     } else {
6069 /* worker threads may call this function through the atexit handler, if they
6070  * call exit() */
6071 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6072    TODO: do a thorough shutdown instead */
6073 #ifdef DUMP_DEBUG_ON_EXIT
6074       if (__kmp_debug_buf)
6075         __kmp_dump_debug_buffer();
6076 #endif
6077       return;
6078     }
6079   }
6080   /* synchronize the termination process */
6081   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6082 
6083   /* have we already finished */
6084   if (__kmp_global.g.g_abort) {
6085     KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6086     /* TODO abort? */
6087     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6088     return;
6089   }
6090   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6091     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6092     return;
6093   }
6094 
6095   /* We need this lock to enforce mutex between this reading of
6096      __kmp_threads_capacity and the writing by __kmp_register_root.
6097      Alternatively, we can use a counter of roots that is atomically updated by
6098      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6099      __kmp_internal_end_*.  */
6100   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6101 
6102   /* now we can safely conduct the actual termination */
6103   __kmp_internal_end();
6104 
6105   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6106   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6107 
6108   KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6109 
6110 #ifdef DUMP_DEBUG_ON_EXIT
6111   if (__kmp_debug_buf)
6112     __kmp_dump_debug_buffer();
6113 #endif
6114 
6115 #if KMP_OS_WINDOWS
6116   __kmp_close_console();
6117 #endif
6118 
6119   __kmp_fini_allocator();
6120 
6121 } // __kmp_internal_end_library
6122 
6123 void __kmp_internal_end_thread(int gtid_req) {
6124   int i;
6125 
6126   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6127   /* this shouldn't be a race condition because __kmp_internal_end() is the
6128    * only place to clear __kmp_serial_init */
6129   /* we'll check this later too, after we get the lock */
6130   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6131   // redundant, because the next check will work in any case.
6132   if (__kmp_global.g.g_abort) {
6133     KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6134     /* TODO abort? */
6135     return;
6136   }
6137   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6138     KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6139     return;
6140   }
6141 
6142   KMP_MB(); /* Flush all pending memory write invalidates.  */
6143 
6144   /* find out who we are and what we should do */
6145   {
6146     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6147     KA_TRACE(10,
6148              ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req));
6149     if (gtid == KMP_GTID_SHUTDOWN) {
6150       KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6151                     "already shutdown\n"));
6152       return;
6153     } else if (gtid == KMP_GTID_MONITOR) {
6154       KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6155                     "registered, or system shutdown\n"));
6156       return;
6157     } else if (gtid == KMP_GTID_DNE) {
6158       KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6159                     "shutdown\n"));
6160       return;
6161       /* we don't know who we are */
6162     } else if (KMP_UBER_GTID(gtid)) {
6163       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6164       if (__kmp_root[gtid]->r.r_active) {
6165         __kmp_global.g.g_abort = -1;
6166         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6167         KA_TRACE(10,
6168                  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6169                   gtid));
6170         return;
6171       } else {
6172         KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6173                       gtid));
6174         __kmp_unregister_root_current_thread(gtid);
6175       }
6176     } else {
6177       /* just a worker thread, let's leave */
6178       KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6179 
6180       if (gtid >= 0) {
6181         __kmp_threads[gtid]->th.th_task_team = NULL;
6182       }
6183 
6184       KA_TRACE(10,
6185                ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6186                 gtid));
6187       return;
6188     }
6189   }
6190 #if defined KMP_DYNAMIC_LIB
  // AC: let's not shut down the Linux* OS dynamic library at the exit of an
  // uber thread, because it is better to shut down later, in the library
  // destructor. The reason for this change is a performance problem when a
  // non-OpenMP thread in a loop forks and joins many OpenMP threads. We can
  // save a lot of time by keeping worker threads alive until program shutdown.
  // OM: Removed the Linux* OS restriction to fix the crash on OS X*
  // (DPD200239966) and Windows (DPD200287443) that occurs when using critical
  // sections from foreign threads.
6199   KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6200   return;
6201 #endif
6202   /* synchronize the termination process */
6203   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6204 
6205   /* have we already finished */
6206   if (__kmp_global.g.g_abort) {
6207     KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6208     /* TODO abort? */
6209     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6210     return;
6211   }
6212   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6213     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6214     return;
6215   }
6216 
6217   /* We need this lock to enforce mutex between this reading of
6218      __kmp_threads_capacity and the writing by __kmp_register_root.
6219      Alternatively, we can use a counter of roots that is atomically updated by
6220      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6221      __kmp_internal_end_*.  */
6222 
6223   /* should we finish the run-time?  are all siblings done? */
6224   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6225 
6226   for (i = 0; i < __kmp_threads_capacity; ++i) {
6227     if (KMP_UBER_GTID(i)) {
6228       KA_TRACE(
6229           10,
6230           ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6231       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6232       __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6233       return;
6234     }
6235   }
6236 
6237   /* now we can safely conduct the actual termination */
6238 
6239   __kmp_internal_end();
6240 
6241   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6242   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6243 
6244   KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6245 
6246 #ifdef DUMP_DEBUG_ON_EXIT
6247   if (__kmp_debug_buf)
6248     __kmp_dump_debug_buffer();
6249 #endif
6250 } // __kmp_internal_end_thread
6251 
6252 // -----------------------------------------------------------------------------
6253 // Library registration stuff.
6254 
6255 static long __kmp_registration_flag = 0;
6256 // Random value used to indicate library initialization.
6257 static char *__kmp_registration_str = NULL;
6258 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6259 
6260 static inline char *__kmp_reg_status_name() {
  /* On RHEL 3u5, if linked statically, getpid() returns different values in
     each thread. If registration and unregistration happen in different
     threads (omp_misc_other_root_exit.cpp test case), the registered_lib_env
     env var cannot be found, because its name will contain a different pid. */
6265   return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
} // __kmp_reg_status_name
6267 
6268 void __kmp_register_library_startup(void) {
6269 
6270   char *name = __kmp_reg_status_name(); // Name of the environment variable.
6271   int done = 0;
6272   union {
6273     double dtime;
6274     long ltime;
6275   } time;
6276 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6277   __kmp_initialize_system_tick();
6278 #endif
6279   __kmp_read_system_time(&time.dtime);
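  // Mix a recognizable constant with the low bits of the current time so the
  // flag value is effectively unique to this library instance.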
6280   __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6281   __kmp_registration_str =
6282       __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6283                        __kmp_registration_flag, KMP_LIBRARY_FILE);
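  // The value has the form "<flag address>-<flag value>-<library file>", e.g.
  // (hypothetical) "0x7fabc0123450-cafe1234-libomp.so".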
6284 
6285   KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6286                 __kmp_registration_str));
6287 
6288   while (!done) {
6289 
6290     char *value = NULL; // Actual value of the environment variable.
6291 
    // Set the environment variable, but do not overwrite an existing value.
6293     __kmp_env_set(name, __kmp_registration_str, 0);
    // Check that the variable was actually written.
6295     value = __kmp_env_get(name);
6296     if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6297 
6298       done = 1; // Ok, environment variable set successfully, exit the loop.
6299 
6300     } else {
6301 
      // Oops. The write failed. Another copy of the OpenMP RTL is in memory.
      // Check whether it is alive or dead.
6304       int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6305       char *tail = value;
6306       char *flag_addr_str = NULL;
6307       char *flag_val_str = NULL;
6308       char const *file_name = NULL;
6309       __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6310       __kmp_str_split(tail, '-', &flag_val_str, &tail);
6311       file_name = tail;
6312       if (tail != NULL) {
6313         long *flag_addr = 0;
6314         long flag_val = 0;
6315         KMP_SSCANF(flag_addr_str, "%p", &flag_addr);
6316         KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6317         if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6318           // First, check whether environment-encoded address is mapped into
6319           // addr space.
6320           // If so, dereference it to see if it still has the right value.
6321           if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6322             neighbor = 1;
6323           } else {
6324             // If not, then we know the other copy of the library is no longer
6325             // running.
6326             neighbor = 2;
6327           }
6328         }
6329       }
6330       switch (neighbor) {
6331       case 0: // Cannot parse environment variable -- neighbor status unknown.
        // Assume it is the incompatible format of a future version of the
6333         // library. Assume the other library is alive.
6334         // WARN( ... ); // TODO: Issue a warning.
6335         file_name = "unknown library";
      // Attention! Falling through to the next case is intentional.
6337       case 1: { // Neighbor is alive.
        // Check whether this is allowed.
6339         char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6340         if (!__kmp_str_match_true(duplicate_ok)) {
6341           // That's not allowed. Issue fatal error.
6342           __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6343                       KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6344         }
6345         KMP_INTERNAL_FREE(duplicate_ok);
6346         __kmp_duplicate_library_ok = 1;
6347         done = 1; // Exit the loop.
6348       } break;
6349       case 2: { // Neighbor is dead.
        // Clear the variable and try to register the library again.
6351         __kmp_env_unset(name);
6352       } break;
6353       default: { KMP_DEBUG_ASSERT(0); } break;
6354       }
6355     }
6356     KMP_INTERNAL_FREE((void *)value);
6357   }
6358   KMP_INTERNAL_FREE((void *)name);
6359 
6360 } // func __kmp_register_library_startup
6361 
6362 void __kmp_unregister_library(void) {
6363 
6364   char *name = __kmp_reg_status_name();
6365   char *value = __kmp_env_get(name);
6366 
6367   KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6368   KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6369   if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6370     // Ok, this is our variable. Delete it.
6371     __kmp_env_unset(name);
6372   }
6373 
6374   KMP_INTERNAL_FREE(__kmp_registration_str);
6375   KMP_INTERNAL_FREE(value);
6376   KMP_INTERNAL_FREE(name);
6377 
6378   __kmp_registration_flag = 0;
6379   __kmp_registration_str = NULL;
6380 
6381 } // __kmp_unregister_library
6382 
6383 // End of Library registration stuff.
6384 // -----------------------------------------------------------------------------
6385 
6386 #if KMP_MIC_SUPPORTED
6387 
6388 static void __kmp_check_mic_type() {
6389   kmp_cpuid_t cpuid_state = {0};
6390   kmp_cpuid_t *cs_p = &cpuid_state;
6391   __kmp_x86_cpuid(1, 0, cs_p);
6392   // We don't support mic1 at the moment
6393   if ((cs_p->eax & 0xff0) == 0xB10) {
6394     __kmp_mic_type = mic2;
6395   } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6396     __kmp_mic_type = mic3;
6397   } else {
6398     __kmp_mic_type = non_mic;
6399   }
6400 }
6401 
6402 #endif /* KMP_MIC_SUPPORTED */
6403 
6404 static void __kmp_do_serial_initialize(void) {
6405   int i, gtid;
6406   int size;
6407 
6408   KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6409 
6410   KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6411   KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6412   KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6413   KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6414   KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6415 
6416 #if OMPT_SUPPORT
6417   ompt_pre_init();
6418 #endif
6419 
6420   __kmp_validate_locks();
6421 
6422   /* Initialize internal memory allocator */
6423   __kmp_init_allocator();
6424 
6425   /* Register the library startup via an environment variable and check to see
6426      whether another copy of the library is already registered. */
6427 
6428   __kmp_register_library_startup();
6429 
6430   /* TODO reinitialization of library */
6431   if (TCR_4(__kmp_global.g.g_done)) {
6432     KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6433   }
6434 
6435   __kmp_global.g.g_abort = 0;
6436   TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6437 
6438 /* initialize the locks */
6439 #if KMP_USE_ADAPTIVE_LOCKS
6440 #if KMP_DEBUG_ADAPTIVE_LOCKS
6441   __kmp_init_speculative_stats();
6442 #endif
6443 #endif
6444 #if KMP_STATS_ENABLED
6445   __kmp_stats_init();
6446 #endif
6447   __kmp_init_lock(&__kmp_global_lock);
6448   __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6449   __kmp_init_lock(&__kmp_debug_lock);
6450   __kmp_init_atomic_lock(&__kmp_atomic_lock);
6451   __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6452   __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6453   __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6454   __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6455   __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6456   __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6457   __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6458   __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6459   __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6460   __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6461   __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6462   __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6463   __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6464   __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6465 #if KMP_USE_MONITOR
6466   __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6467 #endif
6468   __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6469 
6470   /* conduct initialization and initial setup of configuration */
6471 
6472   __kmp_runtime_initialize();
6473 
6474 #if KMP_MIC_SUPPORTED
6475   __kmp_check_mic_type();
6476 #endif
6477 
6478 // Some global variable initialization moved here from kmp_env_initialize()
6479 #ifdef KMP_DEBUG
6480   kmp_diag = 0;
6481 #endif
6482   __kmp_abort_delay = 0;
6483 
6484   // From __kmp_init_dflt_team_nth()
6485   /* assume the entire machine will be used */
6486   __kmp_dflt_team_nth_ub = __kmp_xproc;
6487   if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6488     __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6489   }
6490   if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6491     __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6492   }
6493   __kmp_max_nth = __kmp_sys_max_nth;
6494   __kmp_cg_max_nth = __kmp_sys_max_nth;
6495   __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6496   if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6497     __kmp_teams_max_nth = __kmp_sys_max_nth;
6498   }
6499 
6500   // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6501   // part
6502   __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6503 #if KMP_USE_MONITOR
6504   __kmp_monitor_wakeups =
6505       KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6506   __kmp_bt_intervals =
6507       KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6508 #endif
6509   // From "KMP_LIBRARY" part of __kmp_env_initialize()
6510   __kmp_library = library_throughput;
6511   // From KMP_SCHEDULE initialization
6512   __kmp_static = kmp_sch_static_balanced;
// AC: do not use analytical here, because it is non-monotonic
6514 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6515 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6516 // need to repeat assignment
6517 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6518 // bit control and barrier method control parts
6519 #if KMP_FAST_REDUCTION_BARRIER
6520 #define kmp_reduction_barrier_gather_bb ((int)1)
6521 #define kmp_reduction_barrier_release_bb ((int)1)
6522 #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6523 #define kmp_reduction_barrier_release_pat bp_hyper_bar
6524 #endif // KMP_FAST_REDUCTION_BARRIER
6525   for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6526     __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6527     __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6528     __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6529     __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6530 #if KMP_FAST_REDUCTION_BARRIER
6531     if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
6532       // lin_64 ): hyper,1
6533       __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6534       __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6535       __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6536       __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6537     }
6538 #endif // KMP_FAST_REDUCTION_BARRIER
6539   }
6540 #if KMP_FAST_REDUCTION_BARRIER
6541 #undef kmp_reduction_barrier_release_pat
6542 #undef kmp_reduction_barrier_gather_pat
6543 #undef kmp_reduction_barrier_release_bb
6544 #undef kmp_reduction_barrier_gather_bb
6545 #endif // KMP_FAST_REDUCTION_BARRIER
6546 #if KMP_MIC_SUPPORTED
6547   if (__kmp_mic_type == mic2) { // KNC
6548     // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6549     __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6550     __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6551         1; // forkjoin release
6552     __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6553     __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6554   }
6555 #if KMP_FAST_REDUCTION_BARRIER
6556   if (__kmp_mic_type == mic2) { // KNC
6557     __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6558     __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6559   }
6560 #endif // KMP_FAST_REDUCTION_BARRIER
6561 #endif // KMP_MIC_SUPPORTED
6562 
6563 // From KMP_CHECKS initialization
6564 #ifdef KMP_DEBUG
6565   __kmp_env_checks = TRUE; /* development versions have the extra checks */
6566 #else
6567   __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6568 #endif
6569 
6570   // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6571   __kmp_foreign_tp = TRUE;
6572 
6573   __kmp_global.g.g_dynamic = FALSE;
6574   __kmp_global.g.g_dynamic_mode = dynamic_default;
6575 
6576   __kmp_env_initialize(NULL);
6577 
6578 // Print all messages in message catalog for testing purposes.
6579 #ifdef KMP_DEBUG
6580   char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6581   if (__kmp_str_match_true(val)) {
6582     kmp_str_buf_t buffer;
6583     __kmp_str_buf_init(&buffer);
6584     __kmp_i18n_dump_catalog(&buffer);
6585     __kmp_printf("%s", buffer.str);
6586     __kmp_str_buf_free(&buffer);
6587   }
6588   __kmp_env_free(&val);
6589 #endif
6590 
6591   __kmp_threads_capacity =
6592       __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6593   // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6594   __kmp_tp_capacity = __kmp_default_tp_capacity(
6595       __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6596 
6597   // If the library is shut down properly, both pools must be NULL. Just in
6598   // case, set them to NULL -- some memory may leak, but subsequent code will
6599   // work even if pools are not freed.
6600   KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6601   KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6602   KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6603   __kmp_thread_pool = NULL;
6604   __kmp_thread_pool_insert_pt = NULL;
6605   __kmp_team_pool = NULL;
6606 
6607   /* Allocate all of the variable sized records */
6608   /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6609    * expandable */
6610   /* Since allocation is cache-aligned, just add extra padding at the end */
6611   size =
6612       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6613       CACHE_LINE;
6614   __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6615   __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6616                                sizeof(kmp_info_t *) * __kmp_threads_capacity);
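  // __kmp_threads and __kmp_root share a single cache-aligned allocation;
  // __kmp_root begins immediately after the last __kmp_threads slot.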
6617 
6618   /* init thread counts */
6619   KMP_DEBUG_ASSERT(__kmp_all_nth ==
6620                    0); // Asserts fail if the library is reinitializing and
6621   KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
6622   __kmp_all_nth = 0;
6623   __kmp_nth = 0;
6624 
6625   /* setup the uber master thread and hierarchy */
6626   gtid = __kmp_register_root(TRUE);
6627   KA_TRACE(10, ("__kmp_do_serial_initialize  T#%d\n", gtid));
6628   KMP_ASSERT(KMP_UBER_GTID(gtid));
6629   KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6630 
6631   KMP_MB(); /* Flush all pending memory write invalidates.  */
6632 
6633   __kmp_common_initialize();
6634 
6635 #if KMP_OS_UNIX
6636   /* invoke the child fork handler */
6637   __kmp_register_atfork();
6638 #endif
6639 
6640 #if !defined KMP_DYNAMIC_LIB
6641   {
6642     /* Invoke the exit handler when the program finishes, only for static
6643        library. For dynamic library, we already have _fini and DllMain. */
6644     int rc = atexit(__kmp_internal_end_atexit);
6645     if (rc != 0) {
6646       __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6647                   __kmp_msg_null);
6648     }
6649   }
6650 #endif
6651 
6652 #if KMP_HANDLE_SIGNALS
6653 #if KMP_OS_UNIX
6654   /* NOTE: make sure that this is called before the user installs their own
6655      signal handlers so that the user handlers are called first. this way they
6656      can return false, not call our handler, avoid terminating the library, and
6657      continue execution where they left off. */
6658   __kmp_install_signals(FALSE);
6659 #endif /* KMP_OS_UNIX */
6660 #if KMP_OS_WINDOWS
6661   __kmp_install_signals(TRUE);
6662 #endif /* KMP_OS_WINDOWS */
6663 #endif
6664 
6665   /* we have finished the serial initialization */
6666   __kmp_init_counter++;
6667 
6668   __kmp_init_serial = TRUE;
6669 
6670   if (__kmp_settings) {
6671     __kmp_env_print();
6672   }
6673 
6674 #if OMP_40_ENABLED
6675   if (__kmp_display_env || __kmp_display_env_verbose) {
6676     __kmp_env_print_2();
6677   }
6678 #endif // OMP_40_ENABLED
6679 
6680 #if OMPT_SUPPORT
6681   ompt_post_init();
6682 #endif
6683 
6684   KMP_MB();
6685 
6686   KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
6687 }
6688 
6689 void __kmp_serial_initialize(void) {
6690   if (__kmp_init_serial) {
6691     return;
6692   }
6693   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
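  // Double-checked initialization: re-test __kmp_init_serial under the
  // bootstrap lock in case another thread completed initialization while we
  // were waiting.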
6694   if (__kmp_init_serial) {
6695     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6696     return;
6697   }
6698   __kmp_do_serial_initialize();
6699   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6700 }
6701 
6702 static void __kmp_do_middle_initialize(void) {
6703   int i, j;
6704   int prev_dflt_team_nth;
6705 
6706   if (!__kmp_init_serial) {
6707     __kmp_do_serial_initialize();
6708   }
6709 
6710   KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
6711 
6712   // Save the previous value for the __kmp_dflt_team_nth so that
6713   // we can avoid some reinitialization if it hasn't changed.
6714   prev_dflt_team_nth = __kmp_dflt_team_nth;
6715 
6716 #if KMP_AFFINITY_SUPPORTED
6717   // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6718   // number of cores on the machine.
6719   __kmp_affinity_initialize();
6720 
6721   // Run through the __kmp_threads array and set the affinity mask
6722   // for each root thread that is currently registered with the RTL.
6723   for (i = 0; i < __kmp_threads_capacity; i++) {
6724     if (TCR_PTR(__kmp_threads[i]) != NULL) {
6725       __kmp_affinity_set_init_mask(i, TRUE);
6726     }
6727   }
6728 #endif /* KMP_AFFINITY_SUPPORTED */
6729 
6730   KMP_ASSERT(__kmp_xproc > 0);
6731   if (__kmp_avail_proc == 0) {
6732     __kmp_avail_proc = __kmp_xproc;
6733   }
6734 
6735   // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
6736   // correct them now
6737   j = 0;
6738   while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
6739     __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
6740         __kmp_avail_proc;
6741     j++;
6742   }
6743 
6744   if (__kmp_dflt_team_nth == 0) {
6745 #ifdef KMP_DFLT_NTH_CORES
6746     // Default #threads = #cores
6747     __kmp_dflt_team_nth = __kmp_ncores;
6748     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6749                   "__kmp_ncores (%d)\n",
6750                   __kmp_dflt_team_nth));
6751 #else
6752     // Default #threads = #available OS procs
6753     __kmp_dflt_team_nth = __kmp_avail_proc;
6754     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6755                   "__kmp_avail_proc(%d)\n",
6756                   __kmp_dflt_team_nth));
6757 #endif /* KMP_DFLT_NTH_CORES */
6758   }
6759 
6760   if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
6761     __kmp_dflt_team_nth = KMP_MIN_NTH;
6762   }
6763   if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
6764     __kmp_dflt_team_nth = __kmp_sys_max_nth;
6765   }
6766 
6767   // There's no harm in continuing if the following check fails,
6768   // but it indicates an error in the previous logic.
6769   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
6770 
6771   if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
6772     // Run through the __kmp_threads array and set the num threads icv for each
6773     // root thread that is currently registered with the RTL (which has not
6774     // already explicitly set its nthreads-var with a call to
6775     // omp_set_num_threads()).
6776     for (i = 0; i < __kmp_threads_capacity; i++) {
6777       kmp_info_t *thread = __kmp_threads[i];
6778       if (thread == NULL)
6779         continue;
6780       if (thread->th.th_current_task->td_icvs.nproc != 0)
6781         continue;
6782 
6783       set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
6784     }
6785   }
6786   KA_TRACE(
6787       20,
6788       ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
6789        __kmp_dflt_team_nth));
6790 
6791 #ifdef KMP_ADJUST_BLOCKTIME
6792   /* Adjust blocktime to zero if necessary  now that __kmp_avail_proc is set */
6793   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6794     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6795     if (__kmp_nth > __kmp_avail_proc) {
6796       __kmp_zero_bt = TRUE;
6797     }
6798   }
6799 #endif /* KMP_ADJUST_BLOCKTIME */
6800 
6801   /* we have finished middle initialization */
6802   TCW_SYNC_4(__kmp_init_middle, TRUE);
6803 
6804   KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
6805 }
6806 
6807 void __kmp_middle_initialize(void) {
6808   if (__kmp_init_middle) {
6809     return;
6810   }
6811   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6812   if (__kmp_init_middle) {
6813     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6814     return;
6815   }
6816   __kmp_do_middle_initialize();
6817   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6818 }
6819 
6820 void __kmp_parallel_initialize(void) {
6821   int gtid = __kmp_entry_gtid(); // this might be a new root
6822 
6823   /* synchronize parallel initialization (for sibling) */
6824   if (TCR_4(__kmp_init_parallel))
6825     return;
6826   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6827   if (TCR_4(__kmp_init_parallel)) {
6828     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6829     return;
6830   }
6831 
6832   /* TODO reinitialization after we have already shut down */
6833   if (TCR_4(__kmp_global.g.g_done)) {
6834     KA_TRACE(
6835         10,
6836         ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
6837     __kmp_infinite_loop();
6838   }
6839 
6840   /* jc: The lock __kmp_initz_lock is already held, so calling
6841      __kmp_serial_initialize would cause a deadlock.  So we call
6842      __kmp_do_serial_initialize directly. */
6843   if (!__kmp_init_middle) {
6844     __kmp_do_middle_initialize();
6845   }
6846 
6847   /* begin initialization */
6848   KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
6849   KMP_ASSERT(KMP_UBER_GTID(gtid));
6850 
6851 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6852   // Save the FP control regs.
6853   // Worker threads will set theirs to these values at thread startup.
6854   __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
6855   __kmp_store_mxcsr(&__kmp_init_mxcsr);
6856   __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
6857 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
6858 
6859 #if KMP_OS_UNIX
6860 #if KMP_HANDLE_SIGNALS
6861   /*  must be after __kmp_serial_initialize  */
6862   __kmp_install_signals(TRUE);
6863 #endif
6864 #endif
6865 
6866   __kmp_suspend_initialize();
6867 
6868 #if defined(USE_LOAD_BALANCE)
6869   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6870     __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
6871   }
6872 #else
6873   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6874     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
6875   }
6876 #endif
6877 
6878   if (__kmp_version) {
6879     __kmp_print_version_2();
6880   }
6881 
6882   /* we have finished parallel initialization */
6883   TCW_SYNC_4(__kmp_init_parallel, TRUE);
6884 
6885   KMP_MB();
6886   KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
6887 
6888   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6889 }
6890 
6891 /* ------------------------------------------------------------------------ */
6892 
6893 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
6894                                    kmp_team_t *team) {
6895   kmp_disp_t *dispatch;
6896 
6897   KMP_MB();
6898 
6899   /* none of the threads have encountered any constructs, yet. */
6900   this_thr->th.th_local.this_construct = 0;
6901 #if KMP_CACHE_MANAGE
6902   KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
6903 #endif /* KMP_CACHE_MANAGE */
6904   dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
6905   KMP_DEBUG_ASSERT(dispatch);
6906   KMP_DEBUG_ASSERT(team->t.t_dispatch);
6907   // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
6908   // this_thr->th.th_info.ds.ds_tid ] );
6909 
6910   dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
6911 #if OMP_45_ENABLED
6912   dispatch->th_doacross_buf_idx =
6913       0; /* reset the doacross dispatch buffer counter */
6914 #endif
6915   if (__kmp_env_consistency_check)
6916     __kmp_push_parallel(gtid, team->t.t_ident);
6917 
6918   KMP_MB(); /* Flush all pending memory write invalidates.  */
6919 }
6920 
6921 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
6922                                   kmp_team_t *team) {
6923   if (__kmp_env_consistency_check)
6924     __kmp_pop_parallel(gtid, team->t.t_ident);
6925 
6926   __kmp_finish_implicit_task(this_thr);
6927 }
6928 
6929 int __kmp_invoke_task_func(int gtid) {
6930   int rc;
6931   int tid = __kmp_tid_from_gtid(gtid);
6932   kmp_info_t *this_thr = __kmp_threads[gtid];
6933   kmp_team_t *team = this_thr->th.th_team;
6934 
6935   __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
6936 #if USE_ITT_BUILD
6937   if (__itt_stack_caller_create_ptr) {
6938     __kmp_itt_stack_callee_enter(
6939         (__itt_caller)
6940             team->t.t_stack_id); // inform ittnotify about entering user's code
6941   }
6942 #endif /* USE_ITT_BUILD */
6943 #if INCLUDE_SSC_MARKS
6944   SSC_MARK_INVOKING();
6945 #endif
6946 
6947 #if OMPT_SUPPORT
6948   void *dummy;
6949   void **exit_runtime_p;
6950   ompt_data_t *my_task_data;
6951   ompt_data_t *my_parallel_data;
6952   int ompt_team_size;
6953 
6954   if (ompt_enabled.enabled) {
6955     exit_runtime_p = &(
6956         team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_frame);
6957   } else {
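    // OMPT disabled: aim exit_runtime_p at a local throwaway slot so the
    // microtask invocation below does not need a separate code path.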
6958     exit_runtime_p = &dummy;
6959   }
6960 
6961   my_task_data =
6962       &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
6963   my_parallel_data = &(team->t.ompt_team_info.parallel_data);
6964   if (ompt_enabled.ompt_callback_implicit_task) {
6965     ompt_team_size = team->t.t_nproc;
6966     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
6967         ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
6968         __kmp_tid_from_gtid(gtid));
6969   }
6970 #endif
6971 
6972   {
6973     KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
6974     KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
6975     rc =
6976         __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
6977                                tid, (int)team->t.t_argc, (void **)team->t.t_argv
6978 #if OMPT_SUPPORT
6979                                ,
6980                                exit_runtime_p
6981 #endif
6982                                );
6983 #if OMPT_SUPPORT
6984     *exit_runtime_p = NULL;
6985 #endif
6986   }
6987 
6988 #if USE_ITT_BUILD
6989   if (__itt_stack_caller_create_ptr) {
6990     __kmp_itt_stack_callee_leave(
6991         (__itt_caller)
6992             team->t.t_stack_id); // inform ittnotify about leaving user's code
6993   }
6994 #endif /* USE_ITT_BUILD */
6995   __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
6996 
6997   return rc;
6998 }
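
/* Illustrative sketch (not part of the runtime): __kmp_invoke_microtask above
   dispatches into the compiler-outlined body of the parallel region. Assuming
   the usual kmpc outlining convention, that body looks roughly like

     void outlined_parallel_body(kmp_int32 *global_tid, kmp_int32 *bound_tid,
                                 int *shared_var) { // name and extra argument
       // are hypothetical; the user code of the parallel region goes here
     }

   and receives gtid/tid plus the entries of team->t.t_argv as arguments. */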
6999 
7000 #if OMP_40_ENABLED
7001 void __kmp_teams_master(int gtid) {
7002   // This routine is called by all master threads in teams construct
7003   kmp_info_t *thr = __kmp_threads[gtid];
7004   kmp_team_t *team = thr->th.th_team;
7005   ident_t *loc = team->t.t_ident;
7006   thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7007   KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7008   KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7009   KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7010                 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
// Launch the league of teams now, but do not let the workers execute
// (they hang on the fork barrier until the next parallel region)
7013 #if INCLUDE_SSC_MARKS
7014   SSC_MARK_FORKING();
7015 #endif
7016   __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7017                   (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7018                   VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7019 #if INCLUDE_SSC_MARKS
7020   SSC_MARK_JOINING();
7021 #endif
7022 
  // AC: last parameter "1" eliminates the join barrier, which won't work here
  // because the worker threads are at the fork barrier waiting for more
  // parallel regions
7025   __kmp_join_call(loc, gtid
7026 #if OMPT_SUPPORT
7027                   ,
7028                   fork_context_intel
7029 #endif
7030                   ,
7031                   1);
7032 }
7033 
7034 int __kmp_invoke_teams_master(int gtid) {
7035   kmp_info_t *this_thr = __kmp_threads[gtid];
7036   kmp_team_t *team = this_thr->th.th_team;
7037 #if KMP_DEBUG
7038   if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7039     KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7040                      (void *)__kmp_teams_master);
7041 #endif
7042   __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7043   __kmp_teams_master(gtid);
7044   __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7045   return 1;
7046 }
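
/* Illustrative sketch (user code, not part of the runtime): the teams path
   above is reached when the compiler lowers a teams construct such as

     #pragma omp teams num_teams(4) thread_limit(8)
     {
       #pragma omp parallel
       { ... }
     }

   The master thread of each team runs __kmp_invoke_teams_master(), which in
   turn calls __kmp_teams_master() to fork the league via __kmp_fork_call()
   and to join it without a join barrier. */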
7047 #endif /* OMP_40_ENABLED */
7048 
/* This sets the requested number of threads for the next parallel region
   encountered by this team. Since this should be enclosed in the fork/join
   critical section, it should avoid race conditions with asymmetrical nested
   parallelism. */
7053 
7054 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7055   kmp_info_t *thr = __kmp_threads[gtid];
7056 
7057   if (num_threads > 0)
7058     thr->th.th_set_nproc = num_threads;
7059 }
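
/* Illustrative sketch (assumed compiler lowering, not part of this file): a
   num_threads clause typically reaches this routine just before the fork,

     #pragma omp parallel num_threads(4)
     { ... }

   being lowered roughly into

     __kmpc_push_num_threads(&loc, gtid, 4); // -> __kmp_push_num_threads
     __kmpc_fork_call(&loc, argc, outlined_body, ...);

   so th_set_nproc is consumed by the immediately following fork. */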
7060 
7061 #if OMP_40_ENABLED
7062 
7063 /* this sets the requested number of teams for the teams region and/or
7064    the number of threads for the next parallel region encountered  */
7065 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7066                           int num_threads) {
7067   kmp_info_t *thr = __kmp_threads[gtid];
7068   KMP_DEBUG_ASSERT(num_teams >= 0);
7069   KMP_DEBUG_ASSERT(num_threads >= 0);
7070 
7071   if (num_teams == 0)
7072     num_teams = 1; // default number of teams is 1.
  if (num_teams > __kmp_teams_max_nth) { // too many teams requested?
7074     if (!__kmp_reserve_warn) {
7075       __kmp_reserve_warn = 1;
7076       __kmp_msg(kmp_ms_warning,
7077                 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7078                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7079     }
7080     num_teams = __kmp_teams_max_nth;
7081   }
7082   // Set number of teams (number of threads in the outer "parallel" of the
7083   // teams)
7084   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7085 
7086   // Remember the number of threads for inner parallel regions
7087   if (num_threads == 0) {
7088     if (!TCR_4(__kmp_init_middle))
7089       __kmp_middle_initialize(); // get __kmp_avail_proc calculated
7090     num_threads = __kmp_avail_proc / num_teams;
7091     if (num_teams * num_threads > __kmp_teams_max_nth) {
      // adjust num_threads without a warning since it is not a user setting
7093       num_threads = __kmp_teams_max_nth / num_teams;
7094     }
7095   } else {
7096     if (num_teams * num_threads > __kmp_teams_max_nth) {
7097       int new_threads = __kmp_teams_max_nth / num_teams;
7098       if (!__kmp_reserve_warn) { // user asked for too many threads
7099         __kmp_reserve_warn = 1; // that conflicts with KMP_TEAMS_THREAD_LIMIT
7100         __kmp_msg(kmp_ms_warning,
7101                   KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7102                   KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7103       }
7104       num_threads = new_threads;
7105     }
7106   }
7107   thr->th.th_teams_size.nth = num_threads;
7108 }
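
/* Worked example of the defaulting logic above (numbers are illustrative):
   with __kmp_avail_proc == 32 and __kmp_teams_max_nth == 64, a request of
   num_teams == 8, num_threads == 0 gives
     num_threads = 32 / 8 = 4   (8 * 4 = 32 <= 64, so no clamping),
   while an explicit num_teams == 8, num_threads == 16 gives
     8 * 16 = 128 > 64, so num_threads is clamped to 64 / 8 = 8
   and a one-time warning is issued. */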
7109 
7110 // Set the proc_bind var to use in the following parallel region.
7111 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7112   kmp_info_t *thr = __kmp_threads[gtid];
7113   thr->th.th_set_proc_bind = proc_bind;
7114 }
7115 
7116 #endif /* OMP_40_ENABLED */
7117 
7118 /* Launch the worker threads into the microtask. */
7119 
7120 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7121   kmp_info_t *this_thr = __kmp_threads[gtid];
7122 
7123 #ifdef KMP_DEBUG
7124   int f;
7125 #endif /* KMP_DEBUG */
7126 
7127   KMP_DEBUG_ASSERT(team);
7128   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7129   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7130   KMP_MB(); /* Flush all pending memory write invalidates.  */
7131 
7132   team->t.t_construct = 0; /* no single directives seen yet */
7133   team->t.t_ordered.dt.t_value =
7134       0; /* thread 0 enters the ordered section first */
7135 
7136   /* Reset the identifiers on the dispatch buffer */
7137   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7138   if (team->t.t_max_nproc > 1) {
7139     int i;
7140     for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7141       team->t.t_disp_buffer[i].buffer_index = i;
7142 #if OMP_45_ENABLED
7143       team->t.t_disp_buffer[i].doacross_buf_idx = i;
7144 #endif
7145     }
7146   } else {
7147     team->t.t_disp_buffer[0].buffer_index = 0;
7148 #if OMP_45_ENABLED
7149     team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7150 #endif
7151   }
7152 
7153   KMP_MB(); /* Flush all pending memory write invalidates.  */
7154   KMP_ASSERT(this_thr->th.th_team == team);
7155 
7156 #ifdef KMP_DEBUG
7157   for (f = 0; f < team->t.t_nproc; f++) {
7158     KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7159                      team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7160   }
7161 #endif /* KMP_DEBUG */
7162 
7163   /* release the worker threads so they may begin working */
7164   __kmp_fork_barrier(gtid, 0);
7165 }
7166 
7167 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7168   kmp_info_t *this_thr = __kmp_threads[gtid];
7169 
7170   KMP_DEBUG_ASSERT(team);
7171   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7172   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7173   KMP_MB(); /* Flush all pending memory write invalidates.  */
7174 
7175 /* Join barrier after fork */
7176 
7177 #ifdef KMP_DEBUG
7178   if (__kmp_threads[gtid] &&
7179       __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7180     __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7181                  __kmp_threads[gtid]);
7182     __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7183                  "team->t.t_nproc=%d\n",
7184                  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7185                  team->t.t_nproc);
7186     __kmp_print_structure();
7187   }
7188   KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7189                    __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7190 #endif /* KMP_DEBUG */
7191 
7192   __kmp_join_barrier(gtid); /* wait for everyone */
7193 #if OMPT_SUPPORT
7194   int ds_tid = this_thr->th.th_info.ds.ds_tid;
7195   if (this_thr->th.ompt_thread_info.state == omp_state_wait_barrier_implicit) {
7196     ompt_data_t *tId = OMPT_CUR_TASK_DATA(this_thr);
7197     ompt_data_t *pId = OMPT_CUR_TEAM_DATA(this_thr);
7198     this_thr->th.ompt_thread_info.state = omp_state_overhead;
7199 #if OMPT_OPTIONAL
7200     void *codeptr = NULL;
7201     if (KMP_MASTER_TID(ds_tid) &&
7202         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7203          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7204       codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7205 
7206     if (ompt_enabled.ompt_callback_sync_region_wait) {
7207       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7208           ompt_sync_region_barrier, ompt_scope_end, pId, tId, codeptr);
7209     }
7210     if (ompt_enabled.ompt_callback_sync_region) {
7211       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7212           ompt_sync_region_barrier, ompt_scope_end, pId, tId, codeptr);
7213     }
7214 #endif
7215     if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7216       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7217           ompt_scope_end, NULL, tId, 0, ds_tid);
7218     }
7219     // return to idle state
7220     this_thr->th.ompt_thread_info.state = omp_state_overhead;
7221   }
7222 #endif
7223 
7224   KMP_MB(); /* Flush all pending memory write invalidates.  */
7225   KMP_ASSERT(this_thr->th.th_team == team);
7226 }
7227 
7228 /* ------------------------------------------------------------------------ */
7229 
7230 #ifdef USE_LOAD_BALANCE
7231 
// Return the number of worker threads actively spinning in the hot team, if
// we are at the outermost level of parallelism. Otherwise, return 0.
7234 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7235   int i;
7236   int retval;
7237   kmp_team_t *hot_team;
7238 
7239   if (root->r.r_active) {
7240     return 0;
7241   }
7242   hot_team = root->r.r_hot_team;
7243   if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7244     return hot_team->t.t_nproc - 1; // Don't count master thread
7245   }
7246 
7247   // Skip the master thread - it is accounted for elsewhere.
7248   retval = 0;
7249   for (i = 1; i < hot_team->t.t_nproc; i++) {
7250     if (hot_team->t.t_threads[i]->th.th_active) {
7251       retval++;
7252     }
7253   }
7254   return retval;
7255 }
7256 
7257 // Perform an automatic adjustment to the number of
7258 // threads used by the next parallel region.
7259 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7260   int retval;
7261   int pool_active;
7262   int hot_team_active;
7263   int team_curr_active;
7264   int system_active;
7265 
7266   KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7267                 set_nproc));
7268   KMP_DEBUG_ASSERT(root);
7269   KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7270                        ->th.th_current_task->td_icvs.dynamic == TRUE);
7271   KMP_DEBUG_ASSERT(set_nproc > 1);
7272 
7273   if (set_nproc == 1) {
7274     KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7275     return 1;
7276   }
7277 
  // Threads that are active in the thread pool, active in the hot team for
  // this particular root (if we are at the outermost parallel level), and the
  // currently executing thread (which will become the master) are available
  // to add to the new team, but they are currently contributing to the system
  // load and must be accounted for.
7283   pool_active = TCR_4(__kmp_thread_pool_active_nth);
7284   hot_team_active = __kmp_active_hot_team_nproc(root);
7285   team_curr_active = pool_active + hot_team_active + 1;
7286 
7287   // Check the system load.
7288   system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7289   KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7290                 "hot team active = %d\n",
7291                 system_active, pool_active, hot_team_active));
7292 
7293   if (system_active < 0) {
7294     // There was an error reading the necessary info from /proc, so use the
7295     // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7296     // = dynamic_thread_limit, we shouldn't wind up getting back here.
7297     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7298     KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7299 
7300     // Make this call behave like the thread limit algorithm.
7301     retval = __kmp_avail_proc - __kmp_nth +
7302              (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7303     if (retval > set_nproc) {
7304       retval = set_nproc;
7305     }
7306     if (retval < KMP_MIN_NTH) {
7307       retval = KMP_MIN_NTH;
7308     }
7309 
7310     KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7311                   retval));
7312     return retval;
7313   }
7314 
  // There is a slight delay in the load balance algorithm in detecting new
  // running procs. The real system load at this instant should be at least as
  // large as the number of active OMP threads available to add to the team.
7318   if (system_active < team_curr_active) {
7319     system_active = team_curr_active;
7320   }
7321   retval = __kmp_avail_proc - system_active + team_curr_active;
7322   if (retval > set_nproc) {
7323     retval = set_nproc;
7324   }
7325   if (retval < KMP_MIN_NTH) {
7326     retval = KMP_MIN_NTH;
7327   }
7328 
7329   KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7330   return retval;
7331 } // __kmp_load_balance_nproc()
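
// Worked example of the formula above (numbers are illustrative): with
// __kmp_avail_proc == 16, pool_active == 2 and hot_team_active == 3, the
// currently executing thread gives team_curr_active == 6. If
// __kmp_get_load_balance() reports system_active == 10, then
//   retval = 16 - 10 + 6 = 12,
// which is then clamped to the range [KMP_MIN_NTH, set_nproc].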
7332 
7333 #endif /* USE_LOAD_BALANCE */
7334 
7335 /* ------------------------------------------------------------------------ */
7336 
7337 /* NOTE: this is called with the __kmp_init_lock held */
7338 void __kmp_cleanup(void) {
7339   int f;
7340 
7341   KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7342 
7343   if (TCR_4(__kmp_init_parallel)) {
7344 #if KMP_HANDLE_SIGNALS
7345     __kmp_remove_signals();
7346 #endif
7347     TCW_4(__kmp_init_parallel, FALSE);
7348   }
7349 
7350   if (TCR_4(__kmp_init_middle)) {
7351 #if KMP_AFFINITY_SUPPORTED
7352     __kmp_affinity_uninitialize();
7353 #endif /* KMP_AFFINITY_SUPPORTED */
7354     __kmp_cleanup_hierarchy();
7355     TCW_4(__kmp_init_middle, FALSE);
7356   }
7357 
7358   KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7359 
7360   if (__kmp_init_serial) {
7361     __kmp_runtime_destroy();
7362     __kmp_init_serial = FALSE;
7363   }
7364 
7365   for (f = 0; f < __kmp_threads_capacity; f++) {
7366     if (__kmp_root[f] != NULL) {
7367       __kmp_free(__kmp_root[f]);
7368       __kmp_root[f] = NULL;
7369     }
7370   }
7371   __kmp_free(__kmp_threads);
  // __kmp_threads and __kmp_root were allocated as a single block, so there is
  // no need to free __kmp_root separately.
7374   __kmp_threads = NULL;
7375   __kmp_root = NULL;
7376   __kmp_threads_capacity = 0;
7377 
7378 #if KMP_USE_DYNAMIC_LOCK
7379   __kmp_cleanup_indirect_user_locks();
7380 #else
7381   __kmp_cleanup_user_locks();
7382 #endif
7383 
7384 #if KMP_AFFINITY_SUPPORTED
7385   KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7386   __kmp_cpuinfo_file = NULL;
7387 #endif /* KMP_AFFINITY_SUPPORTED */
7388 
7389 #if KMP_USE_ADAPTIVE_LOCKS
7390 #if KMP_DEBUG_ADAPTIVE_LOCKS
7391   __kmp_print_speculative_stats();
7392 #endif
7393 #endif
7394   KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7395   __kmp_nested_nth.nth = NULL;
7396   __kmp_nested_nth.size = 0;
7397   __kmp_nested_nth.used = 0;
7398   KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7399   __kmp_nested_proc_bind.bind_types = NULL;
7400   __kmp_nested_proc_bind.size = 0;
7401   __kmp_nested_proc_bind.used = 0;
7402 
7403   __kmp_i18n_catclose();
7404 
7405 #if KMP_STATS_ENABLED
7406   __kmp_stats_fini();
7407 #endif
7408 
7409   KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7410 }
7411 
7412 /* ------------------------------------------------------------------------ */
7413 
7414 int __kmp_ignore_mppbeg(void) {
7415   char *env;
7416 
7417   if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7418     if (__kmp_str_match_false(env))
7419       return FALSE;
7420   }
7421   // By default __kmpc_begin() is no-op.
7422   return TRUE;
7423 }
7424 
7425 int __kmp_ignore_mppend(void) {
7426   char *env;
7427 
7428   if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7429     if (__kmp_str_match_false(env))
7430       return FALSE;
7431   }
7432   // By default __kmpc_end() is no-op.
7433   return TRUE;
7434 }
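
/* Usage note (derived from the checks above): by default both __kmpc_begin()
   and __kmpc_end() are no-ops. Setting KMP_IGNORE_MPPBEG or KMP_IGNORE_MPPEND
   in the environment to a "false" value (e.g. "0" or "false") makes the
   corresponding entry point perform its work. */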
7435 
7436 void __kmp_internal_begin(void) {
7437   int gtid;
7438   kmp_root_t *root;
7439 
  /* This is a very important step, as it registers new sibling threads and
     assigns these new uber threads a new gtid. */
7442   gtid = __kmp_entry_gtid();
7443   root = __kmp_threads[gtid]->th.th_root;
7444   KMP_ASSERT(KMP_UBER_GTID(gtid));
7445 
7446   if (root->r.r_begin)
7447     return;
7448   __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7449   if (root->r.r_begin) {
7450     __kmp_release_lock(&root->r.r_begin_lock, gtid);
7451     return;
7452   }
7453 
7454   root->r.r_begin = TRUE;
7455 
7456   __kmp_release_lock(&root->r.r_begin_lock, gtid);
7457 }
7458 
7459 /* ------------------------------------------------------------------------ */
7460 
7461 void __kmp_user_set_library(enum library_type arg) {
7462   int gtid;
7463   kmp_root_t *root;
7464   kmp_info_t *thread;
7465 
7466   /* first, make sure we are initialized so we can get our gtid */
7467 
7468   gtid = __kmp_entry_gtid();
7469   thread = __kmp_threads[gtid];
7470 
7471   root = thread->th.th_root;
7472 
7473   KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7474                 library_serial));
7475   if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7476                                   thread */
7477     KMP_WARNING(SetLibraryIncorrectCall);
7478     return;
7479   }
7480 
7481   switch (arg) {
7482   case library_serial:
7483     thread->th.th_set_nproc = 0;
7484     set__nproc(thread, 1);
7485     break;
7486   case library_turnaround:
7487     thread->th.th_set_nproc = 0;
7488     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7489                                            : __kmp_dflt_team_nth_ub);
7490     break;
7491   case library_throughput:
7492     thread->th.th_set_nproc = 0;
7493     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7494                                            : __kmp_dflt_team_nth_ub);
7495     break;
7496   default:
7497     KMP_FATAL(UnknownLibraryType, arg);
7498   }
7499 
7500   __kmp_aux_set_library(arg);
7501 }
7502 
7503 void __kmp_aux_set_stacksize(size_t arg) {
7504   if (!__kmp_init_serial)
7505     __kmp_serial_initialize();
7506 
7507 #if KMP_OS_DARWIN
7508   if (arg & (0x1000 - 1)) {
7509     arg &= ~(0x1000 - 1);
7510     if (arg + 0x1000) /* check for overflow if we round up */
7511       arg += 0x1000;
7512   }
7513 #endif
7514   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7515 
7516   /* only change the default stacksize before the first parallel region */
7517   if (!TCR_4(__kmp_init_parallel)) {
7518     size_t value = arg; /* argument is in bytes */
7519 
7520     if (value < __kmp_sys_min_stksize)
7521       value = __kmp_sys_min_stksize;
7522     else if (value > KMP_MAX_STKSIZE)
7523       value = KMP_MAX_STKSIZE;
7524 
7525     __kmp_stksize = value;
7526 
7527     __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
7528   }
7529 
7530   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7531 }
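
/* Worked example of the KMP_OS_DARWIN rounding above (illustrative): a
   request of arg == 0x1800 (6 KiB) is not page aligned, so it is first
   truncated to 0x1000 and then bumped to 0x2000 (8 KiB), i.e. rounded up to
   the next 4 KiB page boundary, before the min/max clamping is applied. */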
7532 
7533 /* set the behaviour of the runtime library */
7534 /* TODO this can cause some odd behaviour with sibling parallelism... */
7535 void __kmp_aux_set_library(enum library_type arg) {
7536   __kmp_library = arg;
7537 
7538   switch (__kmp_library) {
7539   case library_serial: {
7540     KMP_INFORM(LibraryIsSerial);
7541     (void)__kmp_change_library(TRUE);
7542   } break;
7543   case library_turnaround:
7544     (void)__kmp_change_library(TRUE);
7545     break;
7546   case library_throughput:
7547     (void)__kmp_change_library(FALSE);
7548     break;
7549   default:
7550     KMP_FATAL(UnknownLibraryType, arg);
7551   }
7552 }
7553 
7554 /* ------------------------------------------------------------------------ */
7555 
7556 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
7557   int blocktime = arg; /* argument is in milliseconds */
7558 #if KMP_USE_MONITOR
7559   int bt_intervals;
7560 #endif
7561   int bt_set;
7562 
7563   __kmp_save_internal_controls(thread);
7564 
7565   /* Normalize and set blocktime for the teams */
7566   if (blocktime < KMP_MIN_BLOCKTIME)
7567     blocktime = KMP_MIN_BLOCKTIME;
7568   else if (blocktime > KMP_MAX_BLOCKTIME)
7569     blocktime = KMP_MAX_BLOCKTIME;
7570 
7571   set__blocktime_team(thread->th.th_team, tid, blocktime);
7572   set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
7573 
7574 #if KMP_USE_MONITOR
7575   /* Calculate and set blocktime intervals for the teams */
7576   bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
7577 
7578   set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
7579   set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
7580 #endif
7581 
  /* Record that the blocktime has been explicitly set */
7583   bt_set = TRUE;
7584 
7585   set__bt_set_team(thread->th.th_team, tid, bt_set);
7586   set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
7587 #if KMP_USE_MONITOR
7588   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
7589                 "bt_intervals=%d, monitor_updates=%d\n",
7590                 __kmp_gtid_from_tid(tid, thread->th.th_team),
7591                 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
7592                 __kmp_monitor_wakeups));
7593 #else
7594   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
7595                 __kmp_gtid_from_tid(tid, thread->th.th_team),
7596                 thread->th.th_team->t.t_id, tid, blocktime));
7597 #endif
7598 }
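
/* Illustrative usage sketch (user code, assuming the kmp_* extension API is
   available; not part of this file): this routine backs the user-level
   blocktime setting, reachable roughly as

     #include <omp.h>
     kmp_set_blocktime(0); // make idle workers yield/sleep immediately

   or via the KMP_BLOCKTIME environment variable; out-of-range values are
   clamped to [KMP_MIN_BLOCKTIME, KMP_MAX_BLOCKTIME] as shown above. */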
7599 
7600 void __kmp_aux_set_defaults(char const *str, int len) {
7601   if (!__kmp_init_serial) {
7602     __kmp_serial_initialize();
7603   }
7604   __kmp_env_initialize(str);
7605 
7606   if (__kmp_settings
7607 #if OMP_40_ENABLED
7608       || __kmp_display_env || __kmp_display_env_verbose
7609 #endif // OMP_40_ENABLED
7610       ) {
7611     __kmp_env_print();
7612   }
7613 } // __kmp_aux_set_defaults
7614 
7615 /* ------------------------------------------------------------------------ */
7616 /* internal fast reduction routines */
7617 
7618 PACKED_REDUCTION_METHOD_T
7619 __kmp_determine_reduction_method(
7620     ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
7621     void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
7622     kmp_critical_name *lck) {
7623 
  // Default reduction method: the critical construct (lck != NULL, as in the
  // current PAROPT).
  // If (reduce_data != NULL && reduce_func != NULL): the tree-reduction method
  // can be selected by the RTL.
  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE: the atomic reduce method
  // can be selected by the RTL.
  // Finally, it is up to the OpenMP RTL to decide which method to select among
  // those generated by PAROPT.
7632 
7633   PACKED_REDUCTION_METHOD_T retval;
7634 
7635   int team_size;
7636 
7637   KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
7638   KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
7639 
7640 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
7641   ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
7642 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
7643 
7644   retval = critical_reduce_block;
7645 
  // another way of getting the team size (one dynamic dereference) is slower
7647   team_size = __kmp_get_team_num_threads(global_tid);
7648   if (team_size == 1) {
7649 
7650     retval = empty_reduce_block;
7651 
7652   } else {
7653 
7654     int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
7655     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
7656 
7657 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64
7658 
7659 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_WINDOWS ||       \
7660     KMP_OS_DARWIN
7661 
7662     int teamsize_cutoff = 4;
7663 
7664 #if KMP_MIC_SUPPORTED
7665     if (__kmp_mic_type != non_mic) {
7666       teamsize_cutoff = 8;
7667     }
7668 #endif
7669     if (tree_available) {
7670       if (team_size <= teamsize_cutoff) {
7671         if (atomic_available) {
7672           retval = atomic_reduce_block;
7673         }
7674       } else {
7675         retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
7676       }
7677     } else if (atomic_available) {
7678       retval = atomic_reduce_block;
7679     }
7680 #else
7681 #error "Unknown or unsupported OS"
7682 #endif // KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_WINDOWS ||
7683 // KMP_OS_DARWIN
7684 
7685 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
7686 
7687 #if KMP_OS_LINUX || KMP_OS_WINDOWS
7688 
7689     // basic tuning
7690 
7691     if (atomic_available) {
7692       if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
7693         retval = atomic_reduce_block;
7694       }
7695     } // otherwise: use critical section
7696 
7697 #elif KMP_OS_DARWIN
7698 
7699     if (atomic_available && (num_vars <= 3)) {
7700       retval = atomic_reduce_block;
7701     } else if (tree_available) {
7702       if ((reduce_size > (9 * sizeof(kmp_real64))) &&
7703           (reduce_size < (2000 * sizeof(kmp_real64)))) {
7704         retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
7705       }
7706     } // otherwise: use critical section
7707 
7708 #else
7709 #error "Unknown or unsupported OS"
7710 #endif
7711 
7712 #else
7713 #error "Unknown or unsupported architecture"
7714 #endif
7715   }
7716 
7717   // KMP_FORCE_REDUCTION
7718 
7719   // If the team is serialized (team_size == 1), ignore the forced reduction
7720   // method and stay with the unsynchronized method (empty_reduce_block)
7721   if (__kmp_force_reduction_method != reduction_method_not_defined &&
7722       team_size != 1) {
7723 
7724     PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
7725 
7726     int atomic_available, tree_available;
7727 
7728     switch ((forced_retval = __kmp_force_reduction_method)) {
7729     case critical_reduce_block:
7730       KMP_ASSERT(lck); // lck should be != 0
7731       break;
7732 
7733     case atomic_reduce_block:
7734       atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
7735       if (!atomic_available) {
7736         KMP_WARNING(RedMethodNotSupported, "atomic");
7737         forced_retval = critical_reduce_block;
7738       }
7739       break;
7740 
7741     case tree_reduce_block:
7742       tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
7743       if (!tree_available) {
7744         KMP_WARNING(RedMethodNotSupported, "tree");
7745         forced_retval = critical_reduce_block;
7746       } else {
7747 #if KMP_FAST_REDUCTION_BARRIER
7748         forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
7749 #endif
7750       }
7751       break;
7752 
7753     default:
7754       KMP_ASSERT(0); // "unsupported method specified"
7755     }
7756 
7757     retval = forced_retval;
7758   }
7759 
7760   KA_TRACE(10, ("reduction method selected=%08x\n", retval));
7761 
7762 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
7763 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
7764 
7765   return (retval);
7766 }
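
/* Illustrative examples of the selection above (assuming x86_64 Linux and the
   non-MIC teamsize_cutoff of 4; numbers are hypothetical):
   - team_size == 1: empty_reduce_block (no synchronization needed).
   - team_size == 4, KMP_IDENT_ATOMIC_REDUCE set, tree reduction generated:
     4 <= cutoff, so atomic_reduce_block is chosen.
   - team_size == 16, tree reduction generated: above the cutoff, so
     TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER is chosen.
   A forced method (__kmp_force_reduction_method) still overrides this, unless
   the team is serialized, subject to the availability checks above. */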
7767 
7768 // this function is for testing set/get/determine reduce method
7769 kmp_int32 __kmp_get_reduce_method(void) {
7770   return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
7771 }
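
// Note on the ">> 8" above: this assumes the PACKED_REDUCTION_METHOD_T
// encoding in kmp.h, where the reduction method occupies the upper bits and
// the barrier type (if any) the low byte, so the shift recovers the method.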
7772