1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 //                     The LLVM Compiler Infrastructure
8 //
9 // This file is dual licensed under the MIT and the University of Illinois Open
10 // Source Licenses. See LICENSE.txt for details.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "kmp.h"
15 #include "kmp_affinity.h"
16 #include "kmp_atomic.h"
17 #include "kmp_environment.h"
18 #include "kmp_error.h"
19 #include "kmp_i18n.h"
20 #include "kmp_io.h"
21 #include "kmp_itt.h"
22 #include "kmp_settings.h"
23 #include "kmp_stats.h"
24 #include "kmp_str.h"
25 #include "kmp_wait_release.h"
26 #include "kmp_wrapper_getpid.h"
27 
28 #if OMPT_SUPPORT
29 #include "ompt-specific.h"
30 #endif
31 
32 /* these are temporary issues to be dealt with */
33 #define KMP_USE_PRCTL 0
34 
35 #if KMP_OS_WINDOWS
36 #include <process.h>
37 #endif
38 
39 #include "tsan_annotations.h"
40 
41 #if defined(KMP_GOMP_COMPAT)
42 char const __kmp_version_alt_comp[] =
43     KMP_VERSION_PREFIX "alternative compiler support: yes";
44 #endif /* defined(KMP_GOMP_COMPAT) */
45 
46 char const __kmp_version_omp_api[] = KMP_VERSION_PREFIX "API version: "
47 #if OMP_50_ENABLED
48                                                         "5.0 (201611)";
49 #elif OMP_45_ENABLED
50                                                         "4.5 (201511)";
51 #elif OMP_40_ENABLED
52                                                         "4.0 (201307)";
53 #else
54                                                         "3.1 (201107)";
55 #endif
56 
57 #ifdef KMP_DEBUG
58 char const __kmp_version_lock[] =
59     KMP_VERSION_PREFIX "lock type: run time selectable";
60 #endif /* KMP_DEBUG */
61 
62 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
63 
64 /* ------------------------------------------------------------------------ */
65 
66 kmp_info_t __kmp_monitor;
67 
68 /* Forward declarations */
69 
70 void __kmp_cleanup(void);
71 
72 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
73                                   int gtid);
74 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
75                                   kmp_internal_control_t *new_icvs,
76                                   ident_t *loc);
77 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
78 static void __kmp_partition_places(kmp_team_t *team,
79                                    int update_master_only = 0);
80 #endif
81 static void __kmp_do_serial_initialize(void);
82 void __kmp_fork_barrier(int gtid, int tid);
83 void __kmp_join_barrier(int gtid);
84 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
85                           kmp_internal_control_t *new_icvs, ident_t *loc);
86 
87 #ifdef USE_LOAD_BALANCE
88 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
89 #endif
90 
91 static int __kmp_expand_threads(int nWish, int nNeed);
92 #if KMP_OS_WINDOWS
93 static int __kmp_unregister_root_other_thread(int gtid);
94 #endif
95 static void __kmp_unregister_library(void); // called by __kmp_internal_end()
96 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
97 static kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
98 
99 /* Calculate the identifier of the current thread */
100 /* A fast (and somewhat portable) way to get a unique identifier for the
101    executing thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
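/* The lookup below tries the fastest mechanism first: a dedicated thread-local
   variable (KMP_TDATA_GTID, __kmp_gtid_mode >= 3), then keyed TLS via
   __kmp_gtid_get_specific() (__kmp_gtid_mode >= 2), and finally a scan of the
   registered threads' recorded stack ranges. */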
102 int __kmp_get_global_thread_id() {
103   int i;
104   kmp_info_t **other_threads;
105   size_t stack_data;
106   char *stack_addr;
107   size_t stack_size;
108   char *stack_base;
109 
110   KA_TRACE(
111       1000,
112       ("*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
113        __kmp_nth, __kmp_all_nth));
114 
115   /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
116      a parallel region, this returns KMP_GTID_DNE to force serial_initialize by
117      the caller. All call sites must handle KMP_GTID_DNE, or else __kmp_init_gtid
118      must be guaranteed for this to work. */
119 
120   if (!TCR_4(__kmp_init_gtid))
121     return KMP_GTID_DNE;
122 
123 #ifdef KMP_TDATA_GTID
124   if (TCR_4(__kmp_gtid_mode) >= 3) {
125     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
126     return __kmp_gtid;
127   }
128 #endif
129   if (TCR_4(__kmp_gtid_mode) >= 2) {
130     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
131     return __kmp_gtid_get_specific();
132   }
133   KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
134 
135   stack_addr = (char *)&stack_data;
136   other_threads = __kmp_threads;
137 
138   /* ATT: The code below is a source of potential bugs due to unsynchronized
139      access to __kmp_threads array. For example:
140      1. Current thread loads other_threads[i] to thr and checks it, it is
141         non-NULL.
142      2. Current thread is suspended by OS.
143      3. Another thread unregisters and finishes (debug versions of free()
144         may fill memory with something like 0xEF).
145      4. Current thread is resumed.
146      5. Current thread reads junk from *thr.
147      TODO: Fix it.  --ln  */
148 
149   for (i = 0; i < __kmp_threads_capacity; i++) {
150 
151     kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
152     if (!thr)
153       continue;
154 
155     stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
156     stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
157 
158     /* stack grows down -- search through all of the active threads */
159 
160     if (stack_addr <= stack_base) {
161       size_t stack_diff = stack_base - stack_addr;
162 
163       if (stack_diff <= stack_size) {
164         /* The only way we can be closer than the allocated
165            stack size is if we are running on this thread. */
166         KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
167         return i;
168       }
169     }
170   }
171 
172   /* fall back to the thread-specific (keyed TLS) value to try to determine our gtid */
173   KA_TRACE(1000,
174            ("*** __kmp_get_global_thread_id: internal alg. failed to find "
175             "thread, using TLS\n"));
176   i = __kmp_gtid_get_specific();
177 
178   /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */
179 
180   /* if we haven't been assigned a gtid, then return the error code */
181   if (i < 0)
182     return i;
183 
184   /* dynamically updated stack window for uber threads to avoid get_specific
185      call */
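  /* If this uber thread's stack window is not allowed to grow, failing the scan
     above means the current stack address lies outside the recorded bounds, so
     this is treated as a stack overflow. */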
186   if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
187     KMP_FATAL(StackOverflow, i);
188   }
189 
190   stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
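  /* Grow the recorded stack window so it covers the current stack address: if
     the address lies above the recorded base, raise the base and extend the
     size by the difference; otherwise only the recorded size needs to grow. */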
191   if (stack_addr > stack_base) {
192     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
193     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
194             other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
195                 stack_base);
196   } else {
197     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
198             stack_base - stack_addr);
199   }
200 
201   /* Reprint stack bounds for ubermaster since they have been refined */
202   if (__kmp_storage_map) {
203     char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
204     char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
205     __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
206                                  other_threads[i]->th.th_info.ds.ds_stacksize,
207                                  "th_%d stack (refinement)", i);
208   }
209   return i;
210 }
211 
212 int __kmp_get_global_thread_id_reg() {
213   int gtid;
214 
215   if (!__kmp_init_serial) {
216     gtid = KMP_GTID_DNE;
217   } else
218 #ifdef KMP_TDATA_GTID
219       if (TCR_4(__kmp_gtid_mode) >= 3) {
220     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
221     gtid = __kmp_gtid;
222   } else
223 #endif
224       if (TCR_4(__kmp_gtid_mode) >= 2) {
225     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
226     gtid = __kmp_gtid_get_specific();
227   } else {
228     KA_TRACE(1000,
229              ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
230     gtid = __kmp_get_global_thread_id();
231   }
232 
233   /* we must be a new uber master sibling thread */
234   if (gtid == KMP_GTID_DNE) {
235     KA_TRACE(10,
236              ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
237               "Registering a new gtid.\n"));
238     __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
239     if (!__kmp_init_serial) {
240       __kmp_do_serial_initialize();
241       gtid = __kmp_gtid_get_specific();
242     } else {
243       gtid = __kmp_register_root(FALSE);
244     }
245     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
246     /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
247   }
248 
249   KMP_DEBUG_ASSERT(gtid >= 0);
250 
251   return gtid;
252 }
253 
254 /* caller must hold forkjoin_lock */
255 void __kmp_check_stack_overlap(kmp_info_t *th) {
256   int f;
257   char *stack_beg = NULL;
258   char *stack_end = NULL;
259   int gtid;
260 
261   KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
262   if (__kmp_storage_map) {
263     stack_end = (char *)th->th.th_info.ds.ds_stackbase;
264     stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
265 
266     gtid = __kmp_gtid_from_thread(th);
267 
268     if (gtid == KMP_GTID_MONITOR) {
269       __kmp_print_storage_map_gtid(
270           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
271           "th_%s stack (%s)", "mon",
272           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
273     } else {
274       __kmp_print_storage_map_gtid(
275           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
276           "th_%d stack (%s)", gtid,
277           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
278     }
279   }
280 
281   /* No point in checking ubermaster threads since they use refinement and
282    * cannot overlap */
283   gtid = __kmp_gtid_from_thread(th);
284   if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
285     KA_TRACE(10,
286              ("__kmp_check_stack_overlap: performing extensive checking\n"));
287     if (stack_beg == NULL) {
288       stack_end = (char *)th->th.th_info.ds.ds_stackbase;
289       stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
290     }
291 
292     for (f = 0; f < __kmp_threads_capacity; f++) {
293       kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
294 
295       if (f_th && f_th != th) {
296         char *other_stack_end =
297             (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
298         char *other_stack_beg =
299             other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
300         if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
301             (stack_end > other_stack_beg && stack_end < other_stack_end)) {
302 
303           /* Print the other stack values before the abort */
304           if (__kmp_storage_map)
305             __kmp_print_storage_map_gtid(
306                 -1, other_stack_beg, other_stack_end,
307                 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
308                 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
309 
310           __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
311                       __kmp_msg_null);
312         }
313       }
314     }
315   }
316   KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
317 }
318 
319 /* ------------------------------------------------------------------------ */
320 
321 void __kmp_infinite_loop(void) {
322   static int done = FALSE;
323 
324   while (!done) {
325     KMP_YIELD(1);
326   }
327 }
328 
329 #define MAX_MESSAGE 512
330 
331 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
332                                   char const *format, ...) {
333   char buffer[MAX_MESSAGE];
334   va_list ap;
335 
336   va_start(ap, format);
337   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
338                p2, (unsigned long)size, format);
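  // 'buffer' now holds the "OMP storage map:" prefix with the caller's format
  // string appended; the caller's variadic arguments are expanded against it by
  // __kmp_vprintf() below.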
339   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
340   __kmp_vprintf(kmp_err, buffer, ap);
341 #if KMP_PRINT_DATA_PLACEMENT
342   int node;
343   if (gtid >= 0) {
344     if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
345       if (__kmp_storage_map_verbose) {
346         node = __kmp_get_host_node(p1);
347         if (node < 0) /* doesn't work, so don't try this next time */
348           __kmp_storage_map_verbose = FALSE;
349         else {
350           char *last;
351           int lastNode;
352           int localProc = __kmp_get_cpu_from_gtid(gtid);
353 
354           const int page_size = KMP_GET_PAGE_SIZE();
355 
356           p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
357           p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
358           if (localProc >= 0)
359             __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
360                                  localProc >> 1);
361           else
362             __kmp_printf_no_lock("  GTID %d\n", gtid);
363 #if KMP_USE_PRCTL
364           /* The more elaborate format is disabled for now because of the prctl
365            * hanging bug. */
366           do {
367             last = p1;
368             lastNode = node;
369             /* This loop collates adjacent pages with the same host node. */
370             do {
371               (char *)p1 += page_size;
372             } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
373             __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
374                                  lastNode);
375           } while (p1 <= p2);
376 #else
377           __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
378                                (char *)p1 + (page_size - 1),
379                                __kmp_get_host_node(p1));
380           if (p1 < p2) {
381             __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
382                                  (char *)p2 + (page_size - 1),
383                                  __kmp_get_host_node(p2));
384           }
385 #endif
386         }
387       }
388     } else
389       __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
390   }
391 #endif /* KMP_PRINT_DATA_PLACEMENT */
392   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
393 }
394 
395 void __kmp_warn(char const *format, ...) {
396   char buffer[MAX_MESSAGE];
397   va_list ap;
398 
399   if (__kmp_generate_warnings == kmp_warnings_off) {
400     return;
401   }
402 
403   va_start(ap, format);
404 
405   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
406   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
407   __kmp_vprintf(kmp_err, buffer, ap);
408   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
409 
410   va_end(ap);
411 }
412 
413 void __kmp_abort_process() {
414   // Later threads may stall here, but that's ok because abort() will kill them.
415   __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
416 
417   if (__kmp_debug_buf) {
418     __kmp_dump_debug_buffer();
419   }
420 
421   if (KMP_OS_WINDOWS) {
422     // Let other threads know of abnormal termination and prevent deadlock
423     // if abort happened during library initialization or shutdown
424     __kmp_global.g.g_abort = SIGABRT;
425 
426     /* On Windows* OS, by default abort() causes a pop-up error box, which stalls
427        nightly testing. Unfortunately, we cannot reliably suppress pop-up error
428        boxes. _set_abort_behavior() works well, but this function is not
429        available in VS7 (this is not a problem for a DLL, but it is a problem for
430        a static OpenMP RTL). SetErrorMode (and so, the timelimit utility) does not
431        help, at least in some versions of the MS C RTL.
432 
433        It seems the following sequence is the only way to simulate abort() and
434        avoid the pop-up error box. */
435     raise(SIGABRT);
436     _exit(3); // Just in case, if signal ignored, exit anyway.
437   } else {
438     abort();
439   }
440 
441   __kmp_infinite_loop();
442   __kmp_release_bootstrap_lock(&__kmp_exit_lock);
443 
444 } // __kmp_abort_process
445 
446 void __kmp_abort_thread(void) {
447   // TODO: Eliminate g_abort global variable and this function.
448   // In case of abort just call abort(), it will kill all the threads.
449   __kmp_infinite_loop();
450 } // __kmp_abort_thread
451 
452 /* Print out the storage map for the major kmp_info_t thread data structures
453    that are allocated together. */
454 
455 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
456   __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
457                                gtid);
458 
459   __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
460                                sizeof(kmp_desc_t), "th_%d.th_info", gtid);
461 
462   __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
463                                sizeof(kmp_local_t), "th_%d.th_local", gtid);
464 
465   __kmp_print_storage_map_gtid(
466       gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
467       sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
468 
469   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
470                                &thr->th.th_bar[bs_plain_barrier + 1],
471                                sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
472                                gtid);
473 
474   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
475                                &thr->th.th_bar[bs_forkjoin_barrier + 1],
476                                sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
477                                gtid);
478 
479 #if KMP_FAST_REDUCTION_BARRIER
480   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
481                                &thr->th.th_bar[bs_reduction_barrier + 1],
482                                sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
483                                gtid);
484 #endif // KMP_FAST_REDUCTION_BARRIER
485 }
486 
487 /* Print out the storage map for the major kmp_team_t team data structures
488    that are allocated together. */
489 
490 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
491                                          int team_id, int num_thr) {
492   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
493   __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
494                                header, team_id);
495 
496   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
497                                &team->t.t_bar[bs_last_barrier],
498                                sizeof(kmp_balign_team_t) * bs_last_barrier,
499                                "%s_%d.t_bar", header, team_id);
500 
501   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
502                                &team->t.t_bar[bs_plain_barrier + 1],
503                                sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
504                                header, team_id);
505 
506   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
507                                &team->t.t_bar[bs_forkjoin_barrier + 1],
508                                sizeof(kmp_balign_team_t),
509                                "%s_%d.t_bar[forkjoin]", header, team_id);
510 
511 #if KMP_FAST_REDUCTION_BARRIER
512   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
513                                &team->t.t_bar[bs_reduction_barrier + 1],
514                                sizeof(kmp_balign_team_t),
515                                "%s_%d.t_bar[reduction]", header, team_id);
516 #endif // KMP_FAST_REDUCTION_BARRIER
517 
518   __kmp_print_storage_map_gtid(
519       -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
520       sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
521 
522   __kmp_print_storage_map_gtid(
523       -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
524       sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
525 
526   __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
527                                &team->t.t_disp_buffer[num_disp_buff],
528                                sizeof(dispatch_shared_info_t) * num_disp_buff,
529                                "%s_%d.t_disp_buffer", header, team_id);
530 
531   __kmp_print_storage_map_gtid(-1, &team->t.t_taskq, &team->t.t_copypriv_data,
532                                sizeof(kmp_taskq_t), "%s_%d.t_taskq", header,
533                                team_id);
534 }
535 
536 static void __kmp_init_allocator() {}
537 static void __kmp_fini_allocator() {}
538 
539 /* ------------------------------------------------------------------------ */
540 
541 #ifdef KMP_DYNAMIC_LIB
542 #if KMP_OS_WINDOWS
543 
544 static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
545   // TODO: Change to __kmp_break_bootstrap_lock().
546   __kmp_init_bootstrap_lock(lck); // make the lock released
547 }
548 
549 static void __kmp_reset_locks_on_process_detach(int gtid_req) {
550   int i;
551   int thread_count;
552 
553   // PROCESS_DETACH is expected to be called by a thread that executes
554   // ProcessExit() or FreeLibrary(). The OS terminates the other threads (except
555   // the one calling ProcessExit or FreeLibrary), so it might seem safe to access
556   // __kmp_threads[] without taking the forkjoin_lock. In practice, however, some
557   // threads may still be alive here, even though they are about to be
558   // terminated. The threads in the array with ds_thread==0 are the most
559   // suspicious, so accessing __kmp_threads[] may not actually be safe.
560 
561   // TODO: does it make sense to check __kmp_roots[] ?
562 
563   // Check that there are no other live threads registered with the OpenMP
564   // library.
565   while (1) {
566     thread_count = 0;
567     for (i = 0; i < __kmp_threads_capacity; ++i) {
568       if (!__kmp_threads)
569         continue;
570       kmp_info_t *th = __kmp_threads[i];
571       if (th == NULL)
572         continue;
573       int gtid = th->th.th_info.ds.ds_gtid;
574       if (gtid == gtid_req)
575         continue;
576       if (gtid < 0)
577         continue;
578       DWORD exit_val;
579       int alive = __kmp_is_thread_alive(th, &exit_val);
580       if (alive) {
581         ++thread_count;
582       }
583     }
584     if (thread_count == 0)
585       break; // success
586   }
587 
588   // Assume that I'm alone. Now it might be safe to check and reset locks.
589   // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
590   __kmp_reset_lock(&__kmp_forkjoin_lock);
591 #ifdef KMP_DEBUG
592   __kmp_reset_lock(&__kmp_stdio_lock);
593 #endif // KMP_DEBUG
594 }
595 
596 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
597   //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
598 
599   switch (fdwReason) {
600 
601   case DLL_PROCESS_ATTACH:
602     KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
603 
604     return TRUE;
605 
606   case DLL_PROCESS_DETACH:
607     KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
608 
609     if (lpReserved != NULL) {
610       // lpReserved distinguishes the two cases:
611       //   lpReserved == NULL when FreeLibrary() was called,
612       //   lpReserved != NULL when the process terminates.
613       // When FreeLibrary() is called, worker threads remain alive. So they will
614       // release the forkjoin lock by themselves. When the process terminates,
615       // worker threads disappear triggering the problem of unreleased forkjoin
616       // lock as described below.
617 
618       // A worker thread can take the forkjoin lock. The problem comes up if
619       // that worker thread becomes dead before it releases the forkjoin lock.
620       // The forkjoin lock remains taken, while the thread executing
621       // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try
622       // to take the forkjoin lock and will always fail, so that the application
623       // will never finish [normally]. This scenario is possible if
624       // __kmpc_end() has not been executed. This is not just a corner case; it
625       // happens in common situations:
626       // - the main function was compiled by an alternative compiler;
627       // - the main function was compiled by icl but without /Qopenmp
628       //   (application with plugins);
629       // - application terminates by calling C exit(), Fortran CALL EXIT() or
630       //   Fortran STOP.
631       // - alive foreign thread prevented __kmpc_end from doing cleanup.
632       //
633       // This is a hack to work around the problem.
634       // TODO: !!! figure out something better.
635       __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
636     }
637 
638     __kmp_internal_end_library(__kmp_gtid_get_specific());
639 
640     return TRUE;
641 
642   case DLL_THREAD_ATTACH:
643     KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
644 
645     /* if we wanted to register new sibling threads every time, we would call
646      * __kmp_get_gtid() here */
647     return TRUE;
648 
649   case DLL_THREAD_DETACH:
650     KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
651 
652     __kmp_internal_end_thread(__kmp_gtid_get_specific());
653     return TRUE;
654   }
655 
656   return TRUE;
657 }
658 
659 #endif /* KMP_OS_WINDOWS */
660 #endif /* KMP_DYNAMIC_LIB */
661 
662 /* Change the library type to "status" and return the old type */
663 /* called from within initialization routines where __kmp_initz_lock is held */
664 int __kmp_change_library(int status) {
665   int old_status;
666 
667   old_status = __kmp_yield_init &
668                1; // check whether KMP_LIBRARY=throughput (even init count)
669 
670   if (status) {
671     __kmp_yield_init |= 1; // throughput => turnaround (odd init count)
672   } else {
673     __kmp_yield_init &= ~1; // turnaround => throughput (even init count)
674   }
675 
676   return old_status; // return previous setting of whether
677   // KMP_LIBRARY=throughput
678 }
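
/* Illustrative usage sketch (hypothetical caller, based only on the comments
   above): save the current mode, force turnaround behavior for a region, then
   restore it:

     int prev = __kmp_change_library(1); // throughput => turnaround
     ... run a latency-sensitive region ...
     __kmp_change_library(prev); // restore the previous library mode
*/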
679 
680 /* __kmp_parallel_deo -- Wait until it's our turn. */
681 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
682   int gtid = *gtid_ref;
683 #ifdef BUILD_PARALLEL_ORDERED
684   kmp_team_t *team = __kmp_team_from_gtid(gtid);
685 #endif /* BUILD_PARALLEL_ORDERED */
686 
687   if (__kmp_env_consistency_check) {
688     if (__kmp_threads[gtid]->th.th_root->r.r_active)
689 #if KMP_USE_DYNAMIC_LOCK
690       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
691 #else
692       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
693 #endif
694   }
695 #ifdef BUILD_PARALLEL_ORDERED
696   if (!team->t.t_serialized) {
697     KMP_MB();
698     KMP_WAIT_YIELD(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid),
699                    KMP_EQ, NULL);
700     KMP_MB();
701   }
702 #endif /* BUILD_PARALLEL_ORDERED */
703 }
704 
705 /* __kmp_parallel_dxo -- Signal the next task. */
706 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
707   int gtid = *gtid_ref;
708 #ifdef BUILD_PARALLEL_ORDERED
709   int tid = __kmp_tid_from_gtid(gtid);
710   kmp_team_t *team = __kmp_team_from_gtid(gtid);
711 #endif /* BUILD_PARALLEL_ORDERED */
712 
713   if (__kmp_env_consistency_check) {
714     if (__kmp_threads[gtid]->th.th_root->r.r_active)
715       __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
716   }
717 #ifdef BUILD_PARALLEL_ORDERED
718   if (!team->t.t_serialized) {
719     KMP_MB(); /* Flush all pending memory write invalidates.  */
720 
721     /* use the tid of the next thread in this team */
722     /* TODO replace with general release procedure */
723     team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
724 
725     KMP_MB(); /* Flush all pending memory write invalidates.  */
726   }
727 #endif /* BUILD_PARALLEL_ORDERED */
728 }
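
/* Taken together, __kmp_parallel_deo() and __kmp_parallel_dxo() implement the
   ordered-section handshake: deo spins until t_ordered.dt.t_value equals this
   thread's tid, and dxo passes the turn on by storing (tid + 1) % t_nproc. */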
729 
730 /* ------------------------------------------------------------------------ */
731 /* The BARRIER for a SINGLE process section is always explicit   */
732 
733 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
734   int status;
735   kmp_info_t *th;
736   kmp_team_t *team;
737 
738   if (!TCR_4(__kmp_init_parallel))
739     __kmp_parallel_initialize();
740 
741   th = __kmp_threads[gtid];
742   team = th->th.th_team;
743   status = 0;
744 
745   th->th.th_ident = id_ref;
746 
747   if (team->t.t_serialized) {
748     status = 1;
749   } else {
750     kmp_int32 old_this = th->th.th_local.this_construct;
751 
752     ++th->th.th_local.this_construct;
753     /* try to set team count to thread count--success means thread got the
754        single block */
755     /* TODO: Should this be acquire or release? */
756     if (team->t.t_construct == old_this) {
757       status = KMP_COMPARE_AND_STORE_ACQ32(&team->t.t_construct, old_this,
758                                            th->th.th_local.this_construct);
759     }
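    /* If the compare-and-store succeeded, this thread claimed the single
       region; otherwise another thread in the team already advanced
       t_construct and owns it. */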
760 #if USE_ITT_BUILD
761     if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
762         KMP_MASTER_GTID(gtid) &&
763 #if OMP_40_ENABLED
764         th->th.th_teams_microtask == NULL &&
765 #endif
766         team->t.t_active_level ==
767             1) { // Only report metadata by master of active team at level 1
768       __kmp_itt_metadata_single(id_ref);
769     }
770 #endif /* USE_ITT_BUILD */
771   }
772 
773   if (__kmp_env_consistency_check) {
774     if (status && push_ws) {
775       __kmp_push_workshare(gtid, ct_psingle, id_ref);
776     } else {
777       __kmp_check_workshare(gtid, ct_psingle, id_ref);
778     }
779   }
780 #if USE_ITT_BUILD
781   if (status) {
782     __kmp_itt_single_start(gtid);
783   }
784 #endif /* USE_ITT_BUILD */
785   return status;
786 }
787 
788 void __kmp_exit_single(int gtid) {
789 #if USE_ITT_BUILD
790   __kmp_itt_single_end(gtid);
791 #endif /* USE_ITT_BUILD */
792   if (__kmp_env_consistency_check)
793     __kmp_pop_workshare(gtid, ct_psingle, NULL);
794 }
795 
796 /* Determine whether we can go parallel or must serialize the parallel region,
797  * and how many threads we can use.
798  * set_nthreads is the number of threads requested for the team.
799  * Returns 1 if we should serialize or use only one thread,
800  * otherwise the number of threads to use.
801  * The forkjoin lock is held by the caller. */
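/* The requested thread count is clamped in stages below: first (when dyn-var is
 * set) by the dynamic adjustment mode (load balance, thread limit, or random),
 * then by KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT, then by OMP_THREAD_LIMIT,
 * and finally by the remaining capacity of the __kmp_threads array. */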
802 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
803                                  int master_tid, int set_nthreads
804 #if OMP_40_ENABLED
805                                  ,
806                                  int enter_teams
807 #endif /* OMP_40_ENABLED */
808                                  ) {
809   int capacity;
810   int new_nthreads;
811   KMP_DEBUG_ASSERT(__kmp_init_serial);
812   KMP_DEBUG_ASSERT(root && parent_team);
813 
814   // If dyn-var is set, dynamically adjust the number of desired threads,
815   // according to the method specified by dynamic_mode.
816   new_nthreads = set_nthreads;
817   if (!get__dynamic_2(parent_team, master_tid)) {
818     ;
819   }
820 #ifdef USE_LOAD_BALANCE
821   else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
822     new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
823     if (new_nthreads == 1) {
824       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
825                     "reservation to 1 thread\n",
826                     master_tid));
827       return 1;
828     }
829     if (new_nthreads < set_nthreads) {
830       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
831                     "reservation to %d threads\n",
832                     master_tid, new_nthreads));
833     }
834   }
835 #endif /* USE_LOAD_BALANCE */
836   else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
837     new_nthreads = __kmp_avail_proc - __kmp_nth +
838                    (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
839     if (new_nthreads <= 1) {
840       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
841                     "reservation to 1 thread\n",
842                     master_tid));
843       return 1;
844     }
845     if (new_nthreads < set_nthreads) {
846       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
847                     "reservation to %d threads\n",
848                     master_tid, new_nthreads));
849     } else {
850       new_nthreads = set_nthreads;
851     }
852   } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
853     if (set_nthreads > 2) {
854       new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
855       new_nthreads = (new_nthreads % set_nthreads) + 1;
856       if (new_nthreads == 1) {
857         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
858                       "reservation to 1 thread\n",
859                       master_tid));
860         return 1;
861       }
862       if (new_nthreads < set_nthreads) {
863         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
864                       "reservation to %d threads\n",
865                       master_tid, new_nthreads));
866       }
867     }
868   } else {
869     KMP_ASSERT(0);
870   }
871 
872   // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
873   if (__kmp_nth + new_nthreads -
874           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
875       __kmp_max_nth) {
876     int tl_nthreads = __kmp_max_nth - __kmp_nth +
877                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
878     if (tl_nthreads <= 0) {
879       tl_nthreads = 1;
880     }
881 
882     // If dyn-var is false, emit a 1-time warning.
883     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
884       __kmp_reserve_warn = 1;
885       __kmp_msg(kmp_ms_warning,
886                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
887                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
888     }
889     if (tl_nthreads == 1) {
890       KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
891                     "reduced reservation to 1 thread\n",
892                     master_tid));
893       return 1;
894     }
895     KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
896                   "reservation to %d threads\n",
897                   master_tid, tl_nthreads));
898     new_nthreads = tl_nthreads;
899   }
900 
901   // Respect OMP_THREAD_LIMIT
902   if (root->r.r_cg_nthreads + new_nthreads -
903           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
904       __kmp_cg_max_nth) {
905     int tl_nthreads = __kmp_cg_max_nth - root->r.r_cg_nthreads +
906                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
907     if (tl_nthreads <= 0) {
908       tl_nthreads = 1;
909     }
910 
911     // If dyn-var is false, emit a 1-time warning.
912     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
913       __kmp_reserve_warn = 1;
914       __kmp_msg(kmp_ms_warning,
915                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
916                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
917     }
918     if (tl_nthreads == 1) {
919       KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
920                     "reduced reservation to 1 thread\n",
921                     master_tid));
922       return 1;
923     }
924     KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
925                   "reservation to %d threads\n",
926                   master_tid, tl_nthreads));
927     new_nthreads = tl_nthreads;
928   }
929 
930   // Check if the threads array is large enough, or needs expanding.
931   // See comment in __kmp_register_root() about the adjustment if
932   // __kmp_threads[0] == NULL.
933   capacity = __kmp_threads_capacity;
934   if (TCR_PTR(__kmp_threads[0]) == NULL) {
935     --capacity;
936   }
937   if (__kmp_nth + new_nthreads -
938           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
939       capacity) {
940     // Expand the threads array.
941     int slotsRequired = __kmp_nth + new_nthreads -
942                         (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
943                         capacity;
944     int slotsAdded = __kmp_expand_threads(slotsRequired, slotsRequired);
945     if (slotsAdded < slotsRequired) {
946       // The threads array was not expanded enough.
947       new_nthreads -= (slotsRequired - slotsAdded);
948       KMP_ASSERT(new_nthreads >= 1);
949 
950       // If dyn-var is false, emit a 1-time warning.
951       if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
952         __kmp_reserve_warn = 1;
953         if (__kmp_tp_cached) {
954           __kmp_msg(kmp_ms_warning,
955                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
956                     KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
957                     KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
958         } else {
959           __kmp_msg(kmp_ms_warning,
960                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
961                     KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
962         }
963       }
964     }
965   }
966 
967 #ifdef KMP_DEBUG
968   if (new_nthreads == 1) {
969     KC_TRACE(10,
970              ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
971               "dead roots and rechecking; requested %d threads\n",
972               __kmp_get_gtid(), set_nthreads));
973   } else {
974     KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
975                   " %d threads\n",
976                   __kmp_get_gtid(), new_nthreads, set_nthreads));
977   }
978 #endif // KMP_DEBUG
979   return new_nthreads;
980 }
981 
982 /* Allocate threads from the thread pool and assign them to the new team. We are
983    assured that there are enough threads available, because we checked on that
984    earlier while holding the forkjoin lock. */
985 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
986                                     kmp_info_t *master_th, int master_gtid) {
987   int i;
988   int use_hot_team;
989 
990   KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
991   KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
992   KMP_MB();
993 
994   /* first, let's setup the master thread */
995   master_th->th.th_info.ds.ds_tid = 0;
996   master_th->th.th_team = team;
997   master_th->th.th_team_nproc = team->t.t_nproc;
998   master_th->th.th_team_master = master_th;
999   master_th->th.th_team_serialized = FALSE;
1000   master_th->th.th_dispatch = &team->t.t_dispatch[0];
1001 
1002 /* make sure we are not the optimized hot team */
1003 #if KMP_NESTED_HOT_TEAMS
1004   use_hot_team = 0;
1005   kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
1006   if (hot_teams) { // hot teams array is not allocated if
1007     // KMP_HOT_TEAMS_MAX_LEVEL=0
1008     int level = team->t.t_active_level - 1; // index in array of hot teams
1009     if (master_th->th.th_teams_microtask) { // are we inside the teams?
1010       if (master_th->th.th_teams_size.nteams > 1) {
1011         ++level; // level was not increased in teams construct for
1012         // team_of_masters
1013       }
1014       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1015           master_th->th.th_teams_level == team->t.t_level) {
1016         ++level; // level was not increased in teams construct for
1017         // team_of_workers before the parallel
1018       } // team->t.t_level will be increased inside parallel
1019     }
1020     if (level < __kmp_hot_teams_max_level) {
1021       if (hot_teams[level].hot_team) {
1022         // hot team has already been allocated for given level
1023         KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
1024         use_hot_team = 1; // the team is ready to use
1025       } else {
1026         use_hot_team = 0; // AC: threads are not allocated yet
1027         hot_teams[level].hot_team = team; // remember new hot team
1028         hot_teams[level].hot_team_nth = team->t.t_nproc;
1029       }
1030     } else {
1031       use_hot_team = 0;
1032     }
1033   }
1034 #else
1035   use_hot_team = team == root->r.r_hot_team;
1036 #endif
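  /* At this point use_hot_team is nonzero only when 'team' is an
     already-populated hot team whose workers can simply be reused; otherwise
     the master is installed and the workers are allocated below. */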
1037   if (!use_hot_team) {
1038 
1039     /* install the master thread */
1040     team->t.t_threads[0] = master_th;
1041     __kmp_initialize_info(master_th, team, 0, master_gtid);
1042 
1043     /* now, install the worker threads */
1044     for (i = 1; i < team->t.t_nproc; i++) {
1045 
1046       /* fork or reallocate a new thread and install it in team */
1047       kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1048       team->t.t_threads[i] = thr;
1049       KMP_DEBUG_ASSERT(thr);
1050       KMP_DEBUG_ASSERT(thr->th.th_team == team);
1051       /* align team and thread arrived states */
1052       KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1053                     "T#%d(%d:%d) join =%llu, plain=%llu\n",
1054                     __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1055                     __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1056                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1057                     team->t.t_bar[bs_plain_barrier].b_arrived));
1058 #if OMP_40_ENABLED
1059       thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1060       thr->th.th_teams_level = master_th->th.th_teams_level;
1061       thr->th.th_teams_size = master_th->th.th_teams_size;
1062 #endif
1063       { // Initialize threads' barrier data.
1064         int b;
1065         kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1066         for (b = 0; b < bs_last_barrier; ++b) {
1067           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1068           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1069 #if USE_DEBUGGER
1070           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1071 #endif
1072         }
1073       }
1074     }
1075 
1076 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1077     __kmp_partition_places(team);
1078 #endif
1079   }
1080 
1081   KMP_MB();
1082 }
1083 
1084 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1085 // Propagate any changes to the floating point control registers out to the
1086 // team. We try to avoid unnecessary writes to the relevant cache line in the
1087 // team structure, so we don't make changes unless they are needed.
1088 inline static void propagateFPControl(kmp_team_t *team) {
1089   if (__kmp_inherit_fp_control) {
1090     kmp_int16 x87_fpu_control_word;
1091     kmp_uint32 mxcsr;
1092 
1093     // Get master values of FPU control flags (both X87 and vector)
1094     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1095     __kmp_store_mxcsr(&mxcsr);
1096     mxcsr &= KMP_X86_MXCSR_MASK;
1097 
1098     // There is no point looking at t_fp_control_saved here.
1099     // If it is TRUE, we still have to update the values if they are different
1100     // from those we now have. If it is FALSE we didn't save anything yet, but
1101     // our objective is the same. We have to ensure that the values in the team
1102     // are the same as those we have.
1103     // So, this code achieves what we need whether or not t_fp_control_saved is
1104     // true. By checking whether the value needs updating we avoid unnecessary
1105     // writes that would put the cache-line into a written state, causing all
1106     // threads in the team to have to read it again.
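    // KMP_CHECK_UPDATE(dst, src) is essentially "if (dst != src) dst = src;"
    // (defined elsewhere in the runtime headers), which is what keeps the
    // cache line clean when nothing has changed.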
1107     KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1108     KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1109     // Although we don't use this value, other code in the runtime wants to know
1110     // whether it should restore them. So we must ensure it is correct.
1111     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1112   } else {
1113     // Similarly here. Don't write to this cache-line in the team structure
1114     // unless we have to.
1115     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1116   }
1117 }
1118 
1119 // Do the opposite, setting the hardware registers to the updated values from
1120 // the team.
1121 inline static void updateHWFPControl(kmp_team_t *team) {
1122   if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1123     // Only reset the fp control regs if they have been changed in the team by
1124     // the parallel region that we are exiting.
1125     kmp_int16 x87_fpu_control_word;
1126     kmp_uint32 mxcsr;
1127     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1128     __kmp_store_mxcsr(&mxcsr);
1129     mxcsr &= KMP_X86_MXCSR_MASK;
1130 
1131     if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1132       __kmp_clear_x87_fpu_status_word();
1133       __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1134     }
1135 
1136     if (team->t.t_mxcsr != mxcsr) {
1137       __kmp_load_mxcsr(&team->t.t_mxcsr);
1138     }
1139   }
1140 }
1141 #else
1142 #define propagateFPControl(x) ((void)0)
1143 #define updateHWFPControl(x) ((void)0)
1144 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1145 
1146 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1147                                      int realloc); // forward declaration
1148 
1149 /* Run a parallel region that has been serialized, so it runs only in a team of
1150    the single master thread. */
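/* A thread caches one serial team in th_serial_team and reuses it; nested
   serialized regions simply increment serial_team->t.t_serialized and push
   another dispatch buffer (see the else branch below). */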
1151 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1152   kmp_info_t *this_thr;
1153   kmp_team_t *serial_team;
1154 
1155   KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1156 
1157   /* Skip all this code for autopar serialized loops since it results in
1158      unacceptable overhead */
1159   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1160     return;
1161 
1162   if (!TCR_4(__kmp_init_parallel))
1163     __kmp_parallel_initialize();
1164 
1165   this_thr = __kmp_threads[global_tid];
1166   serial_team = this_thr->th.th_serial_team;
1167 
1168   /* utilize the serialized team held by this thread */
1169   KMP_DEBUG_ASSERT(serial_team);
1170   KMP_MB();
1171 
1172   if (__kmp_tasking_mode != tskm_immediate_exec) {
1173     KMP_DEBUG_ASSERT(
1174         this_thr->th.th_task_team ==
1175         this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1176     KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1177                      NULL);
1178     KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1179                   "team %p, new task_team = NULL\n",
1180                   global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1181     this_thr->th.th_task_team = NULL;
1182   }
1183 
1184 #if OMP_40_ENABLED
1185   kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1186   if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1187     proc_bind = proc_bind_false;
1188   } else if (proc_bind == proc_bind_default) {
1189     // No proc_bind clause was specified, so use the current value
1190     // of proc-bind-var for this parallel region.
1191     proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1192   }
1193   // Reset for next parallel region
1194   this_thr->th.th_set_proc_bind = proc_bind_default;
1195 #endif /* OMP_40_ENABLED */
1196 
1197 #if OMPT_SUPPORT
1198   ompt_data_t ompt_parallel_data;
1199   ompt_parallel_data.ptr = NULL;
1200   ompt_data_t *implicit_task_data;
1201   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1202   if (ompt_enabled.enabled &&
1203       this_thr->th.ompt_thread_info.state != omp_state_overhead) {
1204 
1205     ompt_task_info_t *parent_task_info;
1206     parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1207 
1208     parent_task_info->frame.reenter_runtime_frame = OMPT_GET_FRAME_ADDRESS(1);
1209     if (ompt_enabled.ompt_callback_parallel_begin) {
1210       int team_size = 1;
1211 
1212       ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1213           &(parent_task_info->task_data), &(parent_task_info->frame),
1214           &ompt_parallel_data, team_size, ompt_invoker_program, codeptr);
1215     }
1216   }
1217 #endif // OMPT_SUPPORT
1218 
1219   if (this_thr->th.th_team != serial_team) {
1220     // Nested level will be an index in the nested nthreads array
1221     int level = this_thr->th.th_team->t.t_level;
1222 
1223     if (serial_team->t.t_serialized) {
1224       /* this serial team was already used
1225          TODO: increase performance by making these locks more specific */
1226       kmp_team_t *new_team;
1227 
1228       __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1229 
1230       new_team = __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1231 #if OMPT_SUPPORT
1232                                      ompt_parallel_data,
1233 #endif
1234 #if OMP_40_ENABLED
1235                                      proc_bind,
1236 #endif
1237                                      &this_thr->th.th_current_task->td_icvs,
1238                                      0 USE_NESTED_HOT_ARG(NULL));
1239       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1240       KMP_ASSERT(new_team);
1241 
1242       /* setup new serialized team and install it */
1243       new_team->t.t_threads[0] = this_thr;
1244       new_team->t.t_parent = this_thr->th.th_team;
1245       serial_team = new_team;
1246       this_thr->th.th_serial_team = serial_team;
1247 
1248       KF_TRACE(
1249           10,
1250           ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1251            global_tid, serial_team));
1252 
1253       /* TODO the above breaks the requirement that if we run out of resources,
1254          then we can still guarantee that serialized teams are ok, since we may
1255          need to allocate a new one */
1256     } else {
1257       KF_TRACE(
1258           10,
1259           ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1260            global_tid, serial_team));
1261     }
1262 
1263     /* we have to initialize this serial team */
1264     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1265     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1266     KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1267     serial_team->t.t_ident = loc;
1268     serial_team->t.t_serialized = 1;
1269     serial_team->t.t_nproc = 1;
1270     serial_team->t.t_parent = this_thr->th.th_team;
1271     serial_team->t.t_sched = this_thr->th.th_team->t.t_sched;
1272     this_thr->th.th_team = serial_team;
1273     serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1274 
1275     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1276                   this_thr->th.th_current_task));
1277     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1278     this_thr->th.th_current_task->td_flags.executing = 0;
1279 
1280     __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1281 
1282     /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1283        implicit task for each serialized task represented by
1284        team->t.t_serialized? */
1285     copy_icvs(&this_thr->th.th_current_task->td_icvs,
1286               &this_thr->th.th_current_task->td_parent->td_icvs);
1287 
1288     // Thread value exists in the nested nthreads array for the next nested
1289     // level
1290     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1291       this_thr->th.th_current_task->td_icvs.nproc =
1292           __kmp_nested_nth.nth[level + 1];
1293     }
1294 
1295 #if OMP_40_ENABLED
1296     if (__kmp_nested_proc_bind.used &&
1297         (level + 1 < __kmp_nested_proc_bind.used)) {
1298       this_thr->th.th_current_task->td_icvs.proc_bind =
1299           __kmp_nested_proc_bind.bind_types[level + 1];
1300     }
1301 #endif /* OMP_40_ENABLED */
1302 
1303 #if USE_DEBUGGER
1304     serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1305 #endif
1306     this_thr->th.th_info.ds.ds_tid = 0;
1307 
1308     /* set thread cache values */
1309     this_thr->th.th_team_nproc = 1;
1310     this_thr->th.th_team_master = this_thr;
1311     this_thr->th.th_team_serialized = 1;
1312 
1313     serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1314     serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1315 
1316     propagateFPControl(serial_team);
1317 
1318     /* check if we need to allocate dispatch buffers stack */
1319     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1320     if (!serial_team->t.t_dispatch->th_disp_buffer) {
1321       serial_team->t.t_dispatch->th_disp_buffer =
1322           (dispatch_private_info_t *)__kmp_allocate(
1323               sizeof(dispatch_private_info_t));
1324     }
1325     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1326 
1327     KMP_MB();
1328 
1329   } else {
1330     /* this serialized team is already being used,
1331      * that's fine, just add another nested level */
1332     KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1333     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1334     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1335     ++serial_team->t.t_serialized;
1336     this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1337 
1338     // Nested level will be an index in the nested nthreads array
1339     int level = this_thr->th.th_team->t.t_level;
1340     // Thread value exists in the nested nthreads array for the next nested
1341     // level
1342     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1343       this_thr->th.th_current_task->td_icvs.nproc =
1344           __kmp_nested_nth.nth[level + 1];
1345     }
1346     serial_team->t.t_level++;
1347     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1348                   "of serial team %p to %d\n",
1349                   global_tid, serial_team, serial_team->t.t_level));
1350 
1351     /* allocate/push dispatch buffers stack */
1352     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1353     {
1354       dispatch_private_info_t *disp_buffer =
1355           (dispatch_private_info_t *)__kmp_allocate(
1356               sizeof(dispatch_private_info_t));
1357       disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1358       serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1359     }
1360     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1361 
1362     KMP_MB();
1363   }
1364 #if OMP_40_ENABLED
1365   KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1366 #endif
1367 
1368   if (__kmp_env_consistency_check)
1369     __kmp_push_parallel(global_tid, NULL);
1370 #if OMPT_SUPPORT
1371   serial_team->t.ompt_team_info.master_return_address = codeptr;
1372   if (ompt_enabled.enabled &&
1373       this_thr->th.ompt_thread_info.state != omp_state_overhead) {
1374     OMPT_CUR_TASK_INFO(this_thr)
1375         ->frame.exit_runtime_frame = OMPT_GET_FRAME_ADDRESS(1);
1376 
1377     ompt_lw_taskteam_t lw_taskteam;
1378     __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1379                             &ompt_parallel_data, codeptr);
1380 
1381     __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1382     // don't use lw_taskteam after linking. Its content was swapped.
1383 
1384     /* OMPT implicit task begin */
1385     implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
1386     if (ompt_enabled.ompt_callback_implicit_task) {
1387       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1388           ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1389           OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid));
1390     }
1391 
1392     /* OMPT state */
1393     this_thr->th.ompt_thread_info.state = omp_state_work_parallel;
1394     OMPT_CUR_TASK_INFO(this_thr)
1395         ->frame.exit_runtime_frame = OMPT_GET_FRAME_ADDRESS(1);
1396   }
1397 #endif
1398 }
1399 
1400 /* most of the work for a fork */
1401 /* return true if we really went parallel, false if serialized */
1402 int __kmp_fork_call(ident_t *loc, int gtid,
1403                     enum fork_context_e call_context, // Intel, GNU, ...
1404                     kmp_int32 argc, microtask_t microtask, launch_t invoker,
1405 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1406 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1407                     va_list *ap
1408 #else
1409                     va_list ap
1410 #endif
1411                     ) {
1412   void **argv;
1413   int i;
1414   int master_tid;
1415   int master_this_cons;
1416   kmp_team_t *team;
1417   kmp_team_t *parent_team;
1418   kmp_info_t *master_th;
1419   kmp_root_t *root;
1420   int nthreads;
1421   int master_active;
1422   int master_set_numthreads;
1423   int level;
1424 #if OMP_40_ENABLED
1425   int active_level;
1426   int teams_level;
1427 #endif
1428 #if KMP_NESTED_HOT_TEAMS
1429   kmp_hot_team_ptr_t **p_hot_teams;
1430 #endif
1431   { // KMP_TIME_BLOCK
1432     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1433     KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1434 
1435     KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1436     if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1437       /* Some systems prefer the stack for the root thread(s) to start with */
1438       /* some gap from the parent stack to prevent false sharing. */
1439       void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1440       /* These 2 lines below are so this does not get optimized out */
1441       if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1442         __kmp_stkpadding += (short)((kmp_int64)dummy);
1443     }
1444 
1445     /* initialize if needed */
1446     KMP_DEBUG_ASSERT(
1447         __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1448     if (!TCR_4(__kmp_init_parallel))
1449       __kmp_parallel_initialize();
1450 
1451     /* setup current data */
1452     master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1453     // shutdown
1454     parent_team = master_th->th.th_team;
1455     master_tid = master_th->th.th_info.ds.ds_tid;
1456     master_this_cons = master_th->th.th_local.this_construct;
1457     root = master_th->th.th_root;
1458     master_active = root->r.r_active;
1459     master_set_numthreads = master_th->th.th_set_nproc;
1460 
1461 #if OMPT_SUPPORT
1462     ompt_data_t ompt_parallel_data;
1463     ompt_parallel_data.ptr = NULL;
1464     ompt_data_t *parent_task_data;
1465     ompt_frame_t *ompt_frame;
1466     ompt_data_t *implicit_task_data;
1467     void *return_address = NULL;
1468 
1469     if (ompt_enabled.enabled) {
1470       __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1471                                     NULL, NULL);
1472       return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1473     }
1474 #endif
1475 
1476     // Nested level will be an index in the nested nthreads array
1477     level = parent_team->t.t_level;
1478     // used to launch non-serialized teams even if nesting is not allowed
1479     active_level = parent_team->t.t_active_level;
1480 #if OMP_40_ENABLED
1481     // needed to check nesting inside the teams
1482     teams_level = master_th->th.th_teams_level;
1483 #endif
1484 #if KMP_NESTED_HOT_TEAMS
1485     p_hot_teams = &master_th->th.th_hot_teams;
1486     if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1487       *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1488           sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1489       (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1490       // it is either the actual hot team or not needed (when active_level > 0)
1491       (*p_hot_teams)[0].hot_team_nth = 1;
1492     }
1493 #endif
1494 
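    // Notify a registered OMPT tool that a parallel region is beginning,
    // before any team setup.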
1495 #if OMPT_SUPPORT
1496     if (ompt_enabled.enabled) {
1497       if (ompt_enabled.ompt_callback_parallel_begin) {
1498         int team_size = master_set_numthreads
1499                             ? master_set_numthreads
1500                             : get__nproc_2(parent_team, master_tid);
1501         ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1502             parent_task_data, ompt_frame, &ompt_parallel_data, team_size,
1503             OMPT_INVOKER(call_context), return_address);
1504       }
1505       master_th->th.ompt_thread_info.state = omp_state_overhead;
1506     }
1507 #endif
1508 
1509     master_th->th.th_ident = loc;
1510 
1511 #if OMP_40_ENABLED
1512     if (master_th->th.th_teams_microtask && ap &&
1513         microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1514       // AC: This is the start of a parallel that is nested inside a teams construct.
1515       // The team is actual (hot); all workers are ready at the fork barrier.
1516       // No lock needed for minimal team initialization; then release the workers.
1517       parent_team->t.t_ident = loc;
1518       __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1519       parent_team->t.t_argc = argc;
1520       argv = (void **)parent_team->t.t_argv;
1521       for (i = argc - 1; i >= 0; --i)
1522 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1523 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1524         *argv++ = va_arg(*ap, void *);
1525 #else
1526         *argv++ = va_arg(ap, void *);
1527 #endif
1528       // Increment our nested depth level, but do not increase the serialization count
1529       if (parent_team == master_th->th.th_serial_team) {
1530         // AC: we are in serialized parallel
1531         __kmpc_serialized_parallel(loc, gtid);
1532         KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1533         // AC: need this in order for enquiry functions to work
1534         // correctly; will restore at join time
1535         parent_team->t.t_serialized--;
1536 #if OMPT_SUPPORT
1537         void *dummy;
1538         void **exit_runtime_p;
1539 
1540         ompt_lw_taskteam_t lw_taskteam;
1541 
1542         if (ompt_enabled.enabled) {
1543           __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1544                                   &ompt_parallel_data, return_address);
1545           exit_runtime_p =
1546               &(lw_taskteam.ompt_task_info.frame.exit_runtime_frame);
1547 
1548           __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1549           // don't use lw_taskteam after linking. content was swapped
1550 
1551           /* OMPT implicit task begin */
1552           implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1553           if (ompt_enabled.ompt_callback_implicit_task) {
1554             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1555                 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1556                 implicit_task_data, 1, __kmp_tid_from_gtid(gtid));
1557           }
1558 
1559           /* OMPT state */
1560           master_th->th.ompt_thread_info.state = omp_state_work_parallel;
1561         } else {
1562           exit_runtime_p = &dummy;
1563         }
1564 #endif
1565 
1566         {
1567           KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1568           KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1569           __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1570 #if OMPT_SUPPORT
1571                                  ,
1572                                  exit_runtime_p
1573 #endif
1574                                  );
1575         }
1576 
1577 #if OMPT_SUPPORT
1578         *exit_runtime_p = NULL;
1579         if (ompt_enabled.enabled) {
1580           OMPT_CUR_TASK_INFO(master_th)->frame.exit_runtime_frame = NULL;
1581           if (ompt_enabled.ompt_callback_implicit_task) {
1582             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1583                 ompt_scope_end, NULL, implicit_task_data, 1,
1584                 __kmp_tid_from_gtid(gtid));
1585           }
1586           __ompt_lw_taskteam_unlink(master_th);
1587 
1588           if (ompt_enabled.ompt_callback_parallel_end) {
1589             ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1590                 OMPT_CUR_TEAM_DATA(master_th), OMPT_CUR_TASK_DATA(master_th),
1591                 OMPT_INVOKER(call_context), return_address);
1592           }
1593           master_th->th.ompt_thread_info.state = omp_state_overhead;
1594         }
1595 #endif
1596         return TRUE;
1597       }
1598 
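      // Non-serialized case: reuse the existing hot parent team to run this
      // parallel nested inside the teams construct.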
1599       parent_team->t.t_pkfn = microtask;
1600       parent_team->t.t_invoke = invoker;
1601       KMP_TEST_THEN_INC32((kmp_int32 *)&root->r.r_in_parallel);
1602       parent_team->t.t_active_level++;
1603       parent_team->t.t_level++;
1604 
1605       /* Change number of threads in the team if requested */
1606       if (master_set_numthreads) { // The parallel has num_threads clause
1607         if (master_set_numthreads < master_th->th.th_teams_size.nth) {
1608           // AC: can only reduce the number of threads dynamically; cannot increase it
1609           kmp_info_t **other_threads = parent_team->t.t_threads;
1610           parent_team->t.t_nproc = master_set_numthreads;
1611           for (i = 0; i < master_set_numthreads; ++i) {
1612             other_threads[i]->th.th_team_nproc = master_set_numthreads;
1613           }
1614           // Keep extra threads hot in the team for possible next parallels
1615         }
1616         master_th->th.th_set_nproc = 0;
1617       }
1618 
1619 #if USE_DEBUGGER
1620       if (__kmp_debugging) { // Let debugger override number of threads.
1621         int nth = __kmp_omp_num_threads(loc);
1622         if (nth > 0) { // 0 means debugger doesn't want to change num threads
1623           master_set_numthreads = nth;
1624         }
1625       }
1626 #endif
1627 
1628       KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1629                     "master_th=%p, gtid=%d\n",
1630                     root, parent_team, master_th, gtid));
1631       __kmp_internal_fork(loc, gtid, parent_team);
1632       KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1633                     "master_th=%p, gtid=%d\n",
1634                     root, parent_team, master_th, gtid));
1635 
1636       /* Invoke microtask for MASTER thread */
1637       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1638                     parent_team->t.t_id, parent_team->t.t_pkfn));
1639 
1640       {
1641         KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1642         KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1643         if (!parent_team->t.t_invoke(gtid)) {
1644           KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
1645         }
1646       }
1647       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1648                     parent_team->t.t_id, parent_team->t.t_pkfn));
1649       KMP_MB(); /* Flush all pending memory write invalidates.  */
1650 
1651       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1652 
1653       return TRUE;
1654     } // Parallel closely nested in teams construct
1655 #endif /* OMP_40_ENABLED */
1656 
1657 #if KMP_DEBUG
1658     if (__kmp_tasking_mode != tskm_immediate_exec) {
1659       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1660                        parent_team->t.t_task_team[master_th->th.th_task_state]);
1661     }
1662 #endif
1663 
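    // Determine the number of threads for the new team; serialize
    // (nthreads = 1) once the maximum active nesting level has been reached.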
1664     if (parent_team->t.t_active_level >=
1665         master_th->th.th_current_task->td_icvs.max_active_levels) {
1666       nthreads = 1;
1667     } else {
1668 #if OMP_40_ENABLED
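      // enter_teams: this fork is associated with a teams construct; see the
      // comment at the __kmp_reserve_threads() call below.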
1669       int enter_teams = ((ap == NULL && active_level == 0) ||
1670                          (ap && teams_level > 0 && teams_level == level));
1671 #endif
1672       nthreads =
1673           master_set_numthreads
1674               ? master_set_numthreads
1675               : get__nproc_2(
1676                     parent_team,
1677                     master_tid); // TODO: get nproc directly from current task
1678 
1679       // Check if we need to take the forkjoin lock (no need for a serialized
1680       // parallel outside of a teams construct). This code was moved here from
1681       // __kmp_reserve_threads() to speed up nested serialized parallels.
1682       if (nthreads > 1) {
1683         if ((!get__nested(master_th) && (root->r.r_in_parallel
1684 #if OMP_40_ENABLED
1685                                          && !enter_teams
1686 #endif /* OMP_40_ENABLED */
1687                                          )) ||
1688             (__kmp_library == library_serial)) {
1689           KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1690                         " threads\n",
1691                         gtid, nthreads));
1692           nthreads = 1;
1693         }
1694       }
1695       if (nthreads > 1) {
1696         /* determine how many new threads we can use */
1697         __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1698         nthreads = __kmp_reserve_threads(
1699             root, parent_team, master_tid, nthreads
1700 #if OMP_40_ENABLED
1701             /* AC: If we execute teams from parallel region (on host), then
1702                teams should be created but each can only have 1 thread if
1703                nesting is disabled. If teams called from serial region, then
1704                teams and their threads should be created regardless of the
1705                nesting setting. */
1706             ,
1707             enter_teams
1708 #endif /* OMP_40_ENABLED */
1709             );
1710         if (nthreads == 1) {
1711           // Free lock for single thread execution here; for multi-thread
1712           // execution it will be freed later after team of threads created
1713           // and initialized
1714           __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1715         }
1716       }
1717     }
1718     KMP_DEBUG_ASSERT(nthreads > 0);
1719 
1720     // If we temporarily changed the set number of threads then restore it now
1721     master_th->th.th_set_nproc = 0;
1722 
1723     /* create a serialized parallel region? */
1724     if (nthreads == 1) {
1725 /* josh todo: hypothetical question: what do we do for OS X*? */
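// Stack-allocate the outgoing argument array: a variable-length array where
// supported, KMP_ALLOCA otherwise.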
1726 #if KMP_OS_LINUX &&                                                            \
1727     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1728       void *args[argc];
1729 #else
1730       void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1731 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1732           KMP_ARCH_AARCH64) */
1733 
1734       KA_TRACE(20,
1735                ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1736 
1737       __kmpc_serialized_parallel(loc, gtid);
1738 
1739       if (call_context == fork_context_intel) {
1740         /* TODO this sucks, use the compiler itself to pass args! :) */
1741         master_th->th.th_serial_team->t.t_ident = loc;
1742 #if OMP_40_ENABLED
1743         if (!ap) {
1744           // revert change made in __kmpc_serialized_parallel()
1745           master_th->th.th_serial_team->t.t_level--;
1746 // Get args from parent team for teams construct
1747 
1748 #if OMPT_SUPPORT
1749           void *dummy;
1750           void **exit_runtime_p;
1751           ompt_task_info_t *task_info;
1752 
1753           ompt_lw_taskteam_t lw_taskteam;
1754 
1755           if (ompt_enabled.enabled) {
1756             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1757                                     &ompt_parallel_data, return_address);
1758 
1759             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1760             // don't use lw_taskteam after linking. content was swapped
1761 
1762             task_info = OMPT_CUR_TASK_INFO(master_th);
1763             exit_runtime_p = &(task_info->frame.exit_runtime_frame);
1764             if (ompt_enabled.ompt_callback_implicit_task) {
1765               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1766                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1767                   &(task_info->task_data), 1, __kmp_tid_from_gtid(gtid));
1768             }
1769 
1770             /* OMPT state */
1771             master_th->th.ompt_thread_info.state = omp_state_work_parallel;
1772           } else {
1773             exit_runtime_p = &dummy;
1774           }
1775 #endif
1776 
1777           {
1778             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1779             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1780             __kmp_invoke_microtask(microtask, gtid, 0, argc,
1781                                    parent_team->t.t_argv
1782 #if OMPT_SUPPORT
1783                                    ,
1784                                    exit_runtime_p
1785 #endif
1786                                    );
1787           }
1788 
1789 #if OMPT_SUPPORT
1790           if (ompt_enabled.enabled) {
1791             exit_runtime_p = NULL;
1792             if (ompt_enabled.ompt_callback_implicit_task) {
1793               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1794                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1795                   __kmp_tid_from_gtid(gtid));
1796             }
1797 
1798             __ompt_lw_taskteam_unlink(master_th);
1799             if (ompt_enabled.ompt_callback_parallel_end) {
1800               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1801                   OMPT_CUR_TEAM_DATA(master_th), parent_task_data,
1802                   OMPT_INVOKER(call_context), return_address);
1803             }
1804             master_th->th.ompt_thread_info.state = omp_state_overhead;
1805           }
1806 #endif
1807         } else if (microtask == (microtask_t)__kmp_teams_master) {
1808           KMP_DEBUG_ASSERT(master_th->th.th_team ==
1809                            master_th->th.th_serial_team);
1810           team = master_th->th.th_team;
1811           // team->t.t_pkfn = microtask;
1812           team->t.t_invoke = invoker;
1813           __kmp_alloc_argv_entries(argc, team, TRUE);
1814           team->t.t_argc = argc;
1815           argv = (void **)team->t.t_argv;
1816           if (ap) {
1817             for (i = argc - 1; i >= 0; --i)
1818 // TODO: revert workaround for Intel(R) 64 tracker #96
1819 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1820               *argv++ = va_arg(*ap, void *);
1821 #else
1822               *argv++ = va_arg(ap, void *);
1823 #endif
1824           } else {
1825             for (i = 0; i < argc; ++i)
1826               // Get args from parent team for teams construct
1827               argv[i] = parent_team->t.t_argv[i];
1828           }
1829           // AC: revert change made in __kmpc_serialized_parallel()
1830           //     because initial code in teams should have level=0
1831           team->t.t_level--;
1832           // AC: call special invoker for outer "parallel" of teams construct
1833           {
1834             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1835             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1836             invoker(gtid);
1837           }
1838         } else {
1839 #endif /* OMP_40_ENABLED */
1840           argv = args;
1841           for (i = argc - 1; i >= 0; --i)
1842 // TODO: revert workaround for Intel(R) 64 tracker #96
1843 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1844             *argv++ = va_arg(*ap, void *);
1845 #else
1846           *argv++ = va_arg(ap, void *);
1847 #endif
1848           KMP_MB();
1849 
1850 #if OMPT_SUPPORT
1851           void *dummy;
1852           void **exit_runtime_p;
1853           ompt_task_info_t *task_info;
1854 
1855           ompt_lw_taskteam_t lw_taskteam;
1856 
1857           if (ompt_enabled.enabled) {
1858             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1859                                     &ompt_parallel_data, return_address);
1860             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1861             // don't use lw_taskteam after linking. content was swapped
1862             task_info = OMPT_CUR_TASK_INFO(master_th);
1863             exit_runtime_p = &(task_info->frame.exit_runtime_frame);
1864 
1865             /* OMPT implicit task begin */
1866             implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1867             if (ompt_enabled.ompt_callback_implicit_task) {
1868               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1869                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1870                   implicit_task_data, 1, __kmp_tid_from_gtid(gtid));
1871             }
1872 
1873             /* OMPT state */
1874             master_th->th.ompt_thread_info.state = omp_state_work_parallel;
1875           } else {
1876             exit_runtime_p = &dummy;
1877           }
1878 #endif
1879 
1880           {
1881             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1882             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1883             __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1884 #if OMPT_SUPPORT
1885                                    ,
1886                                    exit_runtime_p
1887 #endif
1888                                    );
1889           }
1890 
1891 #if OMPT_SUPPORT
1892           if (ompt_enabled.enabled) {
1893             *exit_runtime_p = NULL;
1894             if (ompt_enabled.ompt_callback_implicit_task) {
1895               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1896                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1897                   __kmp_tid_from_gtid(gtid));
1898             }
1899 
1900             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1901             __ompt_lw_taskteam_unlink(master_th);
1902             if (ompt_enabled.ompt_callback_parallel_end) {
1903               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1904                   &ompt_parallel_data, parent_task_data,
1905                   OMPT_INVOKER(call_context), return_address);
1906             }
1907             master_th->th.ompt_thread_info.state = omp_state_overhead;
1908           }
1909 #endif
1910 #if OMP_40_ENABLED
1911         }
1912 #endif /* OMP_40_ENABLED */
1913       } else if (call_context == fork_context_gnu) {
1914 #if OMPT_SUPPORT
1915         ompt_lw_taskteam_t lwt;
1916         __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1917                                 return_address);
1918 
1919         lwt.ompt_task_info.frame.exit_runtime_frame = NULL;
1920         __ompt_lw_taskteam_link(&lwt, master_th, 1);
1921 // don't use lw_taskteam after linking. content was swapped
1922 #endif
1923 
1924         // we were called from GNU native code
1925         KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1926         return FALSE;
1927       } else {
1928         KMP_ASSERT2(call_context < fork_context_last,
1929                     "__kmp_fork_call: unknown fork_context parameter");
1930       }
1931 
1932       KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1933       KMP_MB();
1934       return FALSE;
1935     }
1936 
1937     // GEH: only modify the executing flag when not serialized; the
1938     //      serialized case is handled in __kmpc_serialized_parallel
1939     KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1940                   "curtask=%p, curtask_max_aclevel=%d\n",
1941                   parent_team->t.t_active_level, master_th,
1942                   master_th->th.th_current_task,
1943                   master_th->th.th_current_task->td_icvs.max_active_levels));
1944     // TODO: GEH - cannot do this assertion because root thread not set up as
1945     // executing
1946     // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1947     master_th->th.th_current_task->td_flags.executing = 0;
1948 
1949 #if OMP_40_ENABLED
1950     if (!master_th->th.th_teams_microtask || level > teams_level)
1951 #endif /* OMP_40_ENABLED */
1952     {
1953       /* Increment our nested depth level */
1954       KMP_TEST_THEN_INC32((kmp_int32 *)&root->r.r_in_parallel);
1955     }
1956 
1957     // See if we need to make a copy of the ICVs.
1958     int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1959     if ((level + 1 < __kmp_nested_nth.used) &&
1960         (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1961       nthreads_icv = __kmp_nested_nth.nth[level + 1];
1962     } else {
1963       nthreads_icv = 0; // don't update
1964     }
1965 
1966 #if OMP_40_ENABLED
1967     // Figure out the proc_bind_policy for the new team.
1968     kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1969     kmp_proc_bind_t proc_bind_icv =
1970         proc_bind_default; // proc_bind_default means don't update
1971     if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1972       proc_bind = proc_bind_false;
1973     } else {
1974       if (proc_bind == proc_bind_default) {
1975         // No proc_bind clause specified; use current proc-bind-var for this
1976         // parallel region
1977         proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1978       }
1979       /* else: The proc_bind policy was specified explicitly on parallel clause.
1980          This overrides proc-bind-var for this parallel region, but does not
1981          change proc-bind-var. */
1982       // Figure the value of proc-bind-var for the child threads.
1983       if ((level + 1 < __kmp_nested_proc_bind.used) &&
1984           (__kmp_nested_proc_bind.bind_types[level + 1] !=
1985            master_th->th.th_current_task->td_icvs.proc_bind)) {
1986         proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1987       }
1988     }
1989 
1990     // Reset for next parallel region
1991     master_th->th.th_set_proc_bind = proc_bind_default;
1992 #endif /* OMP_40_ENABLED */
1993 
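    // If either ICV differs from the master's current values, build a private
    // copy of the ICVs for the new team.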
1994     if ((nthreads_icv > 0)
1995 #if OMP_40_ENABLED
1996         || (proc_bind_icv != proc_bind_default)
1997 #endif /* OMP_40_ENABLED */
1998             ) {
1999       kmp_internal_control_t new_icvs;
2000       copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2001       new_icvs.next = NULL;
2002       if (nthreads_icv > 0) {
2003         new_icvs.nproc = nthreads_icv;
2004       }
2005 
2006 #if OMP_40_ENABLED
2007       if (proc_bind_icv != proc_bind_default) {
2008         new_icvs.proc_bind = proc_bind_icv;
2009       }
2010 #endif /* OMP_40_ENABLED */
2011 
2012       /* allocate a new parallel team */
2013       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2014       team = __kmp_allocate_team(root, nthreads, nthreads,
2015 #if OMPT_SUPPORT
2016                                  ompt_parallel_data,
2017 #endif
2018 #if OMP_40_ENABLED
2019                                  proc_bind,
2020 #endif
2021                                  &new_icvs, argc USE_NESTED_HOT_ARG(master_th));
2022     } else {
2023       /* allocate a new parallel team */
2024       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2025       team = __kmp_allocate_team(root, nthreads, nthreads,
2026 #if OMPT_SUPPORT
2027                                  ompt_parallel_data,
2028 #endif
2029 #if OMP_40_ENABLED
2030                                  proc_bind,
2031 #endif
2032                                  &master_th->th.th_current_task->td_icvs,
2033                                  argc USE_NESTED_HOT_ARG(master_th));
2034     }
2035     KF_TRACE(
2036         10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2037 
2038     /* setup the new team */
2039     KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2040     KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2041     KMP_CHECK_UPDATE(team->t.t_ident, loc);
2042     KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2043     KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2044 #if OMPT_SUPPORT
2045     KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2046                           return_address);
2047 #endif
2048     KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2049 // TODO: parent_team->t.t_level == INT_MAX ???
2050 #if OMP_40_ENABLED
2051     if (!master_th->th.th_teams_microtask || level > teams_level) {
2052 #endif /* OMP_40_ENABLED */
2053       int new_level = parent_team->t.t_level + 1;
2054       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2055       new_level = parent_team->t.t_active_level + 1;
2056       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2057 #if OMP_40_ENABLED
2058     } else {
2059       // AC: Do not increase parallel level at start of the teams construct
2060       int new_level = parent_team->t.t_level;
2061       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2062       new_level = parent_team->t.t_active_level;
2063       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2064     }
2065 #endif /* OMP_40_ENABLED */
2066     kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2067     if (team->t.t_sched.r_sched_type != new_sched.r_sched_type ||
2068         team->t.t_sched.chunk != new_sched.chunk)
2069       team->t.t_sched =
2070           new_sched; // set master's schedule as new run-time schedule
2071 
2072 #if OMP_40_ENABLED
2073     KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2074 #endif
2075 
2076     // Update the floating point rounding in the team if required.
2077     propagateFPControl(team);
2078 
2079     if (__kmp_tasking_mode != tskm_immediate_exec) {
2080       // Set master's task team to the team's task team. Unless this is a hot
2081       // team, it should be NULL.
2082       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2083                        parent_team->t.t_task_team[master_th->th.th_task_state]);
2084       KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
2085                     "%p, new task_team %p / team %p\n",
2086                     __kmp_gtid_from_thread(master_th),
2087                     master_th->th.th_task_team, parent_team,
2088                     team->t.t_task_team[master_th->th.th_task_state], team));
2089 
2090       if (active_level || master_th->th.th_task_team) {
2091         // Take a memo of master's task_state
2092         KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2093         if (master_th->th.th_task_state_top >=
2094             master_th->th.th_task_state_stack_sz) { // increase size
2095           kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2096           kmp_uint8 *old_stack, *new_stack;
2097           kmp_uint32 i;
2098           new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2099           for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2100             new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2101           }
2102           for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2103                ++i) { // zero-init rest of stack
2104             new_stack[i] = 0;
2105           }
2106           old_stack = master_th->th.th_task_state_memo_stack;
2107           master_th->th.th_task_state_memo_stack = new_stack;
2108           master_th->th.th_task_state_stack_sz = new_size;
2109           __kmp_free(old_stack);
2110         }
2111         // Store master's task_state on stack
2112         master_th->th
2113             .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2114             master_th->th.th_task_state;
2115         master_th->th.th_task_state_top++;
2116 #if KMP_NESTED_HOT_TEAMS
2117         if (team == master_th->th.th_hot_teams[active_level].hot_team) {
2118           // Restore master's nested state if nested hot team
2119           master_th->th.th_task_state =
2120               master_th->th
2121                   .th_task_state_memo_stack[master_th->th.th_task_state_top];
2122         } else {
2123 #endif
2124           master_th->th.th_task_state = 0;
2125 #if KMP_NESTED_HOT_TEAMS
2126         }
2127 #endif
2128       }
2129 #if !KMP_NESTED_HOT_TEAMS
2130       KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2131                        (team == root->r.r_hot_team));
2132 #endif
2133     }
2134 
2135     KA_TRACE(
2136         20,
2137         ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2138          gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2139          team->t.t_nproc));
2140     KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2141                      (team->t.t_master_tid == 0 &&
2142                       (team->t.t_parent == root->r.r_root_team ||
2143                        team->t.t_parent->t.t_serialized)));
2144     KMP_MB();
2145 
2146     /* now, setup the arguments */
2147     argv = (void **)team->t.t_argv;
2148 #if OMP_40_ENABLED
2149     if (ap) {
2150 #endif /* OMP_40_ENABLED */
2151       for (i = argc - 1; i >= 0; --i) {
2152 // TODO: revert workaround for Intel(R) 64 tracker #96
2153 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
2154         void *new_argv = va_arg(*ap, void *);
2155 #else
2156       void *new_argv = va_arg(ap, void *);
2157 #endif
2158         KMP_CHECK_UPDATE(*argv, new_argv);
2159         argv++;
2160       }
2161 #if OMP_40_ENABLED
2162     } else {
2163       for (i = 0; i < argc; ++i) {
2164         // Get args from parent team for teams construct
2165         KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2166       }
2167     }
2168 #endif /* OMP_40_ENABLED */
2169 
2170     /* now actually fork the threads */
2171     KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2172     if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2173       root->r.r_active = TRUE;
2174 
2175     __kmp_fork_team_threads(root, team, master_th, gtid);
2176     __kmp_setup_icv_copy(team, nthreads,
2177                          &master_th->th.th_current_task->td_icvs, loc);
2178 
2179 #if OMPT_SUPPORT
2180     master_th->th.ompt_thread_info.state = omp_state_work_parallel;
2181 #endif
2182 
2183     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2184 
2185 #if USE_ITT_BUILD
2186     if (team->t.t_active_level == 1 // only report frames at level 1
2187 #if OMP_40_ENABLED
2188         && !master_th->th.th_teams_microtask // not in teams construct
2189 #endif /* OMP_40_ENABLED */
2190         ) {
2191 #if USE_ITT_NOTIFY
2192       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2193           (__kmp_forkjoin_frames_mode == 3 ||
2194            __kmp_forkjoin_frames_mode == 1)) {
2195         kmp_uint64 tmp_time = 0;
2196         if (__itt_get_timestamp_ptr)
2197           tmp_time = __itt_get_timestamp();
2198         // Internal fork - report frame begin
2199         master_th->th.th_frame_time = tmp_time;
2200         if (__kmp_forkjoin_frames_mode == 3)
2201           team->t.t_region_time = tmp_time;
2202       } else
2203 // only one notification scheme (either "submit" or "forking/joined", not both)
2204 #endif /* USE_ITT_NOTIFY */
2205           if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2206               __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2207         // Mark start of "parallel" region for VTune.
2208         __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2209       }
2210     }
2211 #endif /* USE_ITT_BUILD */
2212 
2213     /* now go on and do the work */
2214     KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2215     KMP_MB();
2216     KF_TRACE(10,
2217              ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2218               root, team, master_th, gtid));
2219 
2220 #if USE_ITT_BUILD
2221     if (__itt_stack_caller_create_ptr) {
2222       team->t.t_stack_id =
2223           __kmp_itt_stack_caller_create(); // create new stack stitching id
2224       // before entering fork barrier
2225     }
2226 #endif /* USE_ITT_BUILD */
2227 
2228 #if OMP_40_ENABLED
2229     // AC: skip __kmp_internal_fork at teams construct, let only master
2230     // threads execute
2231     if (ap)
2232 #endif /* OMP_40_ENABLED */
2233     {
2234       __kmp_internal_fork(loc, gtid, team);
2235       KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2236                     "master_th=%p, gtid=%d\n",
2237                     root, team, master_th, gtid));
2238     }
2239 
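    // For the GNU (GOMP) entry points the caller invokes the microtask on the
    // master thread itself, so return without invoking it here.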
2240     if (call_context == fork_context_gnu) {
2241       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2242       return TRUE;
2243     }
2244 
2245     /* Invoke microtask for MASTER thread */
2246     KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2247                   team->t.t_id, team->t.t_pkfn));
2248   } // END of timer KMP_fork_call block
2249 
2250   {
2251     KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
2252     KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
2253     if (!team->t.t_invoke(gtid)) {
2254       KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
2255     }
2256   }
2257   KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2258                 team->t.t_id, team->t.t_pkfn));
2259   KMP_MB(); /* Flush all pending memory write invalidates.  */
2260 
2261   KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2262 
2263 #if OMPT_SUPPORT
2264   if (ompt_enabled.enabled) {
2265     master_th->th.ompt_thread_info.state = omp_state_overhead;
2266   }
2267 #endif
2268 
2269   return TRUE;
2270 }
2271 
2272 #if OMPT_SUPPORT
2273 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2274                                             kmp_team_t *team) {
2275   // restore state outside the region
2276   thread->th.ompt_thread_info.state =
2277       ((team->t.t_serialized) ? omp_state_work_serial
2278                               : omp_state_work_parallel);
2279 }
2280 
2281 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2282                                    kmp_team_t *team, ompt_data_t *parallel_data,
2283                                    fork_context_e fork_context, void *codeptr) {
2284   ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2285   if (ompt_enabled.ompt_callback_parallel_end) {
2286     ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2287         parallel_data, &(task_info->task_data), OMPT_INVOKER(fork_context),
2288         codeptr);
2289   }
2290 
2291   task_info->frame.reenter_runtime_frame = NULL;
2292   __kmp_join_restore_state(thread, team);
2293 }
2294 #endif
2295 
2296 void __kmp_join_call(ident_t *loc, int gtid
2297 #if OMPT_SUPPORT
2298                      ,
2299                      enum fork_context_e fork_context
2300 #endif
2301 #if OMP_40_ENABLED
2302                      ,
2303                      int exit_teams
2304 #endif /* OMP_40_ENABLED */
2305                      ) {
2306   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2307   kmp_team_t *team;
2308   kmp_team_t *parent_team;
2309   kmp_info_t *master_th;
2310   kmp_root_t *root;
2311   int master_active;
2312   int i;
2313 
2314   KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2315 
2316   /* setup current data */
2317   master_th = __kmp_threads[gtid];
2318   root = master_th->th.th_root;
2319   team = master_th->th.th_team;
2320   parent_team = team->t.t_parent;
2321 
2322   master_th->th.th_ident = loc;
2323 
2324 #if OMPT_SUPPORT
2325   if (ompt_enabled.enabled) {
2326     master_th->th.ompt_thread_info.state = omp_state_overhead;
2327   }
2328 #endif
2329 
2330 #if KMP_DEBUG
2331   if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2332     KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2333                   "th_task_team = %p\n",
2334                   __kmp_gtid_from_thread(master_th), team,
2335                   team->t.t_task_team[master_th->th.th_task_state],
2336                   master_th->th.th_task_team));
2337     KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2338                      team->t.t_task_team[master_th->th.th_task_state]);
2339   }
2340 #endif
2341 
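  // Serialized parallel region: unwind the serialized state; no join barrier
  // is needed.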
2342   if (team->t.t_serialized) {
2343 #if OMP_40_ENABLED
2344     if (master_th->th.th_teams_microtask) {
2345       // We are in teams construct
2346       int level = team->t.t_level;
2347       int tlevel = master_th->th.th_teams_level;
2348       if (level == tlevel) {
2349         // AC: we haven't incremented it earlier at start of teams construct,
2350         //     so do it here - at the end of teams construct
2351         team->t.t_level++;
2352       } else if (level == tlevel + 1) {
2353         // AC: we are exiting parallel inside teams, need to increment
2354         // serialization in order to restore it in the next call to
2355         // __kmpc_end_serialized_parallel
2356         team->t.t_serialized++;
2357       }
2358     }
2359 #endif /* OMP_40_ENABLED */
2360     __kmpc_end_serialized_parallel(loc, gtid);
2361 
2362 #if OMPT_SUPPORT
2363     if (ompt_enabled.enabled) {
2364       __kmp_join_restore_state(master_th, parent_team);
2365     }
2366 #endif
2367 
2368     return;
2369   }
2370 
2371   master_active = team->t.t_master_active;
2372 
2373 #if OMP_40_ENABLED
2374   if (!exit_teams)
2375 #endif /* OMP_40_ENABLED */
2376   {
2377     // AC: No barrier for internal teams at exit from the teams construct,
2378     //     but there is a barrier for the external team (league).
2379     __kmp_internal_join(loc, gtid, team);
2380   }
2381 #if OMP_40_ENABLED
2382   else {
2383     master_th->th.th_task_state =
2384         0; // AC: no tasking in teams (out of any parallel)
2385   }
2386 #endif /* OMP_40_ENABLED */
2387 
2388   KMP_MB();
2389 
2390 #if OMPT_SUPPORT
2391   ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2392   void *codeptr = team->t.ompt_team_info.master_return_address;
2393 #endif
2394 
2395 #if USE_ITT_BUILD
2396   if (__itt_stack_caller_create_ptr) {
2397     __kmp_itt_stack_caller_destroy(
2398         (__itt_caller)team->t
2399             .t_stack_id); // destroy the stack stitching id after join barrier
2400   }
2401 
2402   // Mark end of "parallel" region for VTune.
2403   if (team->t.t_active_level == 1
2404 #if OMP_40_ENABLED
2405       && !master_th->th.th_teams_microtask /* not in teams construct */
2406 #endif /* OMP_40_ENABLED */
2407       ) {
2408     master_th->th.th_ident = loc;
2409     // only one notification scheme (either "submit" or "forking/joined", not
2410     // both)
2411     if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2412         __kmp_forkjoin_frames_mode == 3)
2413       __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2414                              master_th->th.th_frame_time, 0, loc,
2415                              master_th->th.th_team_nproc, 1);
2416     else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2417              !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2418       __kmp_itt_region_joined(gtid);
2419   } // active_level == 1
2420 #endif /* USE_ITT_BUILD */
2421 
2422 #if OMP_40_ENABLED
2423   if (master_th->th.th_teams_microtask && !exit_teams &&
2424       team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2425       team->t.t_level == master_th->th.th_teams_level + 1) {
2426     // AC: We need to leave the team structure intact at the end of parallel
2427     // inside the teams construct, so that at the next parallel same (hot) team
2428     // works, only adjust nesting levels
2429 
2430     /* Decrement our nested depth level */
2431     team->t.t_level--;
2432     team->t.t_active_level--;
2433     KMP_TEST_THEN_DEC32((kmp_int32 *)&root->r.r_in_parallel);
2434 
2435     /* Restore number of threads in the team if needed */
2436     if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2437       int old_num = master_th->th.th_team_nproc;
2438       int new_num = master_th->th.th_teams_size.nth;
2439       kmp_info_t **other_threads = team->t.t_threads;
2440       team->t.t_nproc = new_num;
2441       for (i = 0; i < old_num; ++i) {
2442         other_threads[i]->th.th_team_nproc = new_num;
2443       }
2444       // Adjust states of non-used threads of the team
2445       for (i = old_num; i < new_num; ++i) {
2446         // Re-initialize thread's barrier data.
2447         int b;
2448         kmp_balign_t *balign = other_threads[i]->th.th_bar;
2449         for (b = 0; b < bs_last_barrier; ++b) {
2450           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2451           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2452 #if USE_DEBUGGER
2453           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2454 #endif
2455         }
2456         if (__kmp_tasking_mode != tskm_immediate_exec) {
2457           // Synchronize thread's task state
2458           other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2459         }
2460       }
2461     }
2462 
2463 #if OMPT_SUPPORT
2464     if (ompt_enabled.enabled) {
2465       __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
2466                       codeptr);
2467     }
2468 #endif
2469 
2470     return;
2471   }
2472 #endif /* OMP_40_ENABLED */
2473 
2474   /* do cleanup and restore the parent team */
2475   master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2476   master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2477 
2478   master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2479 
2480   /* jc: The following lock has instructions with REL and ACQ semantics,
2481      separating the parallel user code called in this parallel region
2482      from the serial user code called after this function returns. */
2483   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2484 
2485 #if OMP_40_ENABLED
2486   if (!master_th->th.th_teams_microtask ||
2487       team->t.t_level > master_th->th.th_teams_level)
2488 #endif /* OMP_40_ENABLED */
2489   {
2490     /* Decrement our nested depth level */
2491     KMP_TEST_THEN_DEC32((kmp_int32 *)&root->r.r_in_parallel);
2492   }
2493   KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2494 
2495 #if OMPT_SUPPORT
2496   if (ompt_enabled.enabled) {
2497     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2498     if (ompt_enabled.ompt_callback_implicit_task) {
2499       int ompt_team_size = team->t.t_nproc;
2500       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2501           ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2502           __kmp_tid_from_gtid(gtid));
2503     }
2504 
2505     task_info->frame.exit_runtime_frame = NULL;
2506     task_info->task_data = ompt_data_none;
2507   }
2508 #endif
2509 
2510   KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2511                 master_th, team));
2512   __kmp_pop_current_task_from_thread(master_th);
2513 
2514 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
2515   // Restore master thread's partition.
2516   master_th->th.th_first_place = team->t.t_first_place;
2517   master_th->th.th_last_place = team->t.t_last_place;
2518 #endif /* OMP_40_ENABLED */
2519 
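  // Restore the master's hardware floating-point control state if it changed
  // during the region (counterpart of propagateFPControl() at fork).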
2520   updateHWFPControl(team);
2521 
2522   if (root->r.r_active != master_active)
2523     root->r.r_active = master_active;
2524 
2525   __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2526                             master_th)); // this will free worker threads
2527 
2528   /* this race was fun to find. make sure the following is in the critical
2529      region otherwise assertions may fail occasionally since the old team may be
2530      reallocated and the hierarchy appears inconsistent. it is actually safe to
2531      run and won't cause any bugs, but will cause those assertion failures. it's
2532      only one deref&assign so might as well put this in the critical region */
2533   master_th->th.th_team = parent_team;
2534   master_th->th.th_team_nproc = parent_team->t.t_nproc;
2535   master_th->th.th_team_master = parent_team->t.t_threads[0];
2536   master_th->th.th_team_serialized = parent_team->t.t_serialized;
2537 
2538   /* restore serialized team, if need be */
2539   if (parent_team->t.t_serialized &&
2540       parent_team != master_th->th.th_serial_team &&
2541       parent_team != root->r.r_root_team) {
2542     __kmp_free_team(root,
2543                     master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2544     master_th->th.th_serial_team = parent_team;
2545   }
2546 
2547   if (__kmp_tasking_mode != tskm_immediate_exec) {
2548     if (master_th->th.th_task_state_top >
2549         0) { // Restore task state from memo stack
2550       KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2551       // Remember master's state if we re-use this nested hot team
2552       master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2553           master_th->th.th_task_state;
2554       --master_th->th.th_task_state_top; // pop
2555       // Now restore state at this level
2556       master_th->th.th_task_state =
2557           master_th->th
2558               .th_task_state_memo_stack[master_th->th.th_task_state_top];
2559     }
2560     // Copy the task team from the parent team to the master thread
2561     master_th->th.th_task_team =
2562         parent_team->t.t_task_team[master_th->th.th_task_state];
2563     KA_TRACE(20,
2564              ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2565               __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2566               parent_team));
2567   }
2568 
2569   // TODO: GEH - cannot do this assertion because root thread not set up as
2570   // executing
2571   // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2572   master_th->th.th_current_task->td_flags.executing = 1;
2573 
2574   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2575 
2576 #if OMPT_SUPPORT
2577   if (ompt_enabled.enabled) {
2578     __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
2579                     codeptr);
2580   }
2581 #endif
2582 
2583   KMP_MB();
2584   KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2585 }
2586 
2587 /* Check whether we should push an internal control record onto the
2588    serial team stack.  If so, do it.  */
2589 void __kmp_save_internal_controls(kmp_info_t *thread) {
2590 
2591   if (thread->th.th_team != thread->th.th_serial_team) {
2592     return;
2593   }
2594   if (thread->th.th_team->t.t_serialized > 1) {
2595     int push = 0;
2596 
2597     if (thread->th.th_team->t.t_control_stack_top == NULL) {
2598       push = 1;
2599     } else {
2600       if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2601           thread->th.th_team->t.t_serialized) {
2602         push = 1;
2603       }
2604     }
2605     if (push) { /* push a record on the serial team's stack */
2606       kmp_internal_control_t *control =
2607           (kmp_internal_control_t *)__kmp_allocate(
2608               sizeof(kmp_internal_control_t));
2609 
2610       copy_icvs(control, &thread->th.th_current_task->td_icvs);
2611 
2612       control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2613 
2614       control->next = thread->th.th_team->t.t_control_stack_top;
2615       thread->th.th_team->t.t_control_stack_top = control;
2616     }
2617   }
2618 }
2619 
2620 /* Changes set_nproc */
2621 void __kmp_set_num_threads(int new_nth, int gtid) {
2622   kmp_info_t *thread;
2623   kmp_root_t *root;
2624 
2625   KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2626   KMP_DEBUG_ASSERT(__kmp_init_serial);
2627 
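  // Clamp the requested value to the valid range [1, __kmp_max_nth].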
2628   if (new_nth < 1)
2629     new_nth = 1;
2630   else if (new_nth > __kmp_max_nth)
2631     new_nth = __kmp_max_nth;
2632 
2633   KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2634   thread = __kmp_threads[gtid];
2635 
2636   __kmp_save_internal_controls(thread);
2637 
2638   set__nproc(thread, new_nth);
2639 
2640   // If this omp_set_num_threads() call will cause the hot team size to be
2641   // reduced (in the absence of a num_threads clause), then reduce it now,
2642   // rather than waiting for the next parallel region.
2643   root = thread->th.th_root;
2644   if (__kmp_init_parallel && (!root->r.r_active) &&
2645       (root->r.r_hot_team->t.t_nproc > new_nth)
2646 #if KMP_NESTED_HOT_TEAMS
2647       && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2648 #endif
2649       ) {
2650     kmp_team_t *hot_team = root->r.r_hot_team;
2651     int f;
2652 
2653     __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2654 
2655     // Release the extra threads we don't need any more.
2656     for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2657       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2658       if (__kmp_tasking_mode != tskm_immediate_exec) {
2659         // When decreasing team size, threads no longer in the team should unref
2660         // task team.
2661         hot_team->t.t_threads[f]->th.th_task_team = NULL;
2662       }
2663       __kmp_free_thread(hot_team->t.t_threads[f]);
2664       hot_team->t.t_threads[f] = NULL;
2665     }
2666     hot_team->t.t_nproc = new_nth;
2667 #if KMP_NESTED_HOT_TEAMS
2668     if (thread->th.th_hot_teams) {
2669       KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2670       thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2671     }
2672 #endif
2673 
2674     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2675 
2676     // Update the t_nproc field in the threads that are still active.
2677     for (f = 0; f < new_nth; f++) {
2678       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2679       hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2680     }
2681     // Special flag to mark that an omp_set_num_threads() call changed the size
2682     hot_team->t.t_size_changed = -1;
2683   }
2684 }
2685 
2686 /* Changes max_active_levels */
2687 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2688   kmp_info_t *thread;
2689 
2690   KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2691                 "%d = (%d)\n",
2692                 gtid, max_active_levels));
2693   KMP_DEBUG_ASSERT(__kmp_init_serial);
2694 
2695   // validate max_active_levels
2696   if (max_active_levels < 0) {
2697     KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2698     // We ignore this call if the user has specified a negative value.
2699     // The current setting won't be changed. The last valid setting will be
2700     // used. A warning will be issued (if warnings are allowed as controlled by
2701     // the KMP_WARNINGS env var).
2702     KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2703                   "max_active_levels for thread %d = (%d)\n",
2704                   gtid, max_active_levels));
2705     return;
2706   }
2707   if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2708     // it's OK, the max_active_levels is within the valid range: [ 0;
2709     // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2710     // We allow a zero value. (implementation defined behavior)
2711   } else {
2712     KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2713                 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2714     max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2715     // Current upper limit is MAX_INT. (implementation defined behavior)
2716     // If the input exceeds the upper limit, we correct the input to be the
2717     // upper limit. (implementation defined behavior)
2718     // Actually, the flow should never reach here as long as the limit is MAX_INT.
2719   }
2720   KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2721                 "max_active_levels for thread %d = (%d)\n",
2722                 gtid, max_active_levels));
2723 
2724   thread = __kmp_threads[gtid];
2725 
2726   __kmp_save_internal_controls(thread);
2727 
2728   set__max_active_levels(thread, max_active_levels);
2729 }
2730 
2731 /* Gets max_active_levels */
2732 int __kmp_get_max_active_levels(int gtid) {
2733   kmp_info_t *thread;
2734 
2735   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2736   KMP_DEBUG_ASSERT(__kmp_init_serial);
2737 
2738   thread = __kmp_threads[gtid];
2739   KMP_DEBUG_ASSERT(thread->th.th_current_task);
2740   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2741                 "curtask_maxaclevel=%d\n",
2742                 gtid, thread->th.th_current_task,
2743                 thread->th.th_current_task->td_icvs.max_active_levels));
2744   return thread->th.th_current_task->td_icvs.max_active_levels;
2745 }
2746 
2747 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2748 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2749   kmp_info_t *thread;
2750   //    kmp_team_t *team;
2751 
2752   KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2753                 gtid, (int)kind, chunk));
2754   KMP_DEBUG_ASSERT(__kmp_init_serial);
2755 
2756   // Check if the kind parameter is valid, correct if needed.
2757   // Valid parameters should fit in one of two intervals - standard or extended:
2758   //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2759   // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
2760   if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2761       (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2762     // TODO: Hint needs attention in case we change the default schedule.
2763     __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2764               KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2765               __kmp_msg_null);
2766     kind = kmp_sched_default;
2767     chunk = 0; // ignore chunk value in case of bad kind
2768   }
2769 
2770   thread = __kmp_threads[gtid];
2771 
2772   __kmp_save_internal_controls(thread);
2773 
2774   if (kind < kmp_sched_upper_std) {
2775     if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
      // differentiate static chunked vs. unchunked: an invalid chunk value
      // indicates the unchunked schedule (which is the default)
2778       thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2779     } else {
2780       thread->th.th_current_task->td_icvs.sched.r_sched_type =
2781           __kmp_sch_map[kind - kmp_sched_lower - 1];
2782     }
2783   } else {
2784     //    __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2785     //    kmp_sched_lower - 2 ];
2786     thread->th.th_current_task->td_icvs.sched.r_sched_type =
2787         __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2788                       kmp_sched_lower - 2];
2789   }
2790   if (kind == kmp_sched_auto || chunk < 1) {
2791     // ignore parameter chunk for schedule auto
2792     thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2793   } else {
2794     thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2795   }
2796 }
2797 
2798 /* Gets def_sched_var ICV values */
2799 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2800   kmp_info_t *thread;
2801   enum sched_type th_type;
2802 
2803   KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2804   KMP_DEBUG_ASSERT(__kmp_init_serial);
2805 
2806   thread = __kmp_threads[gtid];
2807 
2808   th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2809 
2810   switch (th_type) {
2811   case kmp_sch_static:
2812   case kmp_sch_static_greedy:
2813   case kmp_sch_static_balanced:
2814     *kind = kmp_sched_static;
    *chunk = 0; // chunk was not set; a zero value signals this fact
2816     return;
2817   case kmp_sch_static_chunked:
2818     *kind = kmp_sched_static;
2819     break;
2820   case kmp_sch_dynamic_chunked:
2821     *kind = kmp_sched_dynamic;
2822     break;
2823   case kmp_sch_guided_chunked:
2824   case kmp_sch_guided_iterative_chunked:
2825   case kmp_sch_guided_analytical_chunked:
2826     *kind = kmp_sched_guided;
2827     break;
2828   case kmp_sch_auto:
2829     *kind = kmp_sched_auto;
2830     break;
2831   case kmp_sch_trapezoidal:
2832     *kind = kmp_sched_trapezoidal;
2833     break;
2834 #if KMP_STATIC_STEAL_ENABLED
2835   case kmp_sch_static_steal:
2836     *kind = kmp_sched_static_steal;
2837     break;
2838 #endif
2839   default:
2840     KMP_FATAL(UnknownSchedulingType, th_type);
2841   }
2842 
2843   *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2844 }
2845 
2846 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2847 
2848   int ii, dd;
2849   kmp_team_t *team;
2850   kmp_info_t *thr;
2851 
2852   KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2853   KMP_DEBUG_ASSERT(__kmp_init_serial);
2854 
2855   // validate level
2856   if (level == 0)
2857     return 0;
2858   if (level < 0)
2859     return -1;
2860   thr = __kmp_threads[gtid];
2861   team = thr->th.th_team;
2862   ii = team->t.t_level;
2863   if (level > ii)
2864     return -1;
2865 
2866 #if OMP_40_ENABLED
2867   if (thr->th.th_teams_microtask) {
    // AC: we are in a teams region where multiple nested teams have the same
    // level
2869     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2870     if (level <=
2871         tlevel) { // otherwise usual algorithm works (will not touch the teams)
2872       KMP_DEBUG_ASSERT(ii >= tlevel);
      // AC: Since we need to pass through the teams league, artificially
      // increase ii
2875       if (ii == tlevel) {
2876         ii += 2; // three teams have same level
2877       } else {
2878         ii++; // two teams have same level
2879       }
2880     }
2881   }
2882 #endif
2883 
2884   if (ii == level)
2885     return __kmp_tid_from_gtid(gtid);
2886 
2887   dd = team->t.t_serialized;
2888   level++;
2889   while (ii > level) {
2890     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2891     }
2892     if ((team->t.t_serialized) && (!dd)) {
2893       team = team->t.t_parent;
2894       continue;
2895     }
2896     if (ii > level) {
2897       team = team->t.t_parent;
2898       dd = team->t.t_serialized;
2899       ii--;
2900     }
2901   }
2902 
2903   return (dd > 1) ? (0) : (team->t.t_master_tid);
2904 }
2905 
2906 int __kmp_get_team_size(int gtid, int level) {
2907 
2908   int ii, dd;
2909   kmp_team_t *team;
2910   kmp_info_t *thr;
2911 
2912   KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2913   KMP_DEBUG_ASSERT(__kmp_init_serial);
2914 
2915   // validate level
2916   if (level == 0)
2917     return 1;
2918   if (level < 0)
2919     return -1;
2920   thr = __kmp_threads[gtid];
2921   team = thr->th.th_team;
2922   ii = team->t.t_level;
2923   if (level > ii)
2924     return -1;
2925 
2926 #if OMP_40_ENABLED
2927   if (thr->th.th_teams_microtask) {
    // AC: we are in a teams region where multiple nested teams have the same
    // level
2929     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2930     if (level <=
2931         tlevel) { // otherwise usual algorithm works (will not touch the teams)
2932       KMP_DEBUG_ASSERT(ii >= tlevel);
      // AC: Since we need to pass through the teams league, artificially
      // increase ii
2935       if (ii == tlevel) {
2936         ii += 2; // three teams have same level
2937       } else {
2938         ii++; // two teams have same level
2939       }
2940     }
2941   }
2942 #endif
2943 
2944   while (ii > level) {
2945     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2946     }
2947     if (team->t.t_serialized && (!dd)) {
2948       team = team->t.t_parent;
2949       continue;
2950     }
2951     if (ii > level) {
2952       team = team->t.t_parent;
2953       ii--;
2954     }
2955   }
2956 
2957   return team->t.t_nproc;
2958 }
2959 
2960 kmp_r_sched_t __kmp_get_schedule_global() {
  // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
  // independently, so the updated schedule can be obtained here.
2964 
2965   kmp_r_sched_t r_sched;
2966 
2967   // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
2968   // __kmp_guided. __kmp_sched should keep original value, so that user can set
2969   // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
2970   // different roots (even in OMP 2.5)
2971   if (__kmp_sched == kmp_sch_static) {
2972     r_sched.r_sched_type = __kmp_static; // replace STATIC with more detailed
2973     // schedule (balanced or greedy)
2974   } else if (__kmp_sched == kmp_sch_guided_chunked) {
2975     r_sched.r_sched_type = __kmp_guided; // replace GUIDED with more detailed
2976     // schedule (iterative or analytical)
2977   } else {
2978     r_sched.r_sched_type =
2979         __kmp_sched; // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2980   }
2981 
  if (__kmp_chunk < KMP_DEFAULT_CHUNK) { // __kmp_chunk may be invalid here (if
    // it was never set)
2984     r_sched.chunk = KMP_DEFAULT_CHUNK;
2985   } else {
2986     r_sched.chunk = __kmp_chunk;
2987   }
2988 
2989   return r_sched;
2990 }
2991 
/* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
   at least argc *t_argv entries for the requested team. */
2994 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
2995 
2996   KMP_DEBUG_ASSERT(team);
2997   if (!realloc || argc > team->t.t_max_argc) {
2998 
2999     KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3000                    "current entries=%d\n",
3001                    team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3002     /* if previously allocated heap space for args, free them */
3003     if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3004       __kmp_free((void *)team->t.t_argv);
3005 
3006     if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3007       /* use unused space in the cache line for arguments */
3008       team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3009       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3010                      "argv entries\n",
3011                      team->t.t_id, team->t.t_max_argc));
3012       team->t.t_argv = &team->t.t_inline_argv[0];
3013       if (__kmp_storage_map) {
3014         __kmp_print_storage_map_gtid(
3015             -1, &team->t.t_inline_argv[0],
3016             &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3017             (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3018             team->t.t_id);
3019       }
3020     } else {
3021       /* allocate space for arguments in the heap */
3022       team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3023                                ? KMP_MIN_MALLOC_ARGV_ENTRIES
3024                                : 2 * argc;
3025       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3026                      "argv entries\n",
3027                      team->t.t_id, team->t.t_max_argc));
3028       team->t.t_argv =
3029           (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3030       if (__kmp_storage_map) {
3031         __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3032                                      &team->t.t_argv[team->t.t_max_argc],
3033                                      sizeof(void *) * team->t.t_max_argc,
3034                                      "team_%d.t_argv", team->t.t_id);
3035       }
3036     }
3037   }
3038 }
3039 
3040 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3041   int i;
3042   int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3043   team->t.t_threads =
3044       (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3045   team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3046       sizeof(dispatch_shared_info_t) * num_disp_buff);
3047   team->t.t_dispatch =
3048       (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3049   team->t.t_implicit_task_taskdata =
3050       (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3051   team->t.t_max_nproc = max_nth;
3052 
3053   /* setup dispatch buffers */
3054   for (i = 0; i < num_disp_buff; ++i) {
3055     team->t.t_disp_buffer[i].buffer_index = i;
3056 #if OMP_45_ENABLED
3057     team->t.t_disp_buffer[i].doacross_buf_idx = i;
3058 #endif
3059   }
3060 }
3061 
3062 static void __kmp_free_team_arrays(kmp_team_t *team) {
3063   /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3064   int i;
3065   for (i = 0; i < team->t.t_max_nproc; ++i) {
3066     if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3067       __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3068       team->t.t_dispatch[i].th_disp_buffer = NULL;
3069     }
3070   }
3071   __kmp_free(team->t.t_threads);
3072   __kmp_free(team->t.t_disp_buffer);
3073   __kmp_free(team->t.t_dispatch);
3074   __kmp_free(team->t.t_implicit_task_taskdata);
3075   team->t.t_threads = NULL;
3076   team->t.t_disp_buffer = NULL;
3077   team->t.t_dispatch = NULL;
3078   team->t.t_implicit_task_taskdata = 0;
3079 }
3080 
3081 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3082   kmp_info_t **oldThreads = team->t.t_threads;
3083 
3084   __kmp_free(team->t.t_disp_buffer);
3085   __kmp_free(team->t.t_dispatch);
3086   __kmp_free(team->t.t_implicit_task_taskdata);
3087   __kmp_allocate_team_arrays(team, max_nth);
3088 
3089   KMP_MEMCPY(team->t.t_threads, oldThreads,
3090              team->t.t_nproc * sizeof(kmp_info_t *));
3091 
3092   __kmp_free(oldThreads);
3093 }
3094 
3095 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3096 
3097   kmp_r_sched_t r_sched =
3098       __kmp_get_schedule_global(); // get current state of scheduling globals
3099 
3100 #if OMP_40_ENABLED
3101   KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3102 #endif /* OMP_40_ENABLED */
3103 
3104   kmp_internal_control_t g_icvs = {
3105     0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3106     (kmp_int8)__kmp_dflt_nested, // int nested; //internal control
3107     // for nested parallelism (per thread)
3108     (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3109     // adjustment of threads (per thread)
3110     (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3111     // whether blocktime is explicitly set
3112     __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3113 #if KMP_USE_MONITOR
3114     __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3115 // intervals
3116 #endif
3117     __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3118     // next parallel region (per thread)
3119     // (use a max ub on value if __kmp_parallel_initialize not called yet)
3120     __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3121     // for max_active_levels
3122     r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3123 // {sched,chunk} pair
3124 #if OMP_40_ENABLED
3125     __kmp_nested_proc_bind.bind_types[0],
3126     __kmp_default_device,
3127 #endif /* OMP_40_ENABLED */
3128     NULL // struct kmp_internal_control *next;
3129   };
3130 
3131   return g_icvs;
3132 }
3133 
3134 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3135 
3136   kmp_internal_control_t gx_icvs;
3137   gx_icvs.serial_nesting_level =
3138       0; // probably =team->t.t_serial like in save_inter_controls
3139   copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3140   gx_icvs.next = NULL;
3141 
3142   return gx_icvs;
3143 }
3144 
3145 static void __kmp_initialize_root(kmp_root_t *root) {
3146   int f;
3147   kmp_team_t *root_team;
3148   kmp_team_t *hot_team;
3149   int hot_team_max_nth;
3150   kmp_r_sched_t r_sched =
3151       __kmp_get_schedule_global(); // get current state of scheduling globals
3152   kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3153   KMP_DEBUG_ASSERT(root);
3154   KMP_ASSERT(!root->r.r_begin);
3155 
3156   /* setup the root state structure */
3157   __kmp_init_lock(&root->r.r_begin_lock);
3158   root->r.r_begin = FALSE;
3159   root->r.r_active = FALSE;
3160   root->r.r_in_parallel = 0;
3161   root->r.r_blocktime = __kmp_dflt_blocktime;
3162   root->r.r_nested = __kmp_dflt_nested;
3163   root->r.r_cg_nthreads = 1;
3164 
3165   /* setup the root team for this task */
3166   /* allocate the root team structure */
3167   KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3168 
3169   root_team =
3170       __kmp_allocate_team(root,
3171                           1, // new_nproc
3172                           1, // max_nproc
3173 #if OMPT_SUPPORT
3174                           ompt_data_none, // root parallel id
3175 #endif
3176 #if OMP_40_ENABLED
3177                           __kmp_nested_proc_bind.bind_types[0],
3178 #endif
3179                           &r_icvs,
3180                           0 // argc
3181                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3182                           );
3183 #if USE_DEBUGGER
3184   // Non-NULL value should be assigned to make the debugger display the root
3185   // team.
3186   TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3187 #endif
3188 
3189   KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3190 
3191   root->r.r_root_team = root_team;
3192   root_team->t.t_control_stack_top = NULL;
3193 
3194   /* initialize root team */
3195   root_team->t.t_threads[0] = NULL;
3196   root_team->t.t_nproc = 1;
3197   root_team->t.t_serialized = 1;
3198   // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3199   root_team->t.t_sched.r_sched_type = r_sched.r_sched_type;
3200   root_team->t.t_sched.chunk = r_sched.chunk;
3201   KA_TRACE(
3202       20,
3203       ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3204        root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3205 
3206   /* setup the  hot team for this task */
3207   /* allocate the hot team structure */
3208   KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3209 
3210   hot_team =
3211       __kmp_allocate_team(root,
3212                           1, // new_nproc
3213                           __kmp_dflt_team_nth_ub * 2, // max_nproc
3214 #if OMPT_SUPPORT
3215                           ompt_data_none, // root parallel id
3216 #endif
3217 #if OMP_40_ENABLED
3218                           __kmp_nested_proc_bind.bind_types[0],
3219 #endif
3220                           &r_icvs,
3221                           0 // argc
3222                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3223                           );
3224   KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3225 
3226   root->r.r_hot_team = hot_team;
3227   root_team->t.t_control_stack_top = NULL;
3228 
3229   /* first-time initialization */
3230   hot_team->t.t_parent = root_team;
3231 
3232   /* initialize hot team */
3233   hot_team_max_nth = hot_team->t.t_max_nproc;
3234   for (f = 0; f < hot_team_max_nth; ++f) {
3235     hot_team->t.t_threads[f] = NULL;
3236   }
3237   hot_team->t.t_nproc = 1;
3238   // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3239   hot_team->t.t_sched.r_sched_type = r_sched.r_sched_type;
3240   hot_team->t.t_sched.chunk = r_sched.chunk;
3241   hot_team->t.t_size_changed = 0;
3242 }
3243 
3244 #ifdef KMP_DEBUG
3245 
3246 typedef struct kmp_team_list_item {
3247   kmp_team_p const *entry;
3248   struct kmp_team_list_item *next;
3249 } kmp_team_list_item_t;
3250 typedef kmp_team_list_item_t *kmp_team_list_t;
3251 
3252 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3253     kmp_team_list_t list, // List of teams.
3254     kmp_team_p const *team // Team to add.
3255     ) {
3256 
3257   // List must terminate with item where both entry and next are NULL.
3258   // Team is added to the list only once.
3259   // List is sorted in ascending order by team id.
3260   // Team id is *not* a key.
3261 
3262   kmp_team_list_t l;
3263 
3264   KMP_DEBUG_ASSERT(list != NULL);
3265   if (team == NULL) {
3266     return;
3267   }
3268 
3269   __kmp_print_structure_team_accum(list, team->t.t_parent);
3270   __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3271 
3272   // Search list for the team.
3273   l = list;
3274   while (l->next != NULL && l->entry != team) {
3275     l = l->next;
3276   }
3277   if (l->next != NULL) {
3278     return; // Team has been added before, exit.
3279   }
3280 
3281   // Team is not found. Search list again for insertion point.
3282   l = list;
3283   while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3284     l = l->next;
3285   }
3286 
3287   // Insert team.
3288   {
3289     kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3290         sizeof(kmp_team_list_item_t));
3291     *item = *l;
3292     l->entry = team;
3293     l->next = item;
3294   }
3295 }
3296 
3297 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team
3298 
3299                                        ) {
3300   __kmp_printf("%s", title);
3301   if (team != NULL) {
3302     __kmp_printf("%2x %p\n", team->t.t_id, team);
3303   } else {
3304     __kmp_printf(" - (nil)\n");
3305   }
3306 }
3307 
3308 static void __kmp_print_structure_thread(char const *title,
3309                                          kmp_info_p const *thread) {
3310   __kmp_printf("%s", title);
3311   if (thread != NULL) {
3312     __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3313   } else {
3314     __kmp_printf(" - (nil)\n");
3315   }
3316 }
3317 
3318 void __kmp_print_structure(void) {
3319 
3320   kmp_team_list_t list;
3321 
3322   // Initialize list of teams.
3323   list =
3324       (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3325   list->entry = NULL;
3326   list->next = NULL;
3327 
3328   __kmp_printf("\n------------------------------\nGlobal Thread "
3329                "Table\n------------------------------\n");
3330   {
3331     int gtid;
3332     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3333       __kmp_printf("%2d", gtid);
3334       if (__kmp_threads != NULL) {
3335         __kmp_printf(" %p", __kmp_threads[gtid]);
3336       }
3337       if (__kmp_root != NULL) {
3338         __kmp_printf(" %p", __kmp_root[gtid]);
3339       }
3340       __kmp_printf("\n");
3341     }
3342   }
3343 
3344   // Print out __kmp_threads array.
3345   __kmp_printf("\n------------------------------\nThreads\n--------------------"
3346                "----------\n");
3347   if (__kmp_threads != NULL) {
3348     int gtid;
3349     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3350       kmp_info_t const *thread = __kmp_threads[gtid];
3351       if (thread != NULL) {
3352         __kmp_printf("GTID %2d %p:\n", gtid, thread);
3353         __kmp_printf("    Our Root:        %p\n", thread->th.th_root);
3354         __kmp_print_structure_team("    Our Team:     ", thread->th.th_team);
3355         __kmp_print_structure_team("    Serial Team:  ",
3356                                    thread->th.th_serial_team);
3357         __kmp_printf("    Threads:      %2d\n", thread->th.th_team_nproc);
3358         __kmp_print_structure_thread("    Master:       ",
3359                                      thread->th.th_team_master);
3360         __kmp_printf("    Serialized?:  %2d\n", thread->th.th_team_serialized);
3361         __kmp_printf("    Set NProc:    %2d\n", thread->th.th_set_nproc);
3362 #if OMP_40_ENABLED
3363         __kmp_printf("    Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3364 #endif
3365         __kmp_print_structure_thread("    Next in pool: ",
3366                                      thread->th.th_next_pool);
3367         __kmp_printf("\n");
3368         __kmp_print_structure_team_accum(list, thread->th.th_team);
3369         __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3370       }
3371     }
3372   } else {
3373     __kmp_printf("Threads array is not allocated.\n");
3374   }
3375 
3376   // Print out __kmp_root array.
3377   __kmp_printf("\n------------------------------\nUbers\n----------------------"
3378                "--------\n");
3379   if (__kmp_root != NULL) {
3380     int gtid;
3381     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3382       kmp_root_t const *root = __kmp_root[gtid];
3383       if (root != NULL) {
3384         __kmp_printf("GTID %2d %p:\n", gtid, root);
3385         __kmp_print_structure_team("    Root Team:    ", root->r.r_root_team);
3386         __kmp_print_structure_team("    Hot Team:     ", root->r.r_hot_team);
3387         __kmp_print_structure_thread("    Uber Thread:  ",
3388                                      root->r.r_uber_thread);
3389         __kmp_printf("    Active?:      %2d\n", root->r.r_active);
3390         __kmp_printf("    Nested?:      %2d\n", root->r.r_nested);
3391         __kmp_printf("    In Parallel:  %2d\n", root->r.r_in_parallel);
3392         __kmp_printf("\n");
3393         __kmp_print_structure_team_accum(list, root->r.r_root_team);
3394         __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3395       }
3396     }
3397   } else {
3398     __kmp_printf("Ubers array is not allocated.\n");
3399   }
3400 
3401   __kmp_printf("\n------------------------------\nTeams\n----------------------"
3402                "--------\n");
3403   while (list->next != NULL) {
3404     kmp_team_p const *team = list->entry;
3405     int i;
3406     __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3407     __kmp_print_structure_team("    Parent Team:      ", team->t.t_parent);
3408     __kmp_printf("    Master TID:       %2d\n", team->t.t_master_tid);
3409     __kmp_printf("    Max threads:      %2d\n", team->t.t_max_nproc);
3410     __kmp_printf("    Levels of serial: %2d\n", team->t.t_serialized);
3411     __kmp_printf("    Number threads:   %2d\n", team->t.t_nproc);
3412     for (i = 0; i < team->t.t_nproc; ++i) {
3413       __kmp_printf("    Thread %2d:      ", i);
3414       __kmp_print_structure_thread("", team->t.t_threads[i]);
3415     }
3416     __kmp_print_structure_team("    Next in pool:     ", team->t.t_next_pool);
3417     __kmp_printf("\n");
3418     list = list->next;
3419   }
3420 
3421   // Print out __kmp_thread_pool and __kmp_team_pool.
3422   __kmp_printf("\n------------------------------\nPools\n----------------------"
3423                "--------\n");
3424   __kmp_print_structure_thread("Thread pool:          ",
3425                                CCAST(kmp_info_t *, __kmp_thread_pool));
3426   __kmp_print_structure_team("Team pool:            ",
3427                              CCAST(kmp_team_t *, __kmp_team_pool));
3428   __kmp_printf("\n");
3429 
3430   // Free team list.
3431   while (list != NULL) {
3432     kmp_team_list_item_t *item = list;
3433     list = list->next;
3434     KMP_INTERNAL_FREE(item);
3435   }
3436 }
3437 
3438 #endif
3439 
3440 //---------------------------------------------------------------------------
3441 //  Stuff for per-thread fast random number generator
3442 //  Table of primes
3443 static const unsigned __kmp_primes[] = {
3444     0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3445     0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3446     0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3447     0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3448     0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3449     0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3450     0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3451     0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3452     0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3453     0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3454     0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3455 
3456 //---------------------------------------------------------------------------
3457 //  __kmp_get_random: Get a random number using a linear congruential method.
3458 unsigned short __kmp_get_random(kmp_info_t *thread) {
3459   unsigned x = thread->th.th_x;
3460   unsigned short r = x >> 16;
3461 
3462   thread->th.th_x = x * thread->th.th_a + 1;
3463 
3464   KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3465                 thread->th.th_info.ds.ds_tid, r));
3466 
3467   return r;
3468 }
3469 //--------------------------------------------------------
3470 // __kmp_init_random: Initialize a random number generator
3471 void __kmp_init_random(kmp_info_t *thread) {
3472   unsigned seed = thread->th.th_info.ds.ds_tid;
3473 
3474   thread->th.th_a =
3475       __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3476   thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3477   KA_TRACE(30,
3478            ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3479 }
3480 
3481 #if KMP_OS_WINDOWS
3482 /* reclaim array entries for root threads that are already dead, returns number
3483  * reclaimed */
3484 static int __kmp_reclaim_dead_roots(void) {
3485   int i, r = 0;
3486 
3487   for (i = 0; i < __kmp_threads_capacity; ++i) {
3488     if (KMP_UBER_GTID(i) &&
3489         !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3490         !__kmp_root[i]
3491              ->r.r_active) { // AC: reclaim only roots died in non-active state
3492       r += __kmp_unregister_root_other_thread(i);
3493     }
3494   }
3495   return r;
3496 }
3497 #endif
3498 
3499 /* This function attempts to create free entries in __kmp_threads and
3500    __kmp_root, and returns the number of free entries generated.
3501 
3502    For Windows* OS static library, the first mechanism used is to reclaim array
3503    entries for root threads that are already dead.
3504 
   On all platforms, expansion is attempted on the arrays __kmp_threads and
3506    __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3507    capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3508    threadprivate cache array has been created. Synchronization with
3509    __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3510 
3511    After any dead root reclamation, if the clipping value allows array expansion
3512    to result in the generation of a total of nWish free slots, the function does
3513    that expansion. If not, but the clipping value allows array expansion to
3514    result in the generation of a total of nNeed free slots, the function does
3515    that expansion. Otherwise, nothing is done beyond the possible initial root
3516    thread reclamation. However, if nNeed is zero, a best-effort attempt is made
3517    to fulfil nWish as far as possible, i.e. the function will attempt to create
3518    as many free slots as possible up to nWish.
3519 
3520    If any argument is negative, the behavior is undefined. */
3521 static int __kmp_expand_threads(int nWish, int nNeed) {
3522   int added = 0;
3523   int old_tp_cached;
3524   int __kmp_actual_max_nth;
3525 
3526   if (nNeed > nWish) /* normalize the arguments */
3527     nWish = nNeed;
3528 #if KMP_OS_WINDOWS && !defined KMP_DYNAMIC_LIB
3529   /* only for Windows static library */
3530   /* reclaim array entries for root threads that are already dead */
3531   added = __kmp_reclaim_dead_roots();
3532 
3533   if (nNeed) {
3534     nNeed -= added;
3535     if (nNeed < 0)
3536       nNeed = 0;
3537   }
3538   if (nWish) {
3539     nWish -= added;
3540     if (nWish < 0)
3541       nWish = 0;
3542   }
3543 #endif
3544   if (nWish <= 0)
3545     return added;
3546 
3547   while (1) {
3548     int nTarget;
3549     int minimumRequiredCapacity;
3550     int newCapacity;
3551     kmp_info_t **newThreads;
3552     kmp_root_t **newRoot;
3553 
3554     // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3555     // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3556     // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3557     // > __kmp_max_nth in one of two ways:
3558     //
3559     // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
    //    may not be reused by another thread, so we may need to increase
3561     //    __kmp_threads_capacity to __kmp_max_nth + 1.
3562     //
3563     // 2) New foreign root(s) are encountered.  We always register new foreign
3564     //    roots. This may cause a smaller # of threads to be allocated at
3565     //    subsequent parallel regions, but the worker threads hang around (and
3566     //    eventually go to sleep) and need slots in the __kmp_threads[] array.
3567     //
3568     // Anyway, that is the reason for moving the check to see if
3569     // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3570     // instead of having it performed here. -BB
3571     old_tp_cached = __kmp_tp_cached;
3572     __kmp_actual_max_nth =
3573         old_tp_cached ? __kmp_tp_capacity : __kmp_sys_max_nth;
3574     KMP_DEBUG_ASSERT(__kmp_actual_max_nth >= __kmp_threads_capacity);
3575 
3576     /* compute expansion headroom to check if we can expand and whether to aim
3577        for nWish or nNeed */
3578     nTarget = nWish;
3579     if (__kmp_actual_max_nth - __kmp_threads_capacity < nTarget) {
3580       /* can't fulfil nWish, so try nNeed */
3581       if (nNeed) {
3582         nTarget = nNeed;
3583         if (__kmp_actual_max_nth - __kmp_threads_capacity < nTarget) {
3584           /* possible expansion too small -- give up */
3585           break;
3586         }
3587       } else {
3588         /* best-effort */
3589         nTarget = __kmp_actual_max_nth - __kmp_threads_capacity;
3590         if (!nTarget) {
          /* cannot expand at all -- give up */
3592           break;
3593         }
3594       }
3595     }
3596     minimumRequiredCapacity = __kmp_threads_capacity + nTarget;
3597 
3598     newCapacity = __kmp_threads_capacity;
3599     do {
3600       newCapacity = newCapacity <= (__kmp_actual_max_nth >> 1)
3601                         ? (newCapacity << 1)
3602                         : __kmp_actual_max_nth;
3603     } while (newCapacity < minimumRequiredCapacity);
3604     newThreads = (kmp_info_t **)__kmp_allocate(
3605         (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity +
3606         CACHE_LINE);
3607     newRoot = (kmp_root_t **)((char *)newThreads +
3608                               sizeof(kmp_info_t *) * newCapacity);
3609     KMP_MEMCPY(newThreads, __kmp_threads,
3610                __kmp_threads_capacity * sizeof(kmp_info_t *));
3611     KMP_MEMCPY(newRoot, __kmp_root,
3612                __kmp_threads_capacity * sizeof(kmp_root_t *));
3613     memset(newThreads + __kmp_threads_capacity, 0,
3614            (newCapacity - __kmp_threads_capacity) * sizeof(kmp_info_t *));
3615     memset(newRoot + __kmp_threads_capacity, 0,
3616            (newCapacity - __kmp_threads_capacity) * sizeof(kmp_root_t *));
3617 
3618     if (!old_tp_cached && __kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3619       /* __kmp_tp_cached has changed, i.e. __kmpc_threadprivate_cached has
3620          allocated a threadprivate cache while we were allocating the expanded
3621          array, and our new capacity is larger than the threadprivate cache
3622          capacity, so we should deallocate the expanded arrays and try again.
3623          This is the first check of a double-check pair. */
3624       __kmp_free(newThreads);
3625       continue; /* start over and try again */
3626     }
3627     __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3628     if (!old_tp_cached && __kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3629       /* Same check as above, but this time with the lock so we can be sure if
3630          we can succeed. */
3631       __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3632       __kmp_free(newThreads);
3633       continue; /* start over and try again */
3634     } else {
3635       /* success */
3636       // __kmp_free( __kmp_threads ); // ATT: It leads to crash. Need to be
3637       // investigated.
3638       *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3639       *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3640       added += newCapacity - __kmp_threads_capacity;
3641       *(volatile int *)&__kmp_threads_capacity = newCapacity;
3642       __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3643       break; /* succeeded, so we can exit the loop */
3644     }
3645   }
3646   return added;
3647 }
3648 
3649 /* Register the current thread as a root thread and obtain our gtid. We must
   have the __kmp_initz_lock held at this point. The argument is TRUE only if
   we are the thread that calls from __kmp_do_serial_initialize() */
3652 int __kmp_register_root(int initial_thread) {
3653   kmp_info_t *root_thread;
3654   kmp_root_t *root;
3655   int gtid;
3656   int capacity;
3657   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3658   KA_TRACE(20, ("__kmp_register_root: entered\n"));
3659   KMP_MB();
3660 
3661   /* 2007-03-02:
     If the initial thread did not invoke the OpenMP RTL yet, and this thread
     is not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity"
     condition does not work as expected -- it may return false (meaning there
     is at least one empty slot in the __kmp_threads array), but it is possible
     the only free slot is #0, which is reserved for the initial thread and so
     cannot be used for this one. The following code works around this bug.

     However, the right solution seems to be not reserving slot #0 for the
     initial thread because:
     (1) there is no magic in slot #0,
     (2) we cannot detect the initial thread reliably (the first thread that
        performs serial initialization may not be the real initial thread).
3674   */
3675   capacity = __kmp_threads_capacity;
3676   if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3677     --capacity;
3678   }
3679 
3680   /* see if there are too many threads */
3681   if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1, 1)) {
3682     if (__kmp_tp_cached) {
3683       __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3684                   KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3685                   KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3686     } else {
3687       __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3688                   __kmp_msg_null);
3689     }
3690   }
3691 
3692   /* find an available thread slot */
3693   /* Don't reassign the zero slot since we need that to only be used by initial
3694      thread */
3695   for (gtid = (initial_thread ? 0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL;
3696        gtid++)
3697     ;
3698   KA_TRACE(1,
3699            ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3700   KMP_ASSERT(gtid < __kmp_threads_capacity);
3701 
3702   /* update global accounting */
3703   __kmp_all_nth++;
3704   TCW_4(__kmp_nth, __kmp_nth + 1);
3705 
3706   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3707   // numbers of procs, and method #2 (keyed API call) for higher numbers.
3708   if (__kmp_adjust_gtid_mode) {
3709     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3710       if (TCR_4(__kmp_gtid_mode) != 2) {
3711         TCW_4(__kmp_gtid_mode, 2);
3712       }
3713     } else {
3714       if (TCR_4(__kmp_gtid_mode) != 1) {
3715         TCW_4(__kmp_gtid_mode, 1);
3716       }
3717     }
3718   }
3719 
3720 #ifdef KMP_ADJUST_BLOCKTIME
3721   /* Adjust blocktime to zero if necessary            */
3722   /* Middle initialization might not have occurred yet */
3723   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3724     if (__kmp_nth > __kmp_avail_proc) {
3725       __kmp_zero_bt = TRUE;
3726     }
3727   }
3728 #endif /* KMP_ADJUST_BLOCKTIME */
3729 
3730   /* setup this new hierarchy */
3731   if (!(root = __kmp_root[gtid])) {
3732     root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3733     KMP_DEBUG_ASSERT(!root->r.r_root_team);
3734   }
3735 
3736 #if KMP_STATS_ENABLED
3737   // Initialize stats as soon as possible (right after gtid assignment).
3738   __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3739   KMP_START_EXPLICIT_TIMER(OMP_worker_thread_life);
3740   KMP_SET_THREAD_STATE(SERIAL_REGION);
3741   KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3742 #endif
3743   __kmp_initialize_root(root);
3744 
3745   /* setup new root thread structure */
3746   if (root->r.r_uber_thread) {
3747     root_thread = root->r.r_uber_thread;
3748   } else {
3749     root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3750     if (__kmp_storage_map) {
3751       __kmp_print_thread_storage_map(root_thread, gtid);
3752     }
3753     root_thread->th.th_info.ds.ds_gtid = gtid;
3754 #if OMPT_SUPPORT
3755     root_thread->th.ompt_thread_info.thread_data.ptr = NULL;
3756 #endif
3757     root_thread->th.th_root = root;
3758     if (__kmp_env_consistency_check) {
3759       root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3760     }
3761 #if USE_FAST_MEMORY
3762     __kmp_initialize_fast_memory(root_thread);
3763 #endif /* USE_FAST_MEMORY */
3764 
3765 #if KMP_USE_BGET
3766     KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3767     __kmp_initialize_bget(root_thread);
3768 #endif
3769     __kmp_init_random(root_thread); // Initialize random number generator
3770   }
3771 
3772   /* setup the serial team held in reserve by the root thread */
3773   if (!root_thread->th.th_serial_team) {
3774     kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3775     KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3776     root_thread->th.th_serial_team =
3777         __kmp_allocate_team(root, 1, 1,
3778 #if OMPT_SUPPORT
3779                             ompt_data_none, // root parallel id
3780 #endif
3781 #if OMP_40_ENABLED
3782                             proc_bind_default,
3783 #endif
3784                             &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3785   }
3786   KMP_ASSERT(root_thread->th.th_serial_team);
3787   KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3788                 root_thread->th.th_serial_team));
3789 
3790   /* drop root_thread into place */
3791   TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3792 
3793   root->r.r_root_team->t.t_threads[0] = root_thread;
3794   root->r.r_hot_team->t.t_threads[0] = root_thread;
3795   root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
  // AC: the team is created in reserve, not for execution (unused for now).
3797   root_thread->th.th_serial_team->t.t_serialized = 0;
3798   root->r.r_uber_thread = root_thread;
3799 
3800   /* initialize the thread, get it ready to go */
3801   __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3802   TCW_4(__kmp_init_gtid, TRUE);
3803 
3804   /* prepare the master thread for get_gtid() */
3805   __kmp_gtid_set_specific(gtid);
3806 
3807 #if USE_ITT_BUILD
3808   __kmp_itt_thread_name(gtid);
3809 #endif /* USE_ITT_BUILD */
3810 
3811 #ifdef KMP_TDATA_GTID
3812   __kmp_gtid = gtid;
3813 #endif
3814   __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3815   KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3816 
3817   KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3818                 "plain=%u\n",
3819                 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3820                 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3821                 KMP_INIT_BARRIER_STATE));
3822   { // Initialize barrier data.
3823     int b;
3824     for (b = 0; b < bs_last_barrier; ++b) {
3825       root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3826 #if USE_DEBUGGER
3827       root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3828 #endif
3829     }
3830   }
3831   KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3832                    KMP_INIT_BARRIER_STATE);
3833 
3834 #if KMP_AFFINITY_SUPPORTED
3835 #if OMP_40_ENABLED
3836   root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3837   root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3838   root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3839   root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3840 #endif
3841 
3842   if (TCR_4(__kmp_init_middle)) {
3843     __kmp_affinity_set_init_mask(gtid, TRUE);
3844   }
3845 #endif /* KMP_AFFINITY_SUPPORTED */
3846 
3847   __kmp_root_counter++;
3848 
3849 #if OMPT_SUPPORT
3850   if (!initial_thread && ompt_enabled.enabled) {
3851 
3852     ompt_thread_t *root_thread = ompt_get_thread();
3853 
3854     ompt_set_thread_state(root_thread, omp_state_overhead);
3855 
3856     if (ompt_enabled.ompt_callback_thread_begin) {
3857       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3858           ompt_thread_initial, __ompt_get_thread_data_internal());
3859     }
3860     ompt_data_t *task_data;
3861     __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
3862     if (ompt_enabled.ompt_callback_task_create) {
3863       ompt_callbacks.ompt_callback(ompt_callback_task_create)(
3864           NULL, NULL, task_data, ompt_task_initial, 0, NULL);
3865       // initial task has nothing to return to
3866     }
3867 
3868     ompt_set_thread_state(root_thread, omp_state_work_serial);
3869   }
3870 #endif
3871 
3872   KMP_MB();
3873   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3874 
3875   return gtid;
3876 }
3877 
3878 #if KMP_NESTED_HOT_TEAMS
3879 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3880                                 const int max_level) {
3881   int i, n, nth;
3882   kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3883   if (!hot_teams || !hot_teams[level].hot_team) {
3884     return 0;
3885   }
3886   KMP_DEBUG_ASSERT(level < max_level);
3887   kmp_team_t *team = hot_teams[level].hot_team;
3888   nth = hot_teams[level].hot_team_nth;
3889   n = nth - 1; // master is not freed
3890   if (level < max_level - 1) {
3891     for (i = 0; i < nth; ++i) {
3892       kmp_info_t *th = team->t.t_threads[i];
3893       n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3894       if (i > 0 && th->th.th_hot_teams) {
3895         __kmp_free(th->th.th_hot_teams);
3896         th->th.th_hot_teams = NULL;
3897       }
3898     }
3899   }
3900   __kmp_free_team(root, team, NULL);
3901   return n;
3902 }
3903 #endif
3904 
// Resets a root thread and clears its root and hot teams.
3906 // Returns the number of __kmp_threads entries directly and indirectly freed.
3907 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3908   kmp_team_t *root_team = root->r.r_root_team;
3909   kmp_team_t *hot_team = root->r.r_hot_team;
3910   int n = hot_team->t.t_nproc;
3911   int i;
3912 
3913   KMP_DEBUG_ASSERT(!root->r.r_active);
3914 
3915   root->r.r_root_team = NULL;
3916   root->r.r_hot_team = NULL;
3917   // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
  // before calling __kmp_free_team().
3919   __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3920 #if KMP_NESTED_HOT_TEAMS
3921   if (__kmp_hot_teams_max_level >
3922       0) { // need to free nested hot teams and their threads if any
3923     for (i = 0; i < hot_team->t.t_nproc; ++i) {
3924       kmp_info_t *th = hot_team->t.t_threads[i];
3925       if (__kmp_hot_teams_max_level > 1) {
3926         n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3927       }
3928       if (th->th.th_hot_teams) {
3929         __kmp_free(th->th.th_hot_teams);
3930         th->th.th_hot_teams = NULL;
3931       }
3932     }
3933   }
3934 #endif
3935   __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3936 
3937   // Before we can reap the thread, we need to make certain that all other
3938   // threads in the teams that had this root as ancestor have stopped trying to
3939   // steal tasks.
3940   if (__kmp_tasking_mode != tskm_immediate_exec) {
3941     __kmp_wait_to_unref_task_teams();
3942   }
3943 
3944 #if KMP_OS_WINDOWS
3945   /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3946   KA_TRACE(
3947       10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3948            "\n",
3949            (LPVOID) & (root->r.r_uber_thread->th),
3950            root->r.r_uber_thread->th.th_info.ds.ds_thread));
3951   __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3952 #endif /* KMP_OS_WINDOWS */
3953 
3954 #if OMPT_SUPPORT
3955   if (ompt_enabled.ompt_callback_thread_end) {
3956     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3957         &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3958   }
3959 #endif
3960 
3961   TCW_4(__kmp_nth,
3962         __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3963   root->r.r_cg_nthreads--;
3964 
3965   __kmp_reap_thread(root->r.r_uber_thread, 1);
3966 
  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
  // it instead of freeing it.
3969   root->r.r_uber_thread = NULL;
3970   /* mark root as no longer in use */
3971   root->r.r_begin = FALSE;
3972 
3973   return n;
3974 }
3975 
3976 void __kmp_unregister_root_current_thread(int gtid) {
3977   KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
3978   /* this lock should be ok, since unregister_root_current_thread is never
3979      called during an abort, only during a normal close. furthermore, if you
3980      have the forkjoin lock, you should never try to get the initz lock */
3981   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3982   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
3983     KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
3984                   "exiting T#%d\n",
3985                   gtid));
3986     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3987     return;
3988   }
3989   kmp_root_t *root = __kmp_root[gtid];
3990 
3991   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3992   KMP_ASSERT(KMP_UBER_GTID(gtid));
3993   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3994   KMP_ASSERT(root->r.r_active == FALSE);
3995 
3996   KMP_MB();
3997 
3998 #if OMP_45_ENABLED
3999   kmp_info_t *thread = __kmp_threads[gtid];
4000   kmp_team_t *team = thread->th.th_team;
4001   kmp_task_team_t *task_team = thread->th.th_task_team;
4002 
4003   // we need to wait for the proxy tasks before finishing the thread
4004   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
4005 #if OMPT_SUPPORT
4006     // the runtime is shutting down so we won't report any events
4007     thread->th.ompt_thread_info.state = omp_state_undefined;
4008 #endif
4009     __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4010   }
4011 #endif
4012 
4013   __kmp_reset_root(gtid, root);
4014 
4015   /* free up this thread slot */
4016   __kmp_gtid_set_specific(KMP_GTID_DNE);
4017 #ifdef KMP_TDATA_GTID
4018   __kmp_gtid = KMP_GTID_DNE;
4019 #endif
4020 
4021   KMP_MB();
4022   KC_TRACE(10,
4023            ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4024 
4025   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4026 }
4027 
4028 #if KMP_OS_WINDOWS
4029 /* __kmp_forkjoin_lock must be already held
4030    Unregisters a root thread that is not the current thread.  Returns the number
4031    of __kmp_threads entries freed as a result. */
4032 static int __kmp_unregister_root_other_thread(int gtid) {
4033   kmp_root_t *root = __kmp_root[gtid];
4034   int r;
4035 
4036   KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4037   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4038   KMP_ASSERT(KMP_UBER_GTID(gtid));
4039   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4040   KMP_ASSERT(root->r.r_active == FALSE);
4041 
4042   r = __kmp_reset_root(gtid, root);
4043   KC_TRACE(10,
4044            ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4045   return r;
4046 }
4047 #endif
4048 
4049 #if KMP_DEBUG
4050 void __kmp_task_info() {
4051 
4052   kmp_int32 gtid = __kmp_entry_gtid();
4053   kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4054   kmp_info_t *this_thr = __kmp_threads[gtid];
4055   kmp_team_t *steam = this_thr->th.th_serial_team;
4056   kmp_team_t *team = this_thr->th.th_team;
4057 
4058   __kmp_printf("__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p curtask=%p "
4059                "ptask=%p\n",
4060                gtid, tid, this_thr, team, this_thr->th.th_current_task,
4061                team->t.t_implicit_task_taskdata[tid].td_parent);
4062 }
4063 #endif // KMP_DEBUG
4064 
4065 /* TODO optimize with one big memclr, take out what isn't needed, split
4066    responsibility to workers as much as possible, and delay initialization of
4067    features as much as possible  */
4068 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4069                                   int tid, int gtid) {
  /* this_thr->th.th_info.ds.ds_gtid is set up in
     kmp_allocate_thread/create_worker.
     this_thr->th.th_serial_team is set up in __kmp_allocate_thread */
4073   kmp_info_t *master = team->t.t_threads[0];
4074   KMP_DEBUG_ASSERT(this_thr != NULL);
4075   KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4076   KMP_DEBUG_ASSERT(team);
4077   KMP_DEBUG_ASSERT(team->t.t_threads);
4078   KMP_DEBUG_ASSERT(team->t.t_dispatch);
4079   KMP_DEBUG_ASSERT(master);
4080   KMP_DEBUG_ASSERT(master->th.th_root);
4081 
4082   KMP_MB();
4083 
4084   TCW_SYNC_PTR(this_thr->th.th_team, team);
4085 
4086   this_thr->th.th_info.ds.ds_tid = tid;
4087   this_thr->th.th_set_nproc = 0;
4088   if (__kmp_tasking_mode != tskm_immediate_exec)
4089     // When tasking is possible, threads are not safe to reap until they are
4090     // done tasking; this will be set when tasking code is exited in wait
4091     this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4092   else // no tasking --> always safe to reap
4093     this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4094 #if OMP_40_ENABLED
4095   this_thr->th.th_set_proc_bind = proc_bind_default;
4096 #if KMP_AFFINITY_SUPPORTED
4097   this_thr->th.th_new_place = this_thr->th.th_current_place;
4098 #endif
4099 #endif
4100   this_thr->th.th_root = master->th.th_root;
4101 
4102   /* setup the thread's cache of the team structure */
4103   this_thr->th.th_team_nproc = team->t.t_nproc;
4104   this_thr->th.th_team_master = master;
4105   this_thr->th.th_team_serialized = team->t.t_serialized;
4106   TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4107 
4108   KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4109 
4110   KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4111                 tid, gtid, this_thr, this_thr->th.th_current_task));
4112 
4113   __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4114                            team, tid, TRUE);
4115 
4116   KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4117                 tid, gtid, this_thr, this_thr->th.th_current_task));
4118   // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4119   // __kmp_initialize_team()?
4120 
4121   /* TODO no worksharing in speculative threads */
4122   this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4123 
4124   this_thr->th.th_local.this_construct = 0;
4125 
4126   if (!this_thr->th.th_pri_common) {
4127     this_thr->th.th_pri_common =
4128         (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4129     if (__kmp_storage_map) {
4130       __kmp_print_storage_map_gtid(
4131           gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4132           sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4133     }
4134     this_thr->th.th_pri_head = NULL;
4135   }
4136 
4137   /* Initialize dynamic dispatch */
4138   {
4139     volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4140     // Use team max_nproc since this will never change for the team.
4141     size_t disp_size =
4142         sizeof(dispatch_private_info_t) *
4143         (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4144     KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4145                   team->t.t_max_nproc));
4146     KMP_ASSERT(dispatch);
4147     KMP_DEBUG_ASSERT(team->t.t_dispatch);
4148     KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4149 
4150     dispatch->th_disp_index = 0;
4151 #if OMP_45_ENABLED
4152     dispatch->th_doacross_buf_idx = 0;
4153 #endif
4154     if (!dispatch->th_disp_buffer) {
4155       dispatch->th_disp_buffer =
4156           (dispatch_private_info_t *)__kmp_allocate(disp_size);
4157 
4158       if (__kmp_storage_map) {
4159         __kmp_print_storage_map_gtid(
4160             gtid, &dispatch->th_disp_buffer[0],
4161             &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4162                                           ? 1
4163                                           : __kmp_dispatch_num_buffers],
4164             disp_size, "th_%d.th_dispatch.th_disp_buffer "
4165                        "(team_%d.t_dispatch[%d].th_disp_buffer)",
4166             gtid, team->t.t_id, gtid);
4167       }
4168     } else {
4169       memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4170     }
4171 
4172     dispatch->th_dispatch_pr_current = 0;
4173     dispatch->th_dispatch_sh_current = 0;
4174 
4175     dispatch->th_deo_fcn = 0; /* ORDERED     */
4176     dispatch->th_dxo_fcn = 0; /* END ORDERED */
4177   }
4178 
4179   this_thr->th.th_next_pool = NULL;
4180 
4181   if (!this_thr->th.th_task_state_memo_stack) {
4182     size_t i;
4183     this_thr->th.th_task_state_memo_stack =
4184         (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4185     this_thr->th.th_task_state_top = 0;
4186     this_thr->th.th_task_state_stack_sz = 4;
4187     for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4188          ++i) // zero init the stack
4189       this_thr->th.th_task_state_memo_stack[i] = 0;
4190   }
4191 
4192   KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4193   KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4194 
4195   KMP_MB();
4196 }
4197 
4198 /* Allocate a new thread for the requesting team. This is only called from
4199    within a forkjoin critical section. We first try to get an available
4200    thread from the thread pool; if none is available, we fork a new one,
4201    assuming we are able to create one. This should be assured, as the
4202    caller is expected to have checked for that first. */
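/* A minimal caller sketch (illustrative; it mirrors how the hot team is grown
   later in __kmp_allocate_team(), and assumes the caller already holds the
   forkjoin critical section):

     for (f = team->t.t_nproc; f < new_nproc; f++) {
       kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
       KMP_DEBUG_ASSERT(new_worker);
       team->t.t_threads[f] = new_worker; // tid of the new thread == f
     }
*/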
4203 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4204                                   int new_tid) {
4205   kmp_team_t *serial_team;
4206   kmp_info_t *new_thr;
4207   int new_gtid;
4208 
4209   KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4210   KMP_DEBUG_ASSERT(root && team);
4211 #if !KMP_NESTED_HOT_TEAMS
4212   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4213 #endif
4214   KMP_MB();
4215 
4216   /* first, try to get one from the thread pool */
4217   if (__kmp_thread_pool) {
4218 
4219     new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4220     __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4221     if (new_thr == __kmp_thread_pool_insert_pt) {
4222       __kmp_thread_pool_insert_pt = NULL;
4223     }
4224     TCW_4(new_thr->th.th_in_pool, FALSE);
4225     // Don't touch th_active_in_pool or th_active.
4226     // The worker thread adjusts those flags as it sleeps/awakens.
4227     __kmp_thread_pool_nth--;
4228 
4229     KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4230                   __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4231     KMP_ASSERT(!new_thr->th.th_team);
4232     KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4233     KMP_DEBUG_ASSERT(__kmp_thread_pool_nth >= 0);
4234 
4235     /* setup the thread structure */
4236     __kmp_initialize_info(new_thr, team, new_tid,
4237                           new_thr->th.th_info.ds.ds_gtid);
4238     KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4239 
4240     TCW_4(__kmp_nth, __kmp_nth + 1);
4241     root->r.r_cg_nthreads++;
4242 
4243     new_thr->th.th_task_state = 0;
4244     new_thr->th.th_task_state_top = 0;
4245     new_thr->th.th_task_state_stack_sz = 4;
4246 
4247 #ifdef KMP_ADJUST_BLOCKTIME
4248     /* Adjust blocktime back to zero if necessary */
4249     /* Middle initialization might not have occurred yet */
4250     if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4251       if (__kmp_nth > __kmp_avail_proc) {
4252         __kmp_zero_bt = TRUE;
4253       }
4254     }
4255 #endif /* KMP_ADJUST_BLOCKTIME */
4256 
4257 #if KMP_DEBUG
4258     // If the thread entered the pool via __kmp_free_thread, wait_flag should
4259     // not equal KMP_BARRIER_PARENT_FLAG.
4260     int b;
4261     kmp_balign_t *balign = new_thr->th.th_bar;
4262     for (b = 0; b < bs_last_barrier; ++b)
4263       KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4264 #endif
4265 
4266     KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4267                   __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4268 
4269     KMP_MB();
4270     return new_thr;
4271   }
4272 
4273   /* no, we'll fork a new one */
4274   KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4275   KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4276 
4277 #if KMP_USE_MONITOR
4278   // If this is the first worker thread the RTL is creating, then also
4279   // launch the monitor thread.  We try to do this as early as possible.
4280   if (!TCR_4(__kmp_init_monitor)) {
4281     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4282     if (!TCR_4(__kmp_init_monitor)) {
4283       KF_TRACE(10, ("before __kmp_create_monitor\n"));
4284       TCW_4(__kmp_init_monitor, 1);
4285       __kmp_create_monitor(&__kmp_monitor);
4286       KF_TRACE(10, ("after __kmp_create_monitor\n"));
4287 #if KMP_OS_WINDOWS
4288       // AC: wait until the monitor has started. This is a fix for CQ232808.
4289       // The reason is that if the library is loaded/unloaded in a loop with
4290       // small (parallel) work in between, there is a high probability that the
4291       // monitor thread starts only after the library has shut down. At shutdown
4292       // it is too late to cope with the problem, because when the master is in
4293       // DllMain (process detach) the monitor has no chance to start (it is
4294       // blocked), and the master has no means to inform the monitor that the
4295       // library has gone, because all the memory the monitor can access is
4296       // going to be released/reset.
4297       while (TCR_4(__kmp_init_monitor) < 2) {
4298         KMP_YIELD(TRUE);
4299       }
4300       KF_TRACE(10, ("after monitor thread has started\n"));
4301 #endif
4302     }
4303     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4304   }
4305 #endif
4306 
4307   KMP_MB();
4308   for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) {
4309     KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4310   }
4311 
4312   /* allocate space for it. */
4313   new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4314 
4315   TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4316 
4317   if (__kmp_storage_map) {
4318     __kmp_print_thread_storage_map(new_thr, new_gtid);
4319   }
4320 
4321   // add the reserve serialized team, initialized from the team's master thread
4322   {
4323     kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4324     KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4325     new_thr->th.th_serial_team = serial_team =
4326         (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4327 #if OMPT_SUPPORT
4328                                           ompt_data_none, // root parallel id
4329 #endif
4330 #if OMP_40_ENABLED
4331                                           proc_bind_default,
4332 #endif
4333                                           &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
4334   }
4335   KMP_ASSERT(serial_team);
4336   serial_team->t.t_serialized = 0; // AC: the team is created in reserve, not
4337   // for execution (it is unused for now).
4338   serial_team->t.t_threads[0] = new_thr;
4339   KF_TRACE(10,
4340            ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4341             new_thr));
4342 
4343   /* setup the thread structures */
4344   __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4345 
4346 #if USE_FAST_MEMORY
4347   __kmp_initialize_fast_memory(new_thr);
4348 #endif /* USE_FAST_MEMORY */
4349 
4350 #if KMP_USE_BGET
4351   KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4352   __kmp_initialize_bget(new_thr);
4353 #endif
4354 
4355   __kmp_init_random(new_thr); // Initialize random number generator
4356 
4357   /* Initialize these only once when thread is grabbed for a team allocation */
4358   KA_TRACE(20,
4359            ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4360             __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4361 
4362   int b;
4363   kmp_balign_t *balign = new_thr->th.th_bar;
4364   for (b = 0; b < bs_last_barrier; ++b) {
4365     balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4366     balign[b].bb.team = NULL;
4367     balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4368     balign[b].bb.use_oncore_barrier = 0;
4369   }
4370 
4371   new_thr->th.th_spin_here = FALSE;
4372   new_thr->th.th_next_waiting = 0;
4373 
4374 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4375   new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4376   new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4377   new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4378   new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4379 #endif
4380 
4381   TCW_4(new_thr->th.th_in_pool, FALSE);
4382   new_thr->th.th_active_in_pool = FALSE;
4383   TCW_4(new_thr->th.th_active, TRUE);
4384 
4385   /* adjust the global counters */
4386   __kmp_all_nth++;
4387   __kmp_nth++;
4388 
4389   root->r.r_cg_nthreads++;
4390 
4391   // If __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4392   // thread counts and method #2 (keyed API call) for higher thread counts.
4393   if (__kmp_adjust_gtid_mode) {
4394     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4395       if (TCR_4(__kmp_gtid_mode) != 2) {
4396         TCW_4(__kmp_gtid_mode, 2);
4397       }
4398     } else {
4399       if (TCR_4(__kmp_gtid_mode) != 1) {
4400         TCW_4(__kmp_gtid_mode, 1);
4401       }
4402     }
4403   }
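  // Worked example (the threshold value here is illustrative, not a default):
  // if __kmp_tls_gtid_min were 20, the runtime would keep gtid mode 1
  // (stack-pointer search) until the 20th thread is registered and then switch
  // to mode 2 (keyed API lookup); the TCR_4 checks above avoid redundant
  // writes to the shared flag when the mode is already correct.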
4404 
4405 #ifdef KMP_ADJUST_BLOCKTIME
4406   /* Adjust blocktime back to zero if necessary       */
4407   /* Middle initialization might not have occurred yet */
4408   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4409     if (__kmp_nth > __kmp_avail_proc) {
4410       __kmp_zero_bt = TRUE;
4411     }
4412   }
4413 #endif /* KMP_ADJUST_BLOCKTIME */
4414 
4415   /* actually fork it and create the new worker thread */
4416   KF_TRACE(
4417       10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4418   __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4419   KF_TRACE(10,
4420            ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4421 
4422   KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4423                 new_gtid));
4424   KMP_MB();
4425   return new_thr;
4426 }
4427 
4428 /* Reinitialize team for reuse.
4429    The hot team code calls this routine at every fork barrier, so the EPCC
4430    barrier tests are extremely sensitive to changes in it, especially writes
4431    to the team struct, which cause a cache invalidation in all threads.
4432    IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4433 static void __kmp_reinitialize_team(kmp_team_t *team,
4434                                     kmp_internal_control_t *new_icvs,
4435                                     ident_t *loc) {
4436   KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4437                 team->t.t_threads[0], team));
4438   KMP_DEBUG_ASSERT(team && new_icvs);
4439   KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4440   KMP_CHECK_UPDATE(team->t.t_ident, loc);
4441 
4442   KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4443   // Copy ICVs to the master thread's implicit taskdata
4444   __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4445   copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4446 
4447   KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4448                 team->t.t_threads[0], team));
4449 }
4450 
4451 /* Initialize the team data structure.
4452    This assumes the t_threads and t_max_nproc are already set.
4453    Also, we don't touch the arguments */
4454 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4455                                   kmp_internal_control_t *new_icvs,
4456                                   ident_t *loc) {
4457   KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4458 
4459   /* verify */
4460   KMP_DEBUG_ASSERT(team);
4461   KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4462   KMP_DEBUG_ASSERT(team->t.t_threads);
4463   KMP_MB();
4464 
4465   team->t.t_master_tid = 0; /* not needed */
4466   /* team->t.t_master_bar;        not needed */
4467   team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4468   team->t.t_nproc = new_nproc;
4469 
4470   /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
4471   team->t.t_next_pool = NULL;
4472   /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4473    * up hot team */
4474 
4475   TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4476   team->t.t_invoke = NULL; /* not needed */
4477 
4478   // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
4479   team->t.t_sched = new_icvs->sched;
4480 
4481 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4482   team->t.t_fp_control_saved = FALSE; /* not needed */
4483   team->t.t_x87_fpu_control_word = 0; /* not needed */
4484   team->t.t_mxcsr = 0; /* not needed */
4485 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4486 
4487   team->t.t_construct = 0;
4488 
4489   team->t.t_ordered.dt.t_value = 0;
4490   team->t.t_master_active = FALSE;
4491 
4492   memset(&team->t.t_taskq, '\0', sizeof(kmp_taskq_t));
4493 
4494 #ifdef KMP_DEBUG
4495   team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4496 #endif
4497   team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4498 
4499   team->t.t_control_stack_top = NULL;
4500 
4501   __kmp_reinitialize_team(team, new_icvs, loc);
4502 
4503   KMP_MB();
4504   KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4505 }
4506 
4507 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4508 /* Sets full mask for the thread; stores old mask in *old_mask. No changes to structures. */
4509 static void
4510 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4511   if (KMP_AFFINITY_CAPABLE()) {
4512     int status;
4513     if (old_mask != NULL) {
4514       status = __kmp_get_system_affinity(old_mask, TRUE);
4515       int error = errno;
4516       if (status != 0) {
4517         __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4518                     __kmp_msg_null);
4519       }
4520     }
4521     __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4522   }
4523 }
4524 #endif
4525 
4526 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4527 
4528 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4529 // It calculates the worker + master thread's partition based upon the parent
4530 // thread's partition, and binds each worker to a place in its partition.
4531 // The master thread's partition should already include its current binding.
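// Example (place/thread counts are illustrative): with a master partition of
// four places [0,3], the master bound to place 0, and a team of 4 threads:
//   - proc_bind_master keeps the full [0,3] partition for every worker and
//     binds each worker to the master's place 0;
//   - proc_bind_close also keeps the full [0,3] partition and binds the three
//     workers to the consecutive places 1, 2 and 3;
//   - proc_bind_spread narrows each thread to its own sub-partition, here one
//     place per thread: [0,0], [1,1], [2,2] and [3,3].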
4532 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4533   // Copy the master thread's place partition to the team struct
4534   kmp_info_t *master_th = team->t.t_threads[0];
4535   KMP_DEBUG_ASSERT(master_th != NULL);
4536   kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4537   int first_place = master_th->th.th_first_place;
4538   int last_place = master_th->th.th_last_place;
4539   int masters_place = master_th->th.th_current_place;
4540   team->t.t_first_place = first_place;
4541   team->t.t_last_place = last_place;
4542 
4543   KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4544                 "bound to place %d partition = [%d,%d]\n",
4545                 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4546                 team->t.t_id, masters_place, first_place, last_place));
4547 
4548   switch (proc_bind) {
4549 
4550   case proc_bind_default:
4551     // Serial teams might have the proc_bind policy set to proc_bind_default. It
4552     // doesn't matter, as we don't rebind the master thread for any policy.
4553     KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4554     break;
4555 
4556   case proc_bind_master: {
4557     int f;
4558     int n_th = team->t.t_nproc;
4559     for (f = 1; f < n_th; f++) {
4560       kmp_info_t *th = team->t.t_threads[f];
4561       KMP_DEBUG_ASSERT(th != NULL);
4562       th->th.th_first_place = first_place;
4563       th->th.th_last_place = last_place;
4564       th->th.th_new_place = masters_place;
4565 
4566       KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
4567                      "partition = [%d,%d]\n",
4568                      __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4569                      f, masters_place, first_place, last_place));
4570     }
4571   } break;
4572 
4573   case proc_bind_close: {
4574     int f;
4575     int n_th = team->t.t_nproc;
4576     int n_places;
4577     if (first_place <= last_place) {
4578       n_places = last_place - first_place + 1;
4579     } else {
4580       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4581     }
4582     if (n_th <= n_places) {
4583       int place = masters_place;
4584       for (f = 1; f < n_th; f++) {
4585         kmp_info_t *th = team->t.t_threads[f];
4586         KMP_DEBUG_ASSERT(th != NULL);
4587 
4588         if (place == last_place) {
4589           place = first_place;
4590         } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4591           place = 0;
4592         } else {
4593           place++;
4594         }
4595         th->th.th_first_place = first_place;
4596         th->th.th_last_place = last_place;
4597         th->th.th_new_place = place;
4598 
4599         KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4600                        "partition = [%d,%d]\n",
4601                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4602                        team->t.t_id, f, place, first_place, last_place));
4603       }
4604     } else {
4605       int S, rem, gap, s_count;
4606       S = n_th / n_places;
4607       s_count = 0;
4608       rem = n_th - (S * n_places);
4609       gap = rem > 0 ? n_places / rem : n_places;
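      // Worked example (illustrative): n_th = 6 threads over n_places = 4
      // places gives S = 1, rem = 2, gap = 2. Starting at the master's place,
      // the loop below assigns 2, 1, 2, 1 threads to the four consecutive
      // places, i.e. the remainder threads are spread every 'gap' places
      // instead of being packed onto the first places.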
4610       int place = masters_place;
4611       int gap_ct = gap;
4612       for (f = 0; f < n_th; f++) {
4613         kmp_info_t *th = team->t.t_threads[f];
4614         KMP_DEBUG_ASSERT(th != NULL);
4615 
4616         th->th.th_first_place = first_place;
4617         th->th.th_last_place = last_place;
4618         th->th.th_new_place = place;
4619         s_count++;
4620 
4621         if ((s_count == S) && rem && (gap_ct == gap)) {
4622           // do nothing, add an extra thread to place on next iteration
4623         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4624           // we added an extra thread to this place; move to next place
4625           if (place == last_place) {
4626             place = first_place;
4627           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4628             place = 0;
4629           } else {
4630             place++;
4631           }
4632           s_count = 0;
4633           gap_ct = 1;
4634           rem--;
4635         } else if (s_count == S) { // place full; don't add extra
4636           if (place == last_place) {
4637             place = first_place;
4638           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4639             place = 0;
4640           } else {
4641             place++;
4642           }
4643           gap_ct++;
4644           s_count = 0;
4645         }
4646 
4647         KA_TRACE(100,
4648                  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4649                   "partition = [%d,%d]\n",
4650                   __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4651                   th->th.th_new_place, first_place, last_place));
4652       }
4653       KMP_DEBUG_ASSERT(place == masters_place);
4654     }
4655   } break;
4656 
4657   case proc_bind_spread: {
4658     int f;
4659     int n_th = team->t.t_nproc;
4660     int n_places;
4661     int thidx;
4662     if (first_place <= last_place) {
4663       n_places = last_place - first_place + 1;
4664     } else {
4665       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4666     }
4667     if (n_th <= n_places) {
4668       int place = -1;
4669 
4670       if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4671         int S = n_places / n_th;
4672         int s_count, rem, gap, gap_ct;
4673 
4674         place = masters_place;
4675         rem = n_places - n_th * S;
4676         gap = rem ? n_th / rem : 1;
4677         gap_ct = gap;
4678         thidx = n_th;
4679         if (update_master_only == 1)
4680           thidx = 1;
4681         for (f = 0; f < thidx; f++) {
4682           kmp_info_t *th = team->t.t_threads[f];
4683           KMP_DEBUG_ASSERT(th != NULL);
4684 
4685           th->th.th_first_place = place;
4686           th->th.th_new_place = place;
4687           s_count = 1;
4688           while (s_count < S) {
4689             if (place == last_place) {
4690               place = first_place;
4691             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4692               place = 0;
4693             } else {
4694               place++;
4695             }
4696             s_count++;
4697           }
4698           if (rem && (gap_ct == gap)) {
4699             if (place == last_place) {
4700               place = first_place;
4701             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4702               place = 0;
4703             } else {
4704               place++;
4705             }
4706             rem--;
4707             gap_ct = 0;
4708           }
4709           th->th.th_last_place = place;
4710           gap_ct++;
4711 
4712           if (place == last_place) {
4713             place = first_place;
4714           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4715             place = 0;
4716           } else {
4717             place++;
4718           }
4719 
4720           KA_TRACE(100,
4721                    ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4722                     "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4723                     __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4724                     f, th->th.th_new_place, th->th.th_first_place,
4725                     th->th.th_last_place, __kmp_affinity_num_masks));
4726         }
4727       } else {
4728         /* With a uniform space of available computation places we can create
4729            T partitions of roughly P/T places each and put each thread into
4730            the first place of its partition. */
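        /* Worked example (illustrative): P = n_places = 8, T = n_th = 3, with
           the master on place 0: spacing = (8 + 1) / 3 = 3.0, so the loop
           below produces the partitions [0,2], [3,5] and [6,7] (the last one
           clipped to n_places - 1), and each thread is bound to the first
           place of its partition. */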
4731         double current = static_cast<double>(masters_place);
4732         double spacing =
4733             (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
4734         int first, last;
4735         kmp_info_t *th;
4736 
4737         thidx = n_th + 1;
4738         if (update_master_only == 1)
4739           thidx = 1;
4740         for (f = 0; f < thidx; f++) {
4741           first = static_cast<int>(current);
4742           last = static_cast<int>(current + spacing) - 1;
4743           KMP_DEBUG_ASSERT(last >= first);
4744           if (first >= n_places) {
4745             if (masters_place) {
4746               first -= n_places;
4747               last -= n_places;
4748               if (first == (masters_place + 1)) {
4749                 KMP_DEBUG_ASSERT(f == n_th);
4750                 first--;
4751               }
4752               if (last == masters_place) {
4753                 KMP_DEBUG_ASSERT(f == (n_th - 1));
4754                 last--;
4755               }
4756             } else {
4757               KMP_DEBUG_ASSERT(f == n_th);
4758               first = 0;
4759               last = 0;
4760             }
4761           }
4762           if (last >= n_places) {
4763             last = (n_places - 1);
4764           }
4765           place = first;
4766           current += spacing;
4767           if (f < n_th) {
4768             KMP_DEBUG_ASSERT(0 <= first);
4769             KMP_DEBUG_ASSERT(n_places > first);
4770             KMP_DEBUG_ASSERT(0 <= last);
4771             KMP_DEBUG_ASSERT(n_places > last);
4772             KMP_DEBUG_ASSERT(last_place >= first_place);
4773             th = team->t.t_threads[f];
4774             KMP_DEBUG_ASSERT(th);
4775             th->th.th_first_place = first;
4776             th->th.th_new_place = place;
4777             th->th.th_last_place = last;
4778 
4779             KA_TRACE(100,
4780                      ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4781                       "partition = [%d,%d], spacing = %.4f\n",
4782                       __kmp_gtid_from_thread(team->t.t_threads[f]),
4783                       team->t.t_id, f, th->th.th_new_place,
4784                       th->th.th_first_place, th->th.th_last_place, spacing));
4785           }
4786         }
4787       }
4788       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4789     } else {
4790       int S, rem, gap, s_count;
4791       S = n_th / n_places;
4792       s_count = 0;
4793       rem = n_th - (S * n_places);
4794       gap = rem > 0 ? n_places / rem : n_places;
4795       int place = masters_place;
4796       int gap_ct = gap;
4797       thidx = n_th;
4798       if (update_master_only == 1)
4799         thidx = 1;
4800       for (f = 0; f < thidx; f++) {
4801         kmp_info_t *th = team->t.t_threads[f];
4802         KMP_DEBUG_ASSERT(th != NULL);
4803 
4804         th->th.th_first_place = place;
4805         th->th.th_last_place = place;
4806         th->th.th_new_place = place;
4807         s_count++;
4808 
4809         if ((s_count == S) && rem && (gap_ct == gap)) {
4810           // do nothing, add an extra thread to place on next iteration
4811         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4812           // we added an extra thread to this place; move on to next place
4813           if (place == last_place) {
4814             place = first_place;
4815           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4816             place = 0;
4817           } else {
4818             place++;
4819           }
4820           s_count = 0;
4821           gap_ct = 1;
4822           rem--;
4823         } else if (s_count == S) { // place is full; don't add extra thread
4824           if (place == last_place) {
4825             place = first_place;
4826           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4827             place = 0;
4828           } else {
4829             place++;
4830           }
4831           gap_ct++;
4832           s_count = 0;
4833         }
4834 
4835         KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4836                        "partition = [%d,%d]\n",
4837                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4838                        team->t.t_id, f, th->th.th_new_place,
4839                        th->th.th_first_place, th->th.th_last_place));
4840       }
4841       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4842     }
4843   } break;
4844 
4845   default:
4846     break;
4847   }
4848 
4849   KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4850 }
4851 
4852 #endif /* OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED */
4853 
4854 /* allocate a new team data structure to use.  take one off of the free pool if
4855    available */
4856 kmp_team_t *
4857 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4858 #if OMPT_SUPPORT
4859                     ompt_data_t ompt_parallel_data,
4860 #endif
4861 #if OMP_40_ENABLED
4862                     kmp_proc_bind_t new_proc_bind,
4863 #endif
4864                     kmp_internal_control_t *new_icvs,
4865                     int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4866   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4867   int f;
4868   kmp_team_t *team;
4869   int use_hot_team = !root->r.r_active;
4870   int level = 0;
4871 
4872   KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4873   KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4874   KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4875   KMP_MB();
4876 
4877 #if KMP_NESTED_HOT_TEAMS
4878   kmp_hot_team_ptr_t *hot_teams;
4879   if (master) {
4880     team = master->th.th_team;
4881     level = team->t.t_active_level;
4882     if (master->th.th_teams_microtask) { // in teams construct?
4883       if (master->th.th_teams_size.nteams > 1 &&
4884           ( // #teams > 1
4885               team->t.t_pkfn ==
4886                   (microtask_t)__kmp_teams_master || // inner fork of the teams
4887               master->th.th_teams_level <
4888                   team->t.t_level)) { // or nested parallel inside the teams
4889         ++level; // do not increment if #teams==1 or for the outer fork of the
4890         // teams; increment otherwise
4891       }
4892     }
4893     hot_teams = master->th.th_hot_teams;
4894     if (level < __kmp_hot_teams_max_level && hot_teams &&
4895         hot_teams[level]
4896             .hot_team) { // hot team has already been allocated for given level
4897       use_hot_team = 1;
4898     } else {
4899       use_hot_team = 0;
4900     }
4901   }
4902 #endif
4903   // Optimization to use a "hot" team
4904   if (use_hot_team && new_nproc > 1) {
4905     KMP_DEBUG_ASSERT(new_nproc == max_nproc);
4906 #if KMP_NESTED_HOT_TEAMS
4907     team = hot_teams[level].hot_team;
4908 #else
4909     team = root->r.r_hot_team;
4910 #endif
4911 #if KMP_DEBUG
4912     if (__kmp_tasking_mode != tskm_immediate_exec) {
4913       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
4914                     "task_team[1] = %p before reinit\n",
4915                     team->t.t_task_team[0], team->t.t_task_team[1]));
4916     }
4917 #endif
4918 
4919     // Has the number of threads changed?
4920     /* Let's assume the most common case is that the number of threads is
4921        unchanged, and put that case first. */
4922     if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
4923       KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
4924       // This case can mean that omp_set_num_threads() was called and the hot
4925       // team size was already reduced, so we check the special flag
4926       if (team->t.t_size_changed == -1) {
4927         team->t.t_size_changed = 1;
4928       } else {
4929         KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
4930       }
4931 
4932       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4933       kmp_r_sched_t new_sched = new_icvs->sched;
4934       if (team->t.t_sched.r_sched_type != new_sched.r_sched_type ||
4935           team->t.t_sched.chunk != new_sched.chunk)
4936         team->t.t_sched =
4937             new_sched; // set master's schedule as new run-time schedule
4938 
4939       __kmp_reinitialize_team(team, new_icvs,
4940                               root->r.r_uber_thread->th.th_ident);
4941 
4942       KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
4943                     team->t.t_threads[0], team));
4944       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
4945 
4946 #if OMP_40_ENABLED
4947 #if KMP_AFFINITY_SUPPORTED
4948       if ((team->t.t_size_changed == 0) &&
4949           (team->t.t_proc_bind == new_proc_bind)) {
4950         if (new_proc_bind == proc_bind_spread) {
4951           __kmp_partition_places(
4952               team, 1); // add flag to update only master for spread
4953         }
4954         KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
4955                        "proc_bind = %d, partition = [%d,%d]\n",
4956                        team->t.t_id, new_proc_bind, team->t.t_first_place,
4957                        team->t.t_last_place));
4958       } else {
4959         KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4960         __kmp_partition_places(team);
4961       }
4962 #else
4963       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4964 #endif /* KMP_AFFINITY_SUPPORTED */
4965 #endif /* OMP_40_ENABLED */
4966     } else if (team->t.t_nproc > new_nproc) {
4967       KA_TRACE(20,
4968                ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
4969                 new_nproc));
4970 
4971       team->t.t_size_changed = 1;
4972 #if KMP_NESTED_HOT_TEAMS
4973       if (__kmp_hot_teams_mode == 0) {
4974         // AC: the saved thread count should match the team's value in this mode;
4975         // it can be bigger in mode 1, when the hot team has threads in reserve
4976         KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
4977         hot_teams[level].hot_team_nth = new_nproc;
4978 #endif // KMP_NESTED_HOT_TEAMS
4979         /* release the extra threads we don't need any more */
4980         for (f = new_nproc; f < team->t.t_nproc; f++) {
4981           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
4982           if (__kmp_tasking_mode != tskm_immediate_exec) {
4983             // When decreasing team size, threads no longer in the team should
4984             // unref task team.
4985             team->t.t_threads[f]->th.th_task_team = NULL;
4986           }
4987           __kmp_free_thread(team->t.t_threads[f]);
4988           team->t.t_threads[f] = NULL;
4989         }
4990 #if KMP_NESTED_HOT_TEAMS
4991       } // (__kmp_hot_teams_mode == 0)
4992       else {
4993         // When keeping extra threads in team, switch threads to wait on own
4994         // b_go flag
4995         for (f = new_nproc; f < team->t.t_nproc; ++f) {
4996           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
4997           kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
4998           for (int b = 0; b < bs_last_barrier; ++b) {
4999             if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5000               balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5001             }
5002             KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5003           }
5004         }
5005       }
5006 #endif // KMP_NESTED_HOT_TEAMS
5007       team->t.t_nproc = new_nproc;
5008       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5009       if (team->t.t_sched.r_sched_type != new_icvs->sched.r_sched_type ||
5010           team->t.t_sched.chunk != new_icvs->sched.chunk)
5011         team->t.t_sched = new_icvs->sched;
5012       __kmp_reinitialize_team(team, new_icvs,
5013                               root->r.r_uber_thread->th.th_ident);
5014 
5015       /* update the remaining threads */
5016       for (f = 0; f < new_nproc; ++f) {
5017         team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5018       }
5019       // restore the current task state of the master thread: should be the
5020       // implicit task
5021       KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5022                     team->t.t_threads[0], team));
5023 
5024       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5025 
5026 #ifdef KMP_DEBUG
5027       for (f = 0; f < team->t.t_nproc; f++) {
5028         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5029                          team->t.t_threads[f]->th.th_team_nproc ==
5030                              team->t.t_nproc);
5031       }
5032 #endif
5033 
5034 #if OMP_40_ENABLED
5035       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5036 #if KMP_AFFINITY_SUPPORTED
5037       __kmp_partition_places(team);
5038 #endif
5039 #endif
5040     } else { // team->t.t_nproc < new_nproc
5041 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5042       kmp_affin_mask_t *old_mask;
5043       if (KMP_AFFINITY_CAPABLE()) {
5044         KMP_CPU_ALLOC(old_mask);
5045       }
5046 #endif
5047 
5048       KA_TRACE(20,
5049                ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5050                 new_nproc));
5051 
5052       team->t.t_size_changed = 1;
5053 
5054 #if KMP_NESTED_HOT_TEAMS
5055       int avail_threads = hot_teams[level].hot_team_nth;
5056       if (new_nproc < avail_threads)
5057         avail_threads = new_nproc;
5058       kmp_info_t **other_threads = team->t.t_threads;
5059       for (f = team->t.t_nproc; f < avail_threads; ++f) {
5060         // Adjust barrier data of reserved threads (if any) of the team
5061         // Other data will be set in __kmp_initialize_info() below.
5062         int b;
5063         kmp_balign_t *balign = other_threads[f]->th.th_bar;
5064         for (b = 0; b < bs_last_barrier; ++b) {
5065           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5066           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5067 #if USE_DEBUGGER
5068           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5069 #endif
5070         }
5071       }
5072       if (hot_teams[level].hot_team_nth >= new_nproc) {
5073         // we have all needed threads in reserve, no need to allocate any
5074         // this is only possible in mode 1; we cannot have reserved threads in mode 0
5075         KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5076         team->t.t_nproc = new_nproc; // just get reserved threads involved
5077       } else {
5078         // we may have some threads in reserve, but not enough
5079         team->t.t_nproc =
5080             hot_teams[level]
5081                 .hot_team_nth; // get reserved threads involved if any
5082         hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5083 #endif // KMP_NESTED_HOT_TEAMS
5084         if (team->t.t_max_nproc < new_nproc) {
5085           /* reallocate larger arrays */
5086           __kmp_reallocate_team_arrays(team, new_nproc);
5087           __kmp_reinitialize_team(team, new_icvs, NULL);
5088         }
5089 
5090 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5091         /* Temporarily set the full mask for the master thread before creating
5092            workers. The reason is that workers inherit their affinity from the
5093            master, so if many workers are created quickly on a single core, they
5094            may not get a chance to set their own affinity for a long time. */
5095         __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5096 #endif
5097 
5098         /* allocate new threads for the hot team */
5099         for (f = team->t.t_nproc; f < new_nproc; f++) {
5100           kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5101           KMP_DEBUG_ASSERT(new_worker);
5102           team->t.t_threads[f] = new_worker;
5103 
5104           KA_TRACE(20,
5105                    ("__kmp_allocate_team: team %d init T#%d(%d:%d) arrived: "
5106                     "join=%llu, plain=%llu\n",
5107                     team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5108                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5109                     team->t.t_bar[bs_plain_barrier].b_arrived));
5110 
5111           { // Initialize barrier data for new threads.
5112             int b;
5113             kmp_balign_t *balign = new_worker->th.th_bar;
5114             for (b = 0; b < bs_last_barrier; ++b) {
5115               balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5116               KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5117                                KMP_BARRIER_PARENT_FLAG);
5118 #if USE_DEBUGGER
5119               balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5120 #endif
5121             }
5122           }
5123         }
5124 
5125 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5126         if (KMP_AFFINITY_CAPABLE()) {
5127           /* Restore initial master thread's affinity mask */
5128           __kmp_set_system_affinity(old_mask, TRUE);
5129           KMP_CPU_FREE(old_mask);
5130         }
5131 #endif
5132 #if KMP_NESTED_HOT_TEAMS
5133       } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5134 #endif // KMP_NESTED_HOT_TEAMS
5135       /* make sure everyone is synchronized */
5136       int old_nproc = team->t.t_nproc; // save old value and use to update only
5137       // new threads below
5138       __kmp_initialize_team(team, new_nproc, new_icvs,
5139                             root->r.r_uber_thread->th.th_ident);
5140 
5141       /* reinitialize the threads */
5142       KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5143       for (f = 0; f < team->t.t_nproc; ++f)
5144         __kmp_initialize_info(team->t.t_threads[f], team, f,
5145                               __kmp_gtid_from_tid(f, team));
5146       if (level) { // set th_task_state for new threads in nested hot team
5147         // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5148         // only need to set the th_task_state for the new threads. th_task_state
5149         // for master thread will not be accurate until after this in
5150         // __kmp_fork_call(), so we look to the master's memo_stack to get the
5151         // correct value.
5152         for (f = old_nproc; f < team->t.t_nproc; ++f)
5153           team->t.t_threads[f]->th.th_task_state =
5154               team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5155       } else { // set th_task_state for new threads in non-nested hot team
5156         int old_state =
5157             team->t.t_threads[0]->th.th_task_state; // copy master's state
5158         for (f = old_nproc; f < team->t.t_nproc; ++f)
5159           team->t.t_threads[f]->th.th_task_state = old_state;
5160       }
5161 
5162 #ifdef KMP_DEBUG
5163       for (f = 0; f < team->t.t_nproc; ++f) {
5164         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5165                          team->t.t_threads[f]->th.th_team_nproc ==
5166                              team->t.t_nproc);
5167       }
5168 #endif
5169 
5170 #if OMP_40_ENABLED
5171       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5172 #if KMP_AFFINITY_SUPPORTED
5173       __kmp_partition_places(team);
5174 #endif
5175 #endif
5176     } // Check changes in number of threads
5177 
5178 #if OMP_40_ENABLED
5179     kmp_info_t *master = team->t.t_threads[0];
5180     if (master->th.th_teams_microtask) {
5181       for (f = 1; f < new_nproc; ++f) {
5182         // propagate teams construct specific info to workers
5183         kmp_info_t *thr = team->t.t_threads[f];
5184         thr->th.th_teams_microtask = master->th.th_teams_microtask;
5185         thr->th.th_teams_level = master->th.th_teams_level;
5186         thr->th.th_teams_size = master->th.th_teams_size;
5187       }
5188     }
5189 #endif /* OMP_40_ENABLED */
5190 #if KMP_NESTED_HOT_TEAMS
5191     if (level) {
5192       // Sync barrier state for nested hot teams, not needed for outermost hot
5193       // team.
5194       for (f = 1; f < new_nproc; ++f) {
5195         kmp_info_t *thr = team->t.t_threads[f];
5196         int b;
5197         kmp_balign_t *balign = thr->th.th_bar;
5198         for (b = 0; b < bs_last_barrier; ++b) {
5199           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5200           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5201 #if USE_DEBUGGER
5202           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5203 #endif
5204         }
5205       }
5206     }
5207 #endif // KMP_NESTED_HOT_TEAMS
5208 
5209     /* reallocate space for arguments if necessary */
5210     __kmp_alloc_argv_entries(argc, team, TRUE);
5211     KMP_CHECK_UPDATE(team->t.t_argc, argc);
5212     // The hot team re-uses the previous task team,
5213     // if untouched during the previous release->gather phase.
5214 
5215     KF_TRACE(10, (" hot_team = %p\n", team));
5216 
5217 #if KMP_DEBUG
5218     if (__kmp_tasking_mode != tskm_immediate_exec) {
5219       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5220                     "task_team[1] = %p after reinit\n",
5221                     team->t.t_task_team[0], team->t.t_task_team[1]));
5222     }
5223 #endif
5224 
5225 #if OMPT_SUPPORT
5226     __ompt_team_assign_id(team, ompt_parallel_data);
5227 #endif
5228 
5229     KMP_MB();
5230 
5231     return team;
5232   }
5233 
5234   /* next, let's try to take one from the team pool */
5235   KMP_MB();
5236   for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5237     /* TODO: consider resizing undersized teams instead of reaping them, now
5238        that we have a resizing mechanism */
5239     if (team->t.t_max_nproc >= max_nproc) {
5240       /* take this team from the team pool */
5241       __kmp_team_pool = team->t.t_next_pool;
5242 
5243       /* setup the team for fresh use */
5244       __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5245 
5246       KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5247                     "task_team[1] %p to NULL\n",
5248                     &team->t.t_task_team[0], &team->t.t_task_team[1]));
5249       team->t.t_task_team[0] = NULL;
5250       team->t.t_task_team[1] = NULL;
5251 
5252       /* reallocate space for arguments if necessary */
5253       __kmp_alloc_argv_entries(argc, team, TRUE);
5254       KMP_CHECK_UPDATE(team->t.t_argc, argc);
5255 
5256       KA_TRACE(
5257           20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5258                team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5259       { // Initialize barrier data.
5260         int b;
5261         for (b = 0; b < bs_last_barrier; ++b) {
5262           team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5263 #if USE_DEBUGGER
5264           team->t.t_bar[b].b_master_arrived = 0;
5265           team->t.t_bar[b].b_team_arrived = 0;
5266 #endif
5267         }
5268       }
5269 
5270 #if OMP_40_ENABLED
5271       team->t.t_proc_bind = new_proc_bind;
5272 #endif
5273 
5274       KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5275                     team->t.t_id));
5276 
5277 #if OMPT_SUPPORT
5278       __ompt_team_assign_id(team, ompt_parallel_data);
5279 #endif
5280 
5281       KMP_MB();
5282 
5283       return team;
5284     }
5285 
5286     /* reap team if it is too small, then loop back and check the next one */
5287     // not sure if this is wise, but, will be redone during the hot-teams
5288     // Not sure if this is wise, but it will be redone during the hot-teams
5289     /* TODO: Use technique to find the right size hot-team, don't reap them */
5290     team = __kmp_reap_team(team);
5291     __kmp_team_pool = team;
5292   }
5293 
5294   /* nothing available in the pool, no matter, make a new team! */
5295   KMP_MB();
5296   team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5297 
5298   /* and set it up */
5299   team->t.t_max_nproc = max_nproc;
5300   /* NOTE: for some reason, allocating one big buffer and dividing it up
5301      seems to hurt performance significantly on the P4, so we don't do that. */
5302   __kmp_allocate_team_arrays(team, max_nproc);
5303 
5304   KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5305   __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5306 
5307   KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5308                 "%p to NULL\n",
5309                 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5310   team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5311   // memory, no need to duplicate
5312   team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5313   // memory, no need to duplicate
5314 
5315   if (__kmp_storage_map) {
5316     __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5317   }
5318 
5319   /* allocate space for arguments */
5320   __kmp_alloc_argv_entries(argc, team, FALSE);
5321   team->t.t_argc = argc;
5322 
5323   KA_TRACE(20,
5324            ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5325             team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5326   { // Initialize barrier data.
5327     int b;
5328     for (b = 0; b < bs_last_barrier; ++b) {
5329       team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5330 #if USE_DEBUGGER
5331       team->t.t_bar[b].b_master_arrived = 0;
5332       team->t.t_bar[b].b_team_arrived = 0;
5333 #endif
5334     }
5335   }
5336 
5337 #if OMP_40_ENABLED
5338   team->t.t_proc_bind = new_proc_bind;
5339 #endif
5340 
5341 #if OMPT_SUPPORT
5342   __ompt_team_assign_id(team, ompt_parallel_data);
5343   team->t.ompt_serialized_team_info = NULL;
5344 #endif
5345 
5346   KMP_MB();
5347 
5348   KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5349                 team->t.t_id));
5350 
5351   return team;
5352 }
5353 
5354 /* TODO implement hot-teams at all levels */
5355 /* TODO implement lazy thread release on demand (disband request) */
5356 
5357 /* free the team.  return it to the team pool.  release all the threads
5358  * associated with it */
5359 void __kmp_free_team(kmp_root_t *root,
5360                      kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5361   int f;
5362   KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5363                 team->t.t_id));
5364 
5365   /* verify state */
5366   KMP_DEBUG_ASSERT(root);
5367   KMP_DEBUG_ASSERT(team);
5368   KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5369   KMP_DEBUG_ASSERT(team->t.t_threads);
5370 
5371   int use_hot_team = team == root->r.r_hot_team;
5372 #if KMP_NESTED_HOT_TEAMS
5373   int level;
5374   kmp_hot_team_ptr_t *hot_teams;
5375   if (master) {
5376     level = team->t.t_active_level - 1;
5377     if (master->th.th_teams_microtask) { // in teams construct?
5378       if (master->th.th_teams_size.nteams > 1) {
5379         ++level; // level was not increased in teams construct for
5380         // team_of_masters
5381       }
5382       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5383           master->th.th_teams_level == team->t.t_level) {
5384         ++level; // level was not increased in teams construct for
5385         // team_of_workers before the parallel
5386       } // team->t.t_level will be increased inside parallel
5387     }
5388     hot_teams = master->th.th_hot_teams;
5389     if (level < __kmp_hot_teams_max_level) {
5390       KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5391       use_hot_team = 1;
5392     }
5393   }
5394 #endif // KMP_NESTED_HOT_TEAMS
5395 
5396   /* team is done working */
5397   TCW_SYNC_PTR(team->t.t_pkfn,
5398                NULL); // Important for Debugging Support Library.
5399   team->t.t_copyin_counter = 0; // init counter for possible reuse
5400   // Do not reset pointer to parent team to NULL for hot teams.
5401 
5402   /* if we are non-hot team, release our threads */
5403   if (!use_hot_team) {
5404     if (__kmp_tasking_mode != tskm_immediate_exec) {
5405       // Wait for threads to reach reapable state
5406       for (f = 1; f < team->t.t_nproc; ++f) {
5407         KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5408         kmp_info_t *th = team->t.t_threads[f];
5409         volatile kmp_uint32 *state = &th->th.th_reap_state;
5410         while (*state != KMP_SAFE_TO_REAP) {
5411 #if KMP_OS_WINDOWS
5412           // On Windows a thread can be killed at any time, check this
5413           DWORD ecode;
5414           if (!__kmp_is_thread_alive(th, &ecode)) {
5415             *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5416             break;
5417           }
5418 #endif
5419           // first check if thread is sleeping
5420           kmp_flag_64 fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5421           if (fl.is_sleeping())
5422             fl.resume(__kmp_gtid_from_thread(th));
5423           KMP_CPU_PAUSE();
5424         }
5425       }
5426 
5427       // Delete task teams
5428       int tt_idx;
5429       for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5430         kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5431         if (task_team != NULL) {
5432           for (f = 0; f < team->t.t_nproc;
5433                ++f) { // Have all threads unref task teams
5434             team->t.t_threads[f]->th.th_task_team = NULL;
5435           }
5436           KA_TRACE(
5437               20,
5438               ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5439                __kmp_get_gtid(), task_team, team->t.t_id));
5440 #if KMP_NESTED_HOT_TEAMS
5441           __kmp_free_task_team(master, task_team);
5442 #endif
5443           team->t.t_task_team[tt_idx] = NULL;
5444         }
5445       }
5446     }
5447 
5448     // Reset pointer to parent team only for non-hot teams.
5449     team->t.t_parent = NULL;
5450     team->t.t_level = 0;
5451     team->t.t_active_level = 0;
5452 
5453     /* free the worker threads */
5454     for (f = 1; f < team->t.t_nproc; ++f) {
5455       KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5456       __kmp_free_thread(team->t.t_threads[f]);
5457       team->t.t_threads[f] = NULL;
5458     }
5459 
5460     /* put the team back in the team pool */
5461     /* TODO limit size of team pool, call reap_team if pool too large */
5462     team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5463     __kmp_team_pool = (volatile kmp_team_t *)team;
5464   }
5465 
5466   KMP_MB();
5467 }
5468 
5469 /* reap the team.  destroy it, reclaim all its resources and free its memory */
5470 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5471   kmp_team_t *next_pool = team->t.t_next_pool;
5472 
5473   KMP_DEBUG_ASSERT(team);
5474   KMP_DEBUG_ASSERT(team->t.t_dispatch);
5475   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5476   KMP_DEBUG_ASSERT(team->t.t_threads);
5477   KMP_DEBUG_ASSERT(team->t.t_argv);
5478 
5479   /* TODO clean the threads that are a part of this? */
5480 
5481   /* free stuff */
5482   __kmp_free_team_arrays(team);
5483   if (team->t.t_argv != &team->t.t_inline_argv[0])
5484     __kmp_free((void *)team->t.t_argv);
5485   __kmp_free(team);
5486 
5487   KMP_MB();
5488   return next_pool;
5489 }
5490 
5491 // Free the thread.  Don't reap it, just place it on the pool of available
5492 // threads.
5493 //
5494 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5495 // binding for the affinity mechanism to be useful.
5496 //
5497 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5498 // However, we want to avoid the potential performance problem of always
5499 // scanning through the list to find the correct point at which to insert
5500 // the thread (potential N**2 behavior).  To do this we keep track of the
5501 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5502 // With single-level parallelism, threads will always be added to the tail
5503 // of the list, kept track of by __kmp_thread_pool_insert_pt.  With nested
5504 // parallelism, all bets are off and we may need to scan through the entire
5505 // free list.
5506 //
5507 // This change also has a potentially large performance benefit, for some
5508 // applications.  Previously, as threads were freed from the hot team, they
5509 // would be placed back on the free list in inverse order.  If the hot team
5510 // grew back to its original size, then the freed threads would be placed
5511 // back on the hot team in reverse order.  This could cause bad cache
5512 // locality problems on programs where the size of the hot team regularly
5513 // grew and shrunk.
5514 //
5515 // Now, for single-level parallelism, the OMP tid is always == gtid.
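//
// Illustrative trace (gtids made up): with pool {2, 3, 5} and
// __kmp_thread_pool_insert_pt pointing at gtid 5, freeing gtid 4 resets the
// insert point (5 > 4) and rescans from the head, linking 4 between 3 and 5;
// freeing gtid 6 afterwards starts scanning at the new insert point (gtid 4)
// and appends 6 at the tail without walking the whole list.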
5516 void __kmp_free_thread(kmp_info_t *this_th) {
5517   int gtid;
5518   kmp_info_t **scan;
5519   kmp_root_t *root = this_th->th.th_root;
5520 
5521   KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5522                 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5523 
5524   KMP_DEBUG_ASSERT(this_th);
5525 
5526   // When moving a thread to the pool, switch it to wait on its own b_go flag
5527   // and clear the barrier team pointers (NULL team).
5528   int b;
5529   kmp_balign_t *balign = this_th->th.th_bar;
5530   for (b = 0; b < bs_last_barrier; ++b) {
5531     if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5532       balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5533     balign[b].bb.team = NULL;
5534     balign[b].bb.leaf_kids = 0;
5535   }
5536   this_th->th.th_task_state = 0;
5537 
5538   /* put thread back on the free pool */
5539   TCW_PTR(this_th->th.th_team, NULL);
5540   TCW_PTR(this_th->th.th_root, NULL);
5541   TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5542 
5543   // If the __kmp_thread_pool_insert_pt is already past the new insert
5544   // point, then we need to re-scan the entire list.
5545   gtid = this_th->th.th_info.ds.ds_gtid;
5546   if (__kmp_thread_pool_insert_pt != NULL) {
5547     KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5548     if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5549       __kmp_thread_pool_insert_pt = NULL;
5550     }
5551   }
5552 
5553   // Scan down the list to find the place to insert the thread.
5554   // scan is the address of a link in the list, possibly the address of
5555   // __kmp_thread_pool itself.
5556   //
5557   // In the absence of nested parallelism, the for loop will have 0 iterations.
5558   if (__kmp_thread_pool_insert_pt != NULL) {
5559     scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5560   } else {
5561     scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5562   }
5563   for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5564        scan = &((*scan)->th.th_next_pool))
5565     ;
5566 
5567   // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5568   // to its address.
5569   TCW_PTR(this_th->th.th_next_pool, *scan);
5570   __kmp_thread_pool_insert_pt = *scan = this_th;
5571   KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5572                    (this_th->th.th_info.ds.ds_gtid <
5573                     this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5574   TCW_4(this_th->th.th_in_pool, TRUE);
5575   __kmp_thread_pool_nth++;
5576 
5577   TCW_4(__kmp_nth, __kmp_nth - 1);
5578   root->r.r_cg_nthreads--;
5579 
5580 #ifdef KMP_ADJUST_BLOCKTIME
5581   /* Adjust blocktime back to user setting or default if necessary */
5582   /* Middle initialization might never have occurred                */
5583   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5584     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5585     if (__kmp_nth <= __kmp_avail_proc) {
5586       __kmp_zero_bt = FALSE;
5587     }
5588   }
5589 #endif /* KMP_ADJUST_BLOCKTIME */
5590 
5591   KMP_MB();
5592 }
5593 
5594 /* ------------------------------------------------------------------------ */
5595 
5596 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5597   int gtid = this_thr->th.th_info.ds.ds_gtid;
5598   /*    void                 *stack_data;*/
5599   kmp_team_t *(*volatile pteam);
5600 
5601   KMP_MB();
5602   KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5603 
5604   if (__kmp_env_consistency_check) {
5605     this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5606   }
5607 
5608 #if OMPT_SUPPORT
5609   ompt_data_t *thread_data;
5610   if (ompt_enabled.enabled) {
5611     thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5612     thread_data->ptr = NULL;
5613 
5614     this_thr->th.ompt_thread_info.state = omp_state_overhead;
5615     this_thr->th.ompt_thread_info.wait_id = 0;
5616     this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5617     if (ompt_enabled.ompt_callback_thread_begin) {
5618       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5619           ompt_thread_worker, thread_data);
5620     }
5621   }
5622 #endif
5623 
5624 #if OMPT_SUPPORT
5625   if (ompt_enabled.enabled) {
5626     this_thr->th.ompt_thread_info.state = omp_state_idle;
5627   }
5628 #endif
5629   /* This is the place where threads wait for work */
5630   while (!TCR_4(__kmp_global.g.g_done)) {
5631     KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5632     KMP_MB();
5633 
5634     /* wait for work to do */
5635     KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5636 
5637     /* No tid yet since not part of a team */
5638     __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5639 
5640 #if OMPT_SUPPORT
5641     if (ompt_enabled.enabled) {
5642       this_thr->th.ompt_thread_info.state = omp_state_overhead;
5643     }
5644 #endif
5645 
5646     pteam = (kmp_team_t * (*))(&this_thr->th.th_team);
5647 
5648     /* have we been allocated? */
5649     if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5650       /* we were just woken up, so run our new task */
5651       if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5652         int rc;
5653         KA_TRACE(20,
5654                  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5655                   gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5656                   (*pteam)->t.t_pkfn));
5657 
5658         updateHWFPControl(*pteam);
5659 
5660 #if OMPT_SUPPORT
5661         if (ompt_enabled.enabled) {
5662           this_thr->th.ompt_thread_info.state = omp_state_work_parallel;
5663         }
5664 #endif
5665 
5666         {
5667           KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
5668           KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
5669           rc = (*pteam)->t.t_invoke(gtid);
5670         }
5671         KMP_ASSERT(rc);
5672 
5673         KMP_MB();
5674         KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5675                       gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5676                       (*pteam)->t.t_pkfn));
5677       }
5678 #if OMPT_SUPPORT
5679       if (ompt_enabled.enabled) {
5680         /* no frame set while outside task */
5681         __ompt_get_task_info_object(0)->frame.exit_runtime_frame = NULL;
5682 
5683         this_thr->th.ompt_thread_info.state = omp_state_overhead;
5684         this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
5685       }
5686 #endif
5687       /* join barrier after parallel region */
5688       __kmp_join_barrier(gtid);
5689     }
5690   }
5691   TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5692 
5693 #if OMPT_SUPPORT
5694   if (ompt_enabled.ompt_callback_thread_end) {
5695     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5696   }
5697 #endif
5698 
5699   this_thr->th.th_task_team = NULL;
5700   /* run the destructors for the threadprivate data for this thread */
5701   __kmp_common_destroy_gtid(gtid);
5702 
5703   KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5704   KMP_MB();
5705   return this_thr;
5706 }
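
// A stripped-down, non-compiled sketch of the worker life cycle implemented by
// __kmp_launch_thread() above. The names used here (library_shutting_down,
// wait_at_fork_barrier, run_microtask, wait_at_join_barrier,
// run_threadprivate_destructors) are placeholders for the real machinery, not
// actual runtime entry points.
#if 0
void worker_loop(worker_t *w) {
  while (!library_shutting_down()) {
    wait_at_fork_barrier(w); // sleep until the master releases the team
    if (w->team != NULL && !library_shutting_down()) {
      if (w->team->microtask != NULL)
        run_microtask(w); // execute the outlined parallel-region body
      wait_at_join_barrier(w); // rendezvous with the rest of the team
    }
  }
  run_threadprivate_destructors(w); // cf. __kmp_common_destroy_gtid()
}
#endif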
5707 
5708 /* ------------------------------------------------------------------------ */
5709 
5710 void __kmp_internal_end_dest(void *specific_gtid) {
5711 #if KMP_COMPILER_ICC
5712 #pragma warning(push)
5713 #pragma warning(disable : 810) // conversion from "void *" to "int" may lose
5714 // significant bits
5715 #endif
5716   // Make sure no significant bits are lost
5717   int gtid = (kmp_intptr_t)specific_gtid - 1;
5718 #if KMP_COMPILER_ICC
5719 #pragma warning(pop)
5720 #endif
5721 
5722   KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5723   /* NOTE: the gtid is stored as gtid+1 in the thread-local storage;
5724    * this is because 0 is reserved for the nothing-stored case */
5725 
5726   /* josh: One reason for setting the gtid specific data even when it is being
5727      destroyed by pthread is to allow gtid lookup through thread specific data
5728      (__kmp_gtid_get_specific).  Some of the code, especially stat code,
5729      that gets executed in the call to __kmp_internal_end_thread, actually
5730      gets the gtid through the thread specific data.  Setting it here seems
5731      rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread
5732      to run smoothly.
5733      todo: get rid of this after we remove the dependence on
5734      __kmp_gtid_get_specific  */
5735   if (gtid >= 0 && KMP_UBER_GTID(gtid))
5736     __kmp_gtid_set_specific(gtid);
5737 #ifdef KMP_TDATA_GTID
5738   __kmp_gtid = gtid;
5739 #endif
5740   __kmp_internal_end_thread(gtid);
5741 }
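
// Non-compiled sketch of the gtid+1 encoding mentioned above, using POSIX
// thread-specific data for illustration (the key and helpers below are
// hypothetical; the runtime's real accessors are __kmp_gtid_set_specific /
// __kmp_gtid_get_specific):
#if 0
#include <pthread.h>
#include <stdint.h>

static void store_gtid(pthread_key_t key, int gtid) {
  // Bias by +1 so that gtid 0 is distinguishable from "nothing stored",
  // which pthread_getspecific() reports as NULL.
  pthread_setspecific(key, (void *)(intptr_t)(gtid + 1));
}

static int load_gtid(pthread_key_t key) {
  intptr_t v = (intptr_t)pthread_getspecific(key);
  return (v == 0) ? -1 /* nothing stored */ : (int)(v - 1);
}
#endif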
5742 
5743 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5744 
5745 // 2009-09-08 (lev): It looks like the destructor does not work. In simple test
5746 // cases destructors work perfectly, but in real libomp.so I have no evidence it
5747 // is ever called. However, the -fini linker option in makefile.mk works fine.
5748 
5749 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5750   __kmp_internal_end_atexit();
5751 }
5752 
5753 void __kmp_internal_end_fini(void) { __kmp_internal_end_atexit(); }
5754 
5755 #endif
5756 
5757 /* [Windows] josh: when the atexit handler is called, there may still be more
5758    than one thread alive */
5759 void __kmp_internal_end_atexit(void) {
5760   KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5761   /* [Windows]
5762      josh: ideally, we want to completely shutdown the library in this atexit
5763      handler, but stat code that depends on thread specific data for gtid fails
5764      because that data becomes unavailable at some point during the shutdown, so
5765      we call __kmp_internal_end_thread instead. We should eventually remove the
5766      dependency on __kmp_get_specific_gtid in the stat code and use
5767      __kmp_internal_end_library to cleanly shutdown the library.
5768 
5769      // TODO: Can some of this comment about GVS be removed?
5770      I suspect that the offending stat code is executed when the calling thread
5771      tries to clean up a dead root thread's data structures, resulting in GVS
5772      code trying to close the GVS structures for that thread, but since the stat
5773      code uses __kmp_get_specific_gtid to get the gtid with the assumption that
5774      the calling thread is cleaning up itself instead of another thread, it gets
5775      confused. This happens because allowing a thread to unregister and clean up
5776      another thread is a recent modification for addressing an issue.
5777      Based on the current design (20050722), a thread may end up
5778      trying to unregister another thread only if thread death does not trigger
5779      the calling of __kmp_internal_end_thread.  For Linux* OS, there is the
5780      thread specific data destructor function to detect thread death. For
5781      Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
5782      is nothing.  Thus, the workaround is applicable only for Windows static
5783      stat library. */
5784   __kmp_internal_end_library(-1);
5785 #if KMP_OS_WINDOWS
5786   __kmp_close_console();
5787 #endif
5788 }
5789 
5790 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5791   // It is assumed __kmp_forkjoin_lock is acquired.
5792 
5793   int gtid;
5794 
5795   KMP_DEBUG_ASSERT(thread != NULL);
5796 
5797   gtid = thread->th.th_info.ds.ds_gtid;
5798 
5799   if (!is_root) {
5800 
5801     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5802       /* Assume the threads are at the fork barrier here */
5803       KA_TRACE(
5804           20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5805                gtid));
5806       /* Need release fence here to prevent seg faults for tree forkjoin barrier
5807        * (GEH) */
5808       ANNOTATE_HAPPENS_BEFORE(thread);
5809       kmp_flag_64 flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
5810       __kmp_release_64(&flag);
5811     }
5812 
5813     // Terminate OS thread.
5814     __kmp_reap_worker(thread);
5815 
5816     // The thread was killed asynchronously.  If it was actively
5817     // spinning in the thread pool, decrement the global count.
5818     //
5819     // There is a small timing hole here - if the worker thread was just waking
5820     // up after sleeping in the pool, had reset its th_active_in_pool flag but
5821     // not decremented the global counter __kmp_thread_pool_active_nth yet, then
5822     // the global counter might not get updated.
5823     //
5824     // Currently, this can only happen as the library is unloaded,
5825     // so there are no harmful side effects.
5826     if (thread->th.th_active_in_pool) {
5827       thread->th.th_active_in_pool = FALSE;
5828       KMP_TEST_THEN_DEC32(&__kmp_thread_pool_active_nth);
5829       KMP_DEBUG_ASSERT(TCR_4(__kmp_thread_pool_active_nth) >= 0);
5830     }
5831 
5832     // Decrement # of [worker] threads in the pool.
5833     KMP_DEBUG_ASSERT(__kmp_thread_pool_nth > 0);
5834     --__kmp_thread_pool_nth;
5835   }
5836 
5837   __kmp_free_implicit_task(thread);
5838 
5839 // Free the fast memory for tasking
5840 #if USE_FAST_MEMORY
5841   __kmp_free_fast_memory(thread);
5842 #endif /* USE_FAST_MEMORY */
5843 
5844   __kmp_suspend_uninitialize_thread(thread);
5845 
5846   KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
5847   TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5848 
5849   --__kmp_all_nth;
5850 // __kmp_nth was decremented when the thread was added to the pool.
5851 
5852 #ifdef KMP_ADJUST_BLOCKTIME
5853   /* Adjust blocktime back to user setting or default if necessary */
5854   /* Middle initialization might never have occurred                */
5855   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5856     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5857     if (__kmp_nth <= __kmp_avail_proc) {
5858       __kmp_zero_bt = FALSE;
5859     }
5860   }
5861 #endif /* KMP_ADJUST_BLOCKTIME */
5862 
5863   /* free the memory being used */
5864   if (__kmp_env_consistency_check) {
5865     if (thread->th.th_cons) {
5866       __kmp_free_cons_stack(thread->th.th_cons);
5867       thread->th.th_cons = NULL;
5868     }
5869   }
5870 
5871   if (thread->th.th_pri_common != NULL) {
5872     __kmp_free(thread->th.th_pri_common);
5873     thread->th.th_pri_common = NULL;
5874   }
5875 
5876   if (thread->th.th_task_state_memo_stack != NULL) {
5877     __kmp_free(thread->th.th_task_state_memo_stack);
5878     thread->th.th_task_state_memo_stack = NULL;
5879   }
5880 
5881 #if KMP_USE_BGET
5882   if (thread->th.th_local.bget_data != NULL) {
5883     __kmp_finalize_bget(thread);
5884   }
5885 #endif
5886 
5887 #if KMP_AFFINITY_SUPPORTED
5888   if (thread->th.th_affin_mask != NULL) {
5889     KMP_CPU_FREE(thread->th.th_affin_mask);
5890     thread->th.th_affin_mask = NULL;
5891   }
5892 #endif /* KMP_AFFINITY_SUPPORTED */
5893 
5894   __kmp_reap_team(thread->th.th_serial_team);
5895   thread->th.th_serial_team = NULL;
5896   __kmp_free(thread);
5897 
5898   KMP_MB();
5899 
5900 } // __kmp_reap_thread
5901 
5902 static void __kmp_internal_end(void) {
5903   int i;
5904 
5905   /* First, unregister the library */
5906   __kmp_unregister_library();
5907 
5908 #if KMP_OS_WINDOWS
5909   /* In Win static library, we can't tell when a root actually dies, so we
5910      reclaim the data structures for any root threads that have died but not
5911      unregistered themselves, in order to shut down cleanly.
5912      In Win dynamic library we also can't tell when a thread dies.  */
5913   __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
5914 // dead roots
5915 #endif
5916 
5917   for (i = 0; i < __kmp_threads_capacity; i++)
5918     if (__kmp_root[i])
5919       if (__kmp_root[i]->r.r_active)
5920         break;
5921   KMP_MB(); /* Flush all pending memory write invalidates.  */
5922   TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
5923 
5924   if (i < __kmp_threads_capacity) {
5925 #if KMP_USE_MONITOR
5926     // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
5927     KMP_MB(); /* Flush all pending memory write invalidates.  */
5928 
5929     // Need to check that monitor was initialized before reaping it. If we are
5930     // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
5931     // __kmp_monitor will appear to contain valid data, but it is only valid in
5932     // the parent process, not the child.
5933     // New behavior (201008): instead of keying off of the flag
5934     // __kmp_init_parallel, the monitor thread creation is keyed off
5935     // of the new flag __kmp_init_monitor.
5936     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
5937     if (TCR_4(__kmp_init_monitor)) {
5938       __kmp_reap_monitor(&__kmp_monitor);
5939       TCW_4(__kmp_init_monitor, 0);
5940     }
5941     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
5942     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
5943 #endif // KMP_USE_MONITOR
5944   } else {
5945 /* TODO move this to cleanup code */
5946 #ifdef KMP_DEBUG
5947     /* make sure that everything has properly ended */
5948     for (i = 0; i < __kmp_threads_capacity; i++) {
5949       if (__kmp_root[i]) {
5950         //                    KMP_ASSERT( ! KMP_UBER_GTID( i ) );         // AC:
5951         //                    there can be uber threads alive here
5952         KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
5953       }
5954     }
5955 #endif
5956 
5957     KMP_MB();
5958 
5959     // Reap the worker threads.
5960     // This is valid for now, but be careful if threads are reaped sooner.
5961     while (__kmp_thread_pool != NULL) { // Loop through all threads in the pool.
5962       // Get the next thread from the pool.
5963       kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
5964       __kmp_thread_pool = thread->th.th_next_pool;
5965       // Reap it.
5966       KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
5967       thread->th.th_next_pool = NULL;
5968       thread->th.th_in_pool = FALSE;
5969       __kmp_reap_thread(thread, 0);
5970     }
5971     __kmp_thread_pool_insert_pt = NULL;
5972 
5973     // Reap teams.
5974     while (__kmp_team_pool != NULL) { // Loop through all the teams in the pool.
5975       // Get the next team from the pool.
5976       kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
5977       __kmp_team_pool = team->t.t_next_pool;
5978       // Reap it.
5979       team->t.t_next_pool = NULL;
5980       __kmp_reap_team(team);
5981     }
5982 
5983     __kmp_reap_task_teams();
5984 
5985     for (i = 0; i < __kmp_threads_capacity; ++i) {
5986       // TBD: Add some checking...
5987       // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
5988     }
5989 
5990     /* Make sure all threadprivate destructors get run by joining with all
5991        worker threads before resetting this flag */
5992     TCW_SYNC_4(__kmp_init_common, FALSE);
5993 
5994     KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
5995     KMP_MB();
5996 
5997 #if KMP_USE_MONITOR
5998     // See note above: One of the possible fixes for CQ138434 / CQ140126
5999     //
6000     // FIXME: push both code fragments down and CSE them?
6001     // push them into __kmp_cleanup() ?
6002     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6003     if (TCR_4(__kmp_init_monitor)) {
6004       __kmp_reap_monitor(&__kmp_monitor);
6005       TCW_4(__kmp_init_monitor, 0);
6006     }
6007     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6008     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6009 #endif
6010   } /* else !__kmp_global.t_active */
6011   TCW_4(__kmp_init_gtid, FALSE);
6012   KMP_MB(); /* Flush all pending memory write invalidates.  */
6013 
6014   __kmp_cleanup();
6015 #if OMPT_SUPPORT
6016   ompt_fini();
6017 #endif
6018 }
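
// Both reap loops above drain an intrusive singly linked pool by repeatedly
// detaching the head; a self-contained, non-compiled sketch (node_t and
// reap() are placeholders, not runtime types):
#if 0
struct node_t {
  node_t *next;
};

static void drain_pool(node_t **head) {
  while (*head != NULL) {
    node_t *n = *head; // detach the current head ...
    *head = n->next;
    n->next = NULL;
    reap(n); // ... then release its resources
  }
}
#endif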
6019 
6020 void __kmp_internal_end_library(int gtid_req) {
6021   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6022   /* this shouldn't be a race condition because __kmp_internal_end() is the
6023      only place to clear __kmp_serial_init */
6024   /* we'll check this later too, after we get the lock */
6025   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6026   // redundant, because the next check will work in any case.
6027   if (__kmp_global.g.g_abort) {
6028     KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6029     /* TODO abort? */
6030     return;
6031   }
6032   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6033     KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6034     return;
6035   }
6036 
6037   KMP_MB(); /* Flush all pending memory write invalidates.  */
6038 
6039   /* find out who we are and what we should do */
6040   {
6041     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6042     KA_TRACE(
6043         10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req));
6044     if (gtid == KMP_GTID_SHUTDOWN) {
6045       KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6046                     "already shutdown\n"));
6047       return;
6048     } else if (gtid == KMP_GTID_MONITOR) {
6049       KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6050                     "registered, or system shutdown\n"));
6051       return;
6052     } else if (gtid == KMP_GTID_DNE) {
6053       KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6054                     "shutdown\n"));
6055       /* we don't know who we are, but we may still shutdown the library */
6056     } else if (KMP_UBER_GTID(gtid)) {
6057       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6058       if (__kmp_root[gtid]->r.r_active) {
6059         __kmp_global.g.g_abort = -1;
6060         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6061         KA_TRACE(10,
6062                  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6063                   gtid));
6064         return;
6065       } else {
6066         KA_TRACE(
6067             10,
6068             ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6069         __kmp_unregister_root_current_thread(gtid);
6070       }
6071     } else {
6072 /* worker threads may call this function through the atexit handler, if they
6073  * call exit() */
6074 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6075    TODO: do a thorough shutdown instead */
6076 #ifdef DUMP_DEBUG_ON_EXIT
6077       if (__kmp_debug_buf)
6078         __kmp_dump_debug_buffer();
6079 #endif
6080       return;
6081     }
6082   }
6083   /* synchronize the termination process */
6084   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6085 
6086   /* have we already finished */
6087   if (__kmp_global.g.g_abort) {
6088     KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6089     /* TODO abort? */
6090     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6091     return;
6092   }
6093   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6094     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6095     return;
6096   }
6097 
6098   /* We need this lock to enforce mutual exclusion between this reading of
6099      __kmp_threads_capacity and the writing by __kmp_register_root.
6100      Alternatively, we can use a counter of roots that is atomically updated by
6101      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6102      __kmp_internal_end_*.  */
6103   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6104 
6105   /* now we can safely conduct the actual termination */
6106   __kmp_internal_end();
6107 
6108   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6109   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6110 
6111   KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6112 
6113 #ifdef DUMP_DEBUG_ON_EXIT
6114   if (__kmp_debug_buf)
6115     __kmp_dump_debug_buffer();
6116 #endif
6117 
6118 #if KMP_OS_WINDOWS
6119   __kmp_close_console();
6120 #endif
6121 
6122   __kmp_fini_allocator();
6123 
6124 } // __kmp_internal_end_library
6125 
6126 void __kmp_internal_end_thread(int gtid_req) {
6127   int i;
6128 
6129   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6130   /* this shouldn't be a race condition because __kmp_internal_end() is the
6131    * only place to clear __kmp_serial_init */
6132   /* we'll check this later too, after we get the lock */
6133   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6134   // redundant, because the next check will work in any case.
6135   if (__kmp_global.g.g_abort) {
6136     KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6137     /* TODO abort? */
6138     return;
6139   }
6140   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6141     KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6142     return;
6143   }
6144 
6145   KMP_MB(); /* Flush all pending memory write invalidates.  */
6146 
6147   /* find out who we are and what we should do */
6148   {
6149     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6150     KA_TRACE(10,
6151              ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req));
6152     if (gtid == KMP_GTID_SHUTDOWN) {
6153       KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6154                     "already shutdown\n"));
6155       return;
6156     } else if (gtid == KMP_GTID_MONITOR) {
6157       KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6158                     "registered, or system shutdown\n"));
6159       return;
6160     } else if (gtid == KMP_GTID_DNE) {
6161       KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6162                     "shutdown\n"));
6163       return;
6164       /* we don't know who we are */
6165     } else if (KMP_UBER_GTID(gtid)) {
6166       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6167       if (__kmp_root[gtid]->r.r_active) {
6168         __kmp_global.g.g_abort = -1;
6169         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6170         KA_TRACE(10,
6171                  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6172                   gtid));
6173         return;
6174       } else {
6175         KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6176                       gtid));
6177         __kmp_unregister_root_current_thread(gtid);
6178       }
6179     } else {
6180       /* just a worker thread, let's leave */
6181       KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6182 
6183       if (gtid >= 0) {
6184         __kmp_threads[gtid]->th.th_task_team = NULL;
6185       }
6186 
6187       KA_TRACE(10,
6188                ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6189                 gtid));
6190       return;
6191     }
6192   }
6193 #if defined KMP_DYNAMIC_LIB
6194   // AC: let's not shut down the Linux* OS dynamic library at the exit of an
6195   // uber thread; it is better to shut down later, in the library destructor.
6196   // The reason for this change is a performance problem seen when a non-OpenMP
6197   // thread in a loop forks and joins many OpenMP threads. We can save a lot of
6198   // time by keeping the worker threads alive until the program shuts down.
6199   // OM: Removed the Linux* OS restriction to fix the crash on OS X*
6200   // (DPD200239966) and Windows (DPD200287443) that occurs when using critical
6201   // sections from foreign threads.
6202   KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6203   return;
6204 #endif
6205   /* synchronize the termination process */
6206   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6207 
6208   /* have we already finished */
6209   if (__kmp_global.g.g_abort) {
6210     KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6211     /* TODO abort? */
6212     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6213     return;
6214   }
6215   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6216     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6217     return;
6218   }
6219 
6220   /* We need this lock to enforce mutual exclusion between this reading of
6221      __kmp_threads_capacity and the writing by __kmp_register_root.
6222      Alternatively, we can use a counter of roots that is atomically updated by
6223      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6224      __kmp_internal_end_*.  */
6225 
6226   /* should we finish the run-time?  are all siblings done? */
6227   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6228 
6229   for (i = 0; i < __kmp_threads_capacity; ++i) {
6230     if (KMP_UBER_GTID(i)) {
6231       KA_TRACE(
6232           10,
6233           ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6234       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6235       __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6236       return;
6237     }
6238   }
6239 
6240   /* now we can safely conduct the actual termination */
6241 
6242   __kmp_internal_end();
6243 
6244   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6245   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6246 
6247   KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6248 
6249 #ifdef DUMP_DEBUG_ON_EXIT
6250   if (__kmp_debug_buf)
6251     __kmp_dump_debug_buffer();
6252 #endif
6253 } // __kmp_internal_end_thread
6254 
6255 // -----------------------------------------------------------------------------
6256 // Library registration stuff.
6257 
6258 static long __kmp_registration_flag = 0;
6259 // Random value used to indicate library initialization.
6260 static char *__kmp_registration_str = NULL;
6261 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6262 
6263 static inline char *__kmp_reg_status_name() {
6264   /* On RHEL 3u5 if linked statically, getpid() returns different values in
6265      each thread. If registration and unregistration happen in different threads
6266      (omp_misc_other_root_exit.cpp test case), the registered_lib_env env var
6267      cannot be found, because its name will contain a different pid. */
6268   return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6269 } // __kmp_reg_status_name
6270 
6271 void __kmp_register_library_startup(void) {
6272 
6273   char *name = __kmp_reg_status_name(); // Name of the environment variable.
6274   int done = 0;
6275   union {
6276     double dtime;
6277     long ltime;
6278   } time;
6279 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6280   __kmp_initialize_system_tick();
6281 #endif
6282   __kmp_read_system_time(&time.dtime);
6283   __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6284   __kmp_registration_str =
6285       __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6286                        __kmp_registration_flag, KMP_LIBRARY_FILE);
6287 
6288   KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6289                 __kmp_registration_str));
6290 
6291   while (!done) {
6292 
6293     char *value = NULL; // Actual value of the environment variable.
6294 
6295     // Set the environment variable, but do not overwrite an existing value.
6296     __kmp_env_set(name, __kmp_registration_str, 0);
6297     // Check that the variable was actually written.
6298     value = __kmp_env_get(name);
6299     if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6300 
6301       done = 1; // Ok, environment variable set successfully, exit the loop.
6302 
6303     } else {
6304 
6305       // Oops. The write failed. Another copy of the OpenMP RTL is in memory.
6306       // Check whether it is alive or dead.
6307       int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6308       char *tail = value;
6309       char *flag_addr_str = NULL;
6310       char *flag_val_str = NULL;
6311       char const *file_name = NULL;
6312       __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6313       __kmp_str_split(tail, '-', &flag_val_str, &tail);
6314       file_name = tail;
6315       if (tail != NULL) {
6316         long *flag_addr = 0;
6317         long flag_val = 0;
6318         KMP_SSCANF(flag_addr_str, "%p", &flag_addr);
6319         KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6320         if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6321           // First, check whether environment-encoded address is mapped into
6322           // addr space.
6323           // If so, dereference it to see if it still has the right value.
6324           if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6325             neighbor = 1;
6326           } else {
6327             // If not, then we know the other copy of the library is no longer
6328             // running.
6329             neighbor = 2;
6330           }
6331         }
6332       }
6333       switch (neighbor) {
6334       case 0: // Cannot parse environment variable -- neighbor status unknown.
6335         // Assume it is the incompatible format of a future version of the
6336         // library. Assume the other library is alive.
6337         // WARN( ... ); // TODO: Issue a warning.
6338         file_name = "unknown library";
6339       // Attention! Falling through to the next case. That's intentional.
6340       case 1: { // Neighbor is alive.
6341         // Check it is allowed.
6342         char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6343         if (!__kmp_str_match_true(duplicate_ok)) {
6344           // That's not allowed. Issue fatal error.
6345           __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6346                       KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6347         }
6348         KMP_INTERNAL_FREE(duplicate_ok);
6349         __kmp_duplicate_library_ok = 1;
6350         done = 1; // Exit the loop.
6351       } break;
6352       case 2: { // Neighbor is dead.
6353         // Clear the variable and try to register library again.
6354         __kmp_env_unset(name);
6355       } break;
6356       default: { KMP_DEBUG_ASSERT(0); } break;
6357       }
6358     }
6359     KMP_INTERNAL_FREE((void *)value);
6360   }
6361   KMP_INTERNAL_FREE((void *)name);
6362 
6363 } // func __kmp_register_library_startup
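
// Non-compiled sketch of the liveness check used above. The registered value
// has the form "<flag address>-<flag value>-<library file>"; a second copy of
// the runtime that finds the variable already set parses it and decides
// whether the registering copy is still running. is_mapped() below is a
// placeholder for __kmp_is_address_mapped().
#if 0
static int neighbor_is_alive(long *flag_addr, long flag_val) {
  // Alive only if the encoded address is still mapped in this process and
  // still holds the encoded value; otherwise the variable is stale and the
  // dead neighbor's registration can safely be overwritten.
  return is_mapped(flag_addr) && *flag_addr == flag_val;
}
#endif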
6364 
6365 void __kmp_unregister_library(void) {
6366 
6367   char *name = __kmp_reg_status_name();
6368   char *value = __kmp_env_get(name);
6369 
6370   KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6371   KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6372   if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6373     // Ok, this is our variable. Delete it.
6374     __kmp_env_unset(name);
6375   }
6376 
6377   KMP_INTERNAL_FREE(__kmp_registration_str);
6378   KMP_INTERNAL_FREE(value);
6379   KMP_INTERNAL_FREE(name);
6380 
6381   __kmp_registration_flag = 0;
6382   __kmp_registration_str = NULL;
6383 
6384 } // __kmp_unregister_library
6385 
6386 // End of Library registration stuff.
6387 // -----------------------------------------------------------------------------
6388 
6389 #if KMP_MIC_SUPPORTED
6390 
6391 static void __kmp_check_mic_type() {
6392   kmp_cpuid_t cpuid_state = {0};
6393   kmp_cpuid_t *cs_p = &cpuid_state;
6394   __kmp_x86_cpuid(1, 0, cs_p);
6395   // We don't support mic1 at the moment
6396   if ((cs_p->eax & 0xff0) == 0xB10) {
6397     __kmp_mic_type = mic2;
6398   } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6399     __kmp_mic_type = mic3;
6400   } else {
6401     __kmp_mic_type = non_mic;
6402   }
6403 }
6404 
6405 #endif /* KMP_MIC_SUPPORTED */
6406 
6407 static void __kmp_do_serial_initialize(void) {
6408   int i, gtid;
6409   int size;
6410 
6411   KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6412 
6413   KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6414   KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6415   KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6416   KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6417   KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6418 
6419 #if OMPT_SUPPORT
6420   ompt_pre_init();
6421 #endif
6422 
6423   __kmp_validate_locks();
6424 
6425   /* Initialize internal memory allocator */
6426   __kmp_init_allocator();
6427 
6428   /* Register the library startup via an environment variable and check to see
6429      whether another copy of the library is already registered. */
6430 
6431   __kmp_register_library_startup();
6432 
6433   /* TODO reinitialization of library */
6434   if (TCR_4(__kmp_global.g.g_done)) {
6435     KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6436   }
6437 
6438   __kmp_global.g.g_abort = 0;
6439   TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6440 
6441 /* initialize the locks */
6442 #if KMP_USE_ADAPTIVE_LOCKS
6443 #if KMP_DEBUG_ADAPTIVE_LOCKS
6444   __kmp_init_speculative_stats();
6445 #endif
6446 #endif
6447 #if KMP_STATS_ENABLED
6448   __kmp_stats_init();
6449 #endif
6450   __kmp_init_lock(&__kmp_global_lock);
6451   __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6452   __kmp_init_lock(&__kmp_debug_lock);
6453   __kmp_init_atomic_lock(&__kmp_atomic_lock);
6454   __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6455   __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6456   __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6457   __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6458   __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6459   __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6460   __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6461   __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6462   __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6463   __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6464   __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6465   __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6466   __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6467   __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6468 #if KMP_USE_MONITOR
6469   __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6470 #endif
6471   __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6472 
6473   /* conduct initialization and initial setup of configuration */
6474 
6475   __kmp_runtime_initialize();
6476 
6477 #if KMP_MIC_SUPPORTED
6478   __kmp_check_mic_type();
6479 #endif
6480 
6481 // Some global variable initialization moved here from kmp_env_initialize()
6482 #ifdef KMP_DEBUG
6483   kmp_diag = 0;
6484 #endif
6485   __kmp_abort_delay = 0;
6486 
6487   // From __kmp_init_dflt_team_nth()
6488   /* assume the entire machine will be used */
6489   __kmp_dflt_team_nth_ub = __kmp_xproc;
6490   if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6491     __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6492   }
6493   if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6494     __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6495   }
6496   __kmp_max_nth = __kmp_sys_max_nth;
6497   __kmp_cg_max_nth = __kmp_sys_max_nth;
6498   __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6499   if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6500     __kmp_teams_max_nth = __kmp_sys_max_nth;
6501   }
6502 
6503   // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6504   // part
6505   __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6506 #if KMP_USE_MONITOR
6507   __kmp_monitor_wakeups =
6508       KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6509   __kmp_bt_intervals =
6510       KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6511 #endif
6512   // From "KMP_LIBRARY" part of __kmp_env_initialize()
6513   __kmp_library = library_throughput;
6514   // From KMP_SCHEDULE initialization
6515   __kmp_static = kmp_sch_static_balanced;
6516 // AC: do not use analytical here, because it is non-monotonous
6517 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6518 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6519 // need to repeat assignment
6520 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6521 // bit control and barrier method control parts
6522 #if KMP_FAST_REDUCTION_BARRIER
6523 #define kmp_reduction_barrier_gather_bb ((int)1)
6524 #define kmp_reduction_barrier_release_bb ((int)1)
6525 #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6526 #define kmp_reduction_barrier_release_pat bp_hyper_bar
6527 #endif // KMP_FAST_REDUCTION_BARRIER
6528   for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6529     __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6530     __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6531     __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6532     __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6533 #if KMP_FAST_REDUCTION_BARRIER
6534     if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
6535       // lin_64 ): hyper,1
6536       __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6537       __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6538       __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6539       __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6540     }
6541 #endif // KMP_FAST_REDUCTION_BARRIER
6542   }
6543 #if KMP_FAST_REDUCTION_BARRIER
6544 #undef kmp_reduction_barrier_release_pat
6545 #undef kmp_reduction_barrier_gather_pat
6546 #undef kmp_reduction_barrier_release_bb
6547 #undef kmp_reduction_barrier_gather_bb
6548 #endif // KMP_FAST_REDUCTION_BARRIER
6549 #if KMP_MIC_SUPPORTED
6550   if (__kmp_mic_type == mic2) { // KNC
6551     // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6552     __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6553     __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6554         1; // forkjoin release
6555     __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6556     __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6557   }
6558 #if KMP_FAST_REDUCTION_BARRIER
6559   if (__kmp_mic_type == mic2) { // KNC
6560     __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6561     __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6562   }
6563 #endif // KMP_FAST_REDUCTION_BARRIER
6564 #endif // KMP_MIC_SUPPORTED
6565 
6566 // From KMP_CHECKS initialization
6567 #ifdef KMP_DEBUG
6568   __kmp_env_checks = TRUE; /* development versions have the extra checks */
6569 #else
6570   __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6571 #endif
6572 
6573   // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6574   __kmp_foreign_tp = TRUE;
6575 
6576   __kmp_global.g.g_dynamic = FALSE;
6577   __kmp_global.g.g_dynamic_mode = dynamic_default;
6578 
6579   __kmp_env_initialize(NULL);
6580 
6581 // Print all messages in message catalog for testing purposes.
6582 #ifdef KMP_DEBUG
6583   char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6584   if (__kmp_str_match_true(val)) {
6585     kmp_str_buf_t buffer;
6586     __kmp_str_buf_init(&buffer);
6587     __kmp_i18n_dump_catalog(&buffer);
6588     __kmp_printf("%s", buffer.str);
6589     __kmp_str_buf_free(&buffer);
6590   }
6591   __kmp_env_free(&val);
6592 #endif
6593 
6594   __kmp_threads_capacity =
6595       __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6596   // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6597   __kmp_tp_capacity = __kmp_default_tp_capacity(
6598       __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6599 
6600   // If the library is shut down properly, both pools must be NULL. Just in
6601   // case, set them to NULL -- some memory may leak, but subsequent code will
6602   // work even if pools are not freed.
6603   KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6604   KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6605   KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6606   __kmp_thread_pool = NULL;
6607   __kmp_thread_pool_insert_pt = NULL;
6608   __kmp_team_pool = NULL;
6609 
6610   /* Allocate all of the variable sized records */
6611   /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6612    * expandable */
6613   /* Since allocation is cache-aligned, just add extra padding at the end */
6614   size =
6615       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6616       CACHE_LINE;
6617   __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6618   __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6619                                sizeof(kmp_info_t *) * __kmp_threads_capacity);
6620 
6621   /* init thread counts */
6622   KMP_DEBUG_ASSERT(__kmp_all_nth ==
6623                    0); // Asserts fail if the library is reinitializing and
6624   KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
6625   __kmp_all_nth = 0;
6626   __kmp_nth = 0;
6627 
6628   /* setup the uber master thread and hierarchy */
6629   gtid = __kmp_register_root(TRUE);
6630   KA_TRACE(10, ("__kmp_do_serial_initialize  T#%d\n", gtid));
6631   KMP_ASSERT(KMP_UBER_GTID(gtid));
6632   KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6633 
6634   KMP_MB(); /* Flush all pending memory write invalidates.  */
6635 
6636   __kmp_common_initialize();
6637 
6638 #if KMP_OS_UNIX
6639   /* invoke the child fork handler */
6640   __kmp_register_atfork();
6641 #endif
6642 
6643 #if !defined KMP_DYNAMIC_LIB
6644   {
6645     /* Invoke the exit handler when the program finishes, only for static
6646        library. For dynamic library, we already have _fini and DllMain. */
6647     int rc = atexit(__kmp_internal_end_atexit);
6648     if (rc != 0) {
6649       __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6650                   __kmp_msg_null);
6651     }
6652   }
6653 #endif
6654 
6655 #if KMP_HANDLE_SIGNALS
6656 #if KMP_OS_UNIX
6657   /* NOTE: make sure that this is called before the user installs their own
6658      signal handlers so that the user handlers are called first. this way they
6659      can return false, not call our handler, avoid terminating the library, and
6660      continue execution where they left off. */
6661   __kmp_install_signals(FALSE);
6662 #endif /* KMP_OS_UNIX */
6663 #if KMP_OS_WINDOWS
6664   __kmp_install_signals(TRUE);
6665 #endif /* KMP_OS_WINDOWS */
6666 #endif
6667 
6668   /* we have finished the serial initialization */
6669   __kmp_init_counter++;
6670 
6671   __kmp_init_serial = TRUE;
6672 
6673   if (__kmp_settings) {
6674     __kmp_env_print();
6675   }
6676 
6677 #if OMP_40_ENABLED
6678   if (__kmp_display_env || __kmp_display_env_verbose) {
6679     __kmp_env_print_2();
6680   }
6681 #endif // OMP_40_ENABLED
6682 
6683 #if OMPT_SUPPORT
6684   ompt_post_init();
6685 #endif
6686 
6687   KMP_MB();
6688 
6689   KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
6690 }
6691 
6692 void __kmp_serial_initialize(void) {
6693   if (__kmp_init_serial) {
6694     return;
6695   }
6696   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6697   if (__kmp_init_serial) {
6698     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6699     return;
6700   }
6701   __kmp_do_serial_initialize();
6702   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6703 }
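
// __kmp_serial_initialize() above, and the middle/parallel initializers that
// follow, all use the same double-checked pattern around a bootstrap lock; a
// non-compiled sketch with placeholder names (init_done, bootstrap_lock,
// do_init, acquire, release):
#if 0
void lazy_initialize(void) {
  if (init_done) // fast path: already initialized, no lock taken
    return;
  acquire(&bootstrap_lock);
  if (init_done) { // lost the race: another thread finished the init
    release(&bootstrap_lock);
    return;
  }
  do_init(); // we won the race: perform the one-time initialization
  init_done = TRUE;
  release(&bootstrap_lock);
}
#endif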
6704 
6705 static void __kmp_do_middle_initialize(void) {
6706   int i, j;
6707   int prev_dflt_team_nth;
6708 
6709   if (!__kmp_init_serial) {
6710     __kmp_do_serial_initialize();
6711   }
6712 
6713   KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
6714 
6715   // Save the previous value for the __kmp_dflt_team_nth so that
6716   // we can avoid some reinitialization if it hasn't changed.
6717   prev_dflt_team_nth = __kmp_dflt_team_nth;
6718 
6719 #if KMP_AFFINITY_SUPPORTED
6720   // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6721   // number of cores on the machine.
6722   __kmp_affinity_initialize();
6723 
6724   // Run through the __kmp_threads array and set the affinity mask
6725   // for each root thread that is currently registered with the RTL.
6726   for (i = 0; i < __kmp_threads_capacity; i++) {
6727     if (TCR_PTR(__kmp_threads[i]) != NULL) {
6728       __kmp_affinity_set_init_mask(i, TRUE);
6729     }
6730   }
6731 #endif /* KMP_AFFINITY_SUPPORTED */
6732 
6733   KMP_ASSERT(__kmp_xproc > 0);
6734   if (__kmp_avail_proc == 0) {
6735     __kmp_avail_proc = __kmp_xproc;
6736   }
6737 
6738   // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
6739   // correct them now
6740   j = 0;
6741   while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
6742     __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
6743         __kmp_avail_proc;
6744     j++;
6745   }
6746 
6747   if (__kmp_dflt_team_nth == 0) {
6748 #ifdef KMP_DFLT_NTH_CORES
6749     // Default #threads = #cores
6750     __kmp_dflt_team_nth = __kmp_ncores;
6751     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6752                   "__kmp_ncores (%d)\n",
6753                   __kmp_dflt_team_nth));
6754 #else
6755     // Default #threads = #available OS procs
6756     __kmp_dflt_team_nth = __kmp_avail_proc;
6757     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6758                   "__kmp_avail_proc(%d)\n",
6759                   __kmp_dflt_team_nth));
6760 #endif /* KMP_DFLT_NTH_CORES */
6761   }
6762 
6763   if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
6764     __kmp_dflt_team_nth = KMP_MIN_NTH;
6765   }
6766   if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
6767     __kmp_dflt_team_nth = __kmp_sys_max_nth;
6768   }
6769 
6770   // There's no harm in continuing if the following check fails,
6771   // but it indicates an error in the previous logic.
6772   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
6773 
6774   if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
6775     // Run through the __kmp_threads array and set the num threads icv for each
6776     // root thread that is currently registered with the RTL (which has not
6777     // already explicitly set its nthreads-var with a call to
6778     // omp_set_num_threads()).
6779     for (i = 0; i < __kmp_threads_capacity; i++) {
6780       kmp_info_t *thread = __kmp_threads[i];
6781       if (thread == NULL)
6782         continue;
6783       if (thread->th.th_current_task->td_icvs.nproc != 0)
6784         continue;
6785 
6786       set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
6787     }
6788   }
6789   KA_TRACE(
6790       20,
6791       ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
6792        __kmp_dflt_team_nth));
6793 
6794 #ifdef KMP_ADJUST_BLOCKTIME
6795   /* Adjust blocktime to zero if necessary  now that __kmp_avail_proc is set */
6796   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6797     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6798     if (__kmp_nth > __kmp_avail_proc) {
6799       __kmp_zero_bt = TRUE;
6800     }
6801   }
6802 #endif /* KMP_ADJUST_BLOCKTIME */
6803 
6804   /* we have finished middle initialization */
6805   TCW_SYNC_4(__kmp_init_middle, TRUE);
6806 
6807   KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
6808 }
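
// The KMP_ADJUST_BLOCKTIME blocks here and in __kmp_free_thread() /
// __kmp_reap_thread() all apply the same oversubscription rule; a
// non-compiled sketch with placeholder names:
#if 0
static void adjust_blocktime(void) {
  // Only force blocktime to zero while the process is oversubscribed, and
  // only when the user did not set KMP_BLOCKTIME explicitly.
  if (!user_set_blocktime && avail_proc > 0)
    force_zero_blocktime = (current_nthreads > avail_proc);
}
#endif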
6809 
6810 void __kmp_middle_initialize(void) {
6811   if (__kmp_init_middle) {
6812     return;
6813   }
6814   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6815   if (__kmp_init_middle) {
6816     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6817     return;
6818   }
6819   __kmp_do_middle_initialize();
6820   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6821 }
6822 
6823 void __kmp_parallel_initialize(void) {
6824   int gtid = __kmp_entry_gtid(); // this might be a new root
6825 
6826   /* synchronize parallel initialization (for sibling) */
6827   if (TCR_4(__kmp_init_parallel))
6828     return;
6829   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6830   if (TCR_4(__kmp_init_parallel)) {
6831     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6832     return;
6833   }
6834 
6835   /* TODO reinitialization after we have already shut down */
6836   if (TCR_4(__kmp_global.g.g_done)) {
6837     KA_TRACE(
6838         10,
6839         ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
6840     __kmp_infinite_loop();
6841   }
6842 
6843   /* jc: The lock __kmp_initz_lock is already held, so calling
6844      __kmp_serial_initialize would cause a deadlock.  So we call
6845      __kmp_do_serial_initialize directly. */
6846   if (!__kmp_init_middle) {
6847     __kmp_do_middle_initialize();
6848   }
6849 
6850   /* begin initialization */
6851   KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
6852   KMP_ASSERT(KMP_UBER_GTID(gtid));
6853 
6854 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6855   // Save the FP control regs.
6856   // Worker threads will set theirs to these values at thread startup.
6857   __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
6858   __kmp_store_mxcsr(&__kmp_init_mxcsr);
6859   __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
6860 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
6861 
6862 #if KMP_OS_UNIX
6863 #if KMP_HANDLE_SIGNALS
6864   /*  must be after __kmp_serial_initialize  */
6865   __kmp_install_signals(TRUE);
6866 #endif
6867 #endif
6868 
6869   __kmp_suspend_initialize();
6870 
6871 #if defined(USE_LOAD_BALANCE)
6872   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6873     __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
6874   }
6875 #else
6876   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6877     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
6878   }
6879 #endif
6880 
6881   if (__kmp_version) {
6882     __kmp_print_version_2();
6883   }
6884 
6885   /* we have finished parallel initialization */
6886   TCW_SYNC_4(__kmp_init_parallel, TRUE);
6887 
6888   KMP_MB();
6889   KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
6890 
6891   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6892 }
6893 
6894 /* ------------------------------------------------------------------------ */
6895 
6896 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
6897                                    kmp_team_t *team) {
6898   kmp_disp_t *dispatch;
6899 
6900   KMP_MB();
6901 
6902   /* none of the threads have encountered any constructs, yet. */
6903   this_thr->th.th_local.this_construct = 0;
6904 #if KMP_CACHE_MANAGE
6905   KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
6906 #endif /* KMP_CACHE_MANAGE */
6907   dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
6908   KMP_DEBUG_ASSERT(dispatch);
6909   KMP_DEBUG_ASSERT(team->t.t_dispatch);
6910   // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
6911   // this_thr->th.th_info.ds.ds_tid ] );
6912 
6913   dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
6914 #if OMP_45_ENABLED
6915   dispatch->th_doacross_buf_idx =
6916       0; /* reset the doacross dispatch buffer counter */
6917 #endif
6918   if (__kmp_env_consistency_check)
6919     __kmp_push_parallel(gtid, team->t.t_ident);
6920 
6921   KMP_MB(); /* Flush all pending memory write invalidates.  */
6922 }
6923 
6924 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
6925                                   kmp_team_t *team) {
6926   if (__kmp_env_consistency_check)
6927     __kmp_pop_parallel(gtid, team->t.t_ident);
6928 
6929   __kmp_finish_implicit_task(this_thr);
6930 }
6931 
6932 int __kmp_invoke_task_func(int gtid) {
6933   int rc;
6934   int tid = __kmp_tid_from_gtid(gtid);
6935   kmp_info_t *this_thr = __kmp_threads[gtid];
6936   kmp_team_t *team = this_thr->th.th_team;
6937 
6938   __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
6939 #if USE_ITT_BUILD
6940   if (__itt_stack_caller_create_ptr) {
6941     __kmp_itt_stack_callee_enter(
6942         (__itt_caller)
6943             team->t.t_stack_id); // inform ittnotify about entering user's code
6944   }
6945 #endif /* USE_ITT_BUILD */
6946 #if INCLUDE_SSC_MARKS
6947   SSC_MARK_INVOKING();
6948 #endif
6949 
6950 #if OMPT_SUPPORT
6951   void *dummy;
6952   void **exit_runtime_p;
6953   ompt_data_t *my_task_data;
6954   ompt_data_t *my_parallel_data;
6955   int ompt_team_size;
6956 
6957   if (ompt_enabled.enabled) {
6958     exit_runtime_p = &(team->t.t_implicit_task_taskdata[tid]
6959                            .ompt_task_info.frame.exit_runtime_frame);
6960   } else {
6961     exit_runtime_p = &dummy;
6962   }
6963 
6964   my_task_data =
6965       &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
6966   my_parallel_data = &(team->t.ompt_team_info.parallel_data);
6967   if (ompt_enabled.ompt_callback_implicit_task) {
6968     ompt_team_size = team->t.t_nproc;
6969     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
6970         ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
6971         __kmp_tid_from_gtid(gtid));
6972   }
6973 #endif
6974 
6975   {
6976     KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
6977     KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
6978     rc =
6979         __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
6980                                tid, (int)team->t.t_argc, (void **)team->t.t_argv
6981 #if OMPT_SUPPORT
6982                                ,
6983                                exit_runtime_p
6984 #endif
6985                                );
6986 #if OMPT_SUPPORT
6987     *exit_runtime_p = NULL;
6988 #endif
6989   }
6990 
6991 #if USE_ITT_BUILD
6992   if (__itt_stack_caller_create_ptr) {
6993     __kmp_itt_stack_callee_leave(
6994         (__itt_caller)
6995             team->t.t_stack_id); // inform ittnotify about leaving user's code
6996   }
6997 #endif /* USE_ITT_BUILD */
6998   __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
6999 
7000   return rc;
7001 }
7002 
7003 #if OMP_40_ENABLED
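// Illustrative sketch only (the compiler-facing __kmpc_* entry points are
// defined elsewhere in the runtime, not in this file): a construct such as
//
//   #pragma omp teams num_teams(2)
//   { /* ... */ }
//
// is typically lowered to a __kmpc_fork_teams() call, which records the user's
// outlined function in th_teams_microtask and then forks with
// __kmp_teams_master() below as the routine each team master executes.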
7004 void __kmp_teams_master(int gtid) {
  // This routine is called by all master threads in a teams construct
7006   kmp_info_t *thr = __kmp_threads[gtid];
7007   kmp_team_t *team = thr->th.th_team;
7008   ident_t *loc = team->t.t_ident;
7009   thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7010   KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7011   KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7012   KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7013                 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
// Launch the league of teams now, but do not let the workers execute
// (they hang on the fork barrier until the next parallel region)
7016 #if INCLUDE_SSC_MARKS
7017   SSC_MARK_FORKING();
7018 #endif
7019   __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7020                   (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7021                   VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7022 #if INCLUDE_SSC_MARKS
7023   SSC_MARK_JOINING();
7024 #endif
7025 
  // AC: the last parameter "1" eliminates the join barrier, which would not
  // work because the worker threads are in a fork barrier waiting for more
  // parallel regions
7028   __kmp_join_call(loc, gtid
7029 #if OMPT_SUPPORT
7030                   ,
7031                   fork_context_intel
7032 #endif
7033                   ,
7034                   1);
7035 }
7036 
7037 int __kmp_invoke_teams_master(int gtid) {
7038   kmp_info_t *this_thr = __kmp_threads[gtid];
7039   kmp_team_t *team = this_thr->th.th_team;
7040 #if KMP_DEBUG
7041   if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7042     KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7043                      (void *)__kmp_teams_master);
7044 #endif
7045   __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7046   __kmp_teams_master(gtid);
7047   __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7048   return 1;
7049 }
7050 #endif /* OMP_40_ENABLED */
7051 
/* This sets the requested number of threads for the next parallel region
   encountered by this team. Since this should be enclosed in the fork/join
   critical section, it should avoid race conditions with asymmetrical nested
   parallelism. */
7056 
7057 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7058   kmp_info_t *thr = __kmp_threads[gtid];
7059 
7060   if (num_threads > 0)
7061     thr->th.th_set_nproc = num_threads;
7062 }
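// Illustrative sketch only (the __kmpc_* wrappers live elsewhere in the
// runtime): user code such as
//
//   #pragma omp parallel num_threads(4)
//   { /* ... */ }
//
// is typically compiled into a __kmpc_push_num_threads(loc, gtid, 4) call,
// which forwards here so that the subsequent fork picks up th_set_nproc.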
7063 
7064 #if OMP_40_ENABLED
7065 
/* This sets the requested number of teams for the teams region and/or
   the number of threads for the next parallel region encountered. */
7068 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7069                           int num_threads) {
7070   kmp_info_t *thr = __kmp_threads[gtid];
7071   KMP_DEBUG_ASSERT(num_teams >= 0);
7072   KMP_DEBUG_ASSERT(num_threads >= 0);
7073 
7074   if (num_teams == 0)
7075     num_teams = 1; // default number of teams is 1.
  if (num_teams > __kmp_teams_max_nth) { // too many teams requested?
7077     if (!__kmp_reserve_warn) {
7078       __kmp_reserve_warn = 1;
7079       __kmp_msg(kmp_ms_warning,
7080                 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7081                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7082     }
7083     num_teams = __kmp_teams_max_nth;
7084   }
7085   // Set number of teams (number of threads in the outer "parallel" of the
7086   // teams)
7087   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7088 
7089   // Remember the number of threads for inner parallel regions
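  // Worked example with hypothetical values: if __kmp_avail_proc == 32,
  // __kmp_teams_max_nth == 64 and num_teams == 4, the default below is
  // num_threads = 32 / 4 = 8; since 4 * 8 = 32 <= 64, no further clamping is
  // needed. If the product exceeded __kmp_teams_max_nth, num_threads would be
  // reduced to __kmp_teams_max_nth / num_teams instead.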
7090   if (num_threads == 0) {
7091     if (!TCR_4(__kmp_init_middle))
7092       __kmp_middle_initialize(); // get __kmp_avail_proc calculated
7093     num_threads = __kmp_avail_proc / num_teams;
7094     if (num_teams * num_threads > __kmp_teams_max_nth) {
      // adjust num_threads without a warning as it is not a user setting
7096       num_threads = __kmp_teams_max_nth / num_teams;
7097     }
7098   } else {
7099     if (num_teams * num_threads > __kmp_teams_max_nth) {
7100       int new_threads = __kmp_teams_max_nth / num_teams;
      // The user asked for more threads than KMP_TEAMS_THREAD_LIMIT allows.
      if (!__kmp_reserve_warn) {
        __kmp_reserve_warn = 1;
7103         __kmp_msg(kmp_ms_warning,
7104                   KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7105                   KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7106       }
7107       num_threads = new_threads;
7108     }
7109   }
7110   thr->th.th_teams_size.nth = num_threads;
7111 }
7112 
7113 // Set the proc_bind var to use in the following parallel region.
7114 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7115   kmp_info_t *thr = __kmp_threads[gtid];
7116   thr->th.th_set_proc_bind = proc_bind;
7117 }
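// Illustrative sketch only (the __kmpc_* wrapper is defined elsewhere in the
// runtime): a proc_bind(close) clause on a parallel construct is typically
// lowered to a __kmpc_push_proc_bind() call, which forwards here so that the
// next fork can apply the requested binding.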
7118 
7119 #endif /* OMP_40_ENABLED */
7120 
7121 /* Launch the worker threads into the microtask. */
7122 
7123 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7124   kmp_info_t *this_thr = __kmp_threads[gtid];
7125 
7126 #ifdef KMP_DEBUG
7127   int f;
7128 #endif /* KMP_DEBUG */
7129 
7130   KMP_DEBUG_ASSERT(team);
7131   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7132   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7133   KMP_MB(); /* Flush all pending memory write invalidates.  */
7134 
7135   team->t.t_construct = 0; /* no single directives seen yet */
7136   team->t.t_ordered.dt.t_value =
7137       0; /* thread 0 enters the ordered section first */
7138 
7139   /* Reset the identifiers on the dispatch buffer */
7140   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7141   if (team->t.t_max_nproc > 1) {
7142     int i;
7143     for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7144       team->t.t_disp_buffer[i].buffer_index = i;
7145 #if OMP_45_ENABLED
7146       team->t.t_disp_buffer[i].doacross_buf_idx = i;
7147 #endif
7148     }
7149   } else {
7150     team->t.t_disp_buffer[0].buffer_index = 0;
7151 #if OMP_45_ENABLED
7152     team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7153 #endif
7154   }
7155 
7156   KMP_MB(); /* Flush all pending memory write invalidates.  */
7157   KMP_ASSERT(this_thr->th.th_team == team);
7158 
7159 #ifdef KMP_DEBUG
7160   for (f = 0; f < team->t.t_nproc; f++) {
7161     KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7162                      team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7163   }
7164 #endif /* KMP_DEBUG */
7165 
7166   /* release the worker threads so they may begin working */
7167   __kmp_fork_barrier(gtid, 0);
7168 }
7169 
7170 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7171   kmp_info_t *this_thr = __kmp_threads[gtid];
7172 
7173   KMP_DEBUG_ASSERT(team);
7174   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7175   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7176   KMP_MB(); /* Flush all pending memory write invalidates.  */
7177 
7178 /* Join barrier after fork */
7179 
7180 #ifdef KMP_DEBUG
7181   if (__kmp_threads[gtid] &&
7182       __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7183     __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7184                  __kmp_threads[gtid]);
7185     __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7186                  "team->t.t_nproc=%d\n",
7187                  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7188                  team->t.t_nproc);
7189     __kmp_print_structure();
7190   }
7191   KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7192                    __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7193 #endif /* KMP_DEBUG */
7194 
7195   __kmp_join_barrier(gtid); /* wait for everyone */
7196 #if OMPT_SUPPORT
7197   int ds_tid = this_thr->th.th_info.ds.ds_tid;
7198   if (this_thr->th.ompt_thread_info.state == omp_state_wait_barrier_implicit) {
7199     ompt_data_t *tId = OMPT_CUR_TASK_DATA(this_thr);
7200     ompt_data_t *pId = OMPT_CUR_TEAM_DATA(this_thr);
7201     this_thr->th.ompt_thread_info.state = omp_state_overhead;
7202 #if OMPT_OPTIONAL
7203     void *codeptr = NULL;
7204     if (KMP_MASTER_TID(ds_tid) &&
7205         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7206          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7207       codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7208 
7209     if (ompt_enabled.ompt_callback_sync_region_wait) {
7210       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7211           ompt_sync_region_barrier, ompt_scope_end, pId, tId, codeptr);
7212     }
7213     if (ompt_enabled.ompt_callback_sync_region) {
7214       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7215           ompt_sync_region_barrier, ompt_scope_end, pId, tId, codeptr);
7216     }
7217 #endif
7218     if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7219       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7220           ompt_scope_end, NULL, tId, 0, ds_tid);
7221     }
    // back to the overhead state (no longer waiting at the barrier)
7223     this_thr->th.ompt_thread_info.state = omp_state_overhead;
7224   }
7225 #endif
7226 
7227   KMP_MB(); /* Flush all pending memory write invalidates.  */
7228   KMP_ASSERT(this_thr->th.th_team == team);
7229 }
7230 
7231 /* ------------------------------------------------------------------------ */
7232 
7233 #ifdef USE_LOAD_BALANCE
7234 
// Return the number of worker threads actively spinning in the hot team, if
// we are at the outermost level of parallelism.  Otherwise, return 0.
7237 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7238   int i;
7239   int retval;
7240   kmp_team_t *hot_team;
7241 
7242   if (root->r.r_active) {
7243     return 0;
7244   }
7245   hot_team = root->r.r_hot_team;
7246   if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7247     return hot_team->t.t_nproc - 1; // Don't count master thread
7248   }
7249 
7250   // Skip the master thread - it is accounted for elsewhere.
7251   retval = 0;
7252   for (i = 1; i < hot_team->t.t_nproc; i++) {
7253     if (hot_team->t.t_threads[i]->th.th_active) {
7254       retval++;
7255     }
7256   }
7257   return retval;
7258 }
7259 
7260 // Perform an automatic adjustment to the number of
7261 // threads used by the next parallel region.
7262 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7263   int retval;
7264   int pool_active;
7265   int hot_team_active;
7266   int team_curr_active;
7267   int system_active;
7268 
7269   KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7270                 set_nproc));
7271   KMP_DEBUG_ASSERT(root);
7272   KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7273                        ->th.th_current_task->td_icvs.dynamic == TRUE);
7274   KMP_DEBUG_ASSERT(set_nproc > 1);
7275 
7276   if (set_nproc == 1) {
7277     KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7278     return 1;
7279   }
7280 
7281   // Threads that are active in the thread pool, active in the hot team for this
7282   // particular root (if we are at the outer par level), and the currently
7283   // executing thread (to become the master) are available to add to the new
7284   // team, but are currently contributing to the system load, and must be
7285   // accounted for.
7286   pool_active = TCR_4(__kmp_thread_pool_active_nth);
7287   hot_team_active = __kmp_active_hot_team_nproc(root);
7288   team_curr_active = pool_active + hot_team_active + 1;
7289 
7290   // Check the system load.
7291   system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7292   KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7293                 "hot team active = %d\n",
7294                 system_active, pool_active, hot_team_active));
7295 
7296   if (system_active < 0) {
7297     // There was an error reading the necessary info from /proc, so use the
7298     // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7299     // = dynamic_thread_limit, we shouldn't wind up getting back here.
7300     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7301     KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7302 
7303     // Make this call behave like the thread limit algorithm.
7304     retval = __kmp_avail_proc - __kmp_nth +
7305              (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7306     if (retval > set_nproc) {
7307       retval = set_nproc;
7308     }
7309     if (retval < KMP_MIN_NTH) {
7310       retval = KMP_MIN_NTH;
7311     }
7312 
7313     KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7314                   retval));
7315     return retval;
7316   }
7317 
  // There is a slight delay in the load balance algorithm in detecting newly
  // running procs. The real system load at this instant should be at least as
  // large as the number of active OMP threads available to add to the team.
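  // Worked example with hypothetical values: if __kmp_avail_proc == 16,
  // system_active == 10 and team_curr_active == 3, the formula below proposes
  // 16 - 10 + 3 = 9 threads, which is then clamped to the range
  // [KMP_MIN_NTH, set_nproc].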
7321   if (system_active < team_curr_active) {
7322     system_active = team_curr_active;
7323   }
7324   retval = __kmp_avail_proc - system_active + team_curr_active;
7325   if (retval > set_nproc) {
7326     retval = set_nproc;
7327   }
7328   if (retval < KMP_MIN_NTH) {
7329     retval = KMP_MIN_NTH;
7330   }
7331 
7332   KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7333   return retval;
7334 } // __kmp_load_balance_nproc()
7335 
7336 #endif /* USE_LOAD_BALANCE */
7337 
7338 /* ------------------------------------------------------------------------ */
7339 
7340 /* NOTE: this is called with the __kmp_init_lock held */
7341 void __kmp_cleanup(void) {
7342   int f;
7343 
7344   KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7345 
7346   if (TCR_4(__kmp_init_parallel)) {
7347 #if KMP_HANDLE_SIGNALS
7348     __kmp_remove_signals();
7349 #endif
7350     TCW_4(__kmp_init_parallel, FALSE);
7351   }
7352 
7353   if (TCR_4(__kmp_init_middle)) {
7354 #if KMP_AFFINITY_SUPPORTED
7355     __kmp_affinity_uninitialize();
7356 #endif /* KMP_AFFINITY_SUPPORTED */
7357     __kmp_cleanup_hierarchy();
7358     TCW_4(__kmp_init_middle, FALSE);
7359   }
7360 
7361   KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7362 
7363   if (__kmp_init_serial) {
7364     __kmp_runtime_destroy();
7365     __kmp_init_serial = FALSE;
7366   }
7367 
7368   for (f = 0; f < __kmp_threads_capacity; f++) {
7369     if (__kmp_root[f] != NULL) {
7370       __kmp_free(__kmp_root[f]);
7371       __kmp_root[f] = NULL;
7372     }
7373   }
7374   __kmp_free(__kmp_threads);
  // __kmp_threads and __kmp_root were allocated at once, as a single block, so
  // there is no need to free __kmp_root separately.
7377   __kmp_threads = NULL;
7378   __kmp_root = NULL;
7379   __kmp_threads_capacity = 0;
7380 
7381 #if KMP_USE_DYNAMIC_LOCK
7382   __kmp_cleanup_indirect_user_locks();
7383 #else
7384   __kmp_cleanup_user_locks();
7385 #endif
7386 
7387 #if KMP_AFFINITY_SUPPORTED
7388   KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7389   __kmp_cpuinfo_file = NULL;
7390 #endif /* KMP_AFFINITY_SUPPORTED */
7391 
7392 #if KMP_USE_ADAPTIVE_LOCKS
7393 #if KMP_DEBUG_ADAPTIVE_LOCKS
7394   __kmp_print_speculative_stats();
7395 #endif
7396 #endif
7397   KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7398   __kmp_nested_nth.nth = NULL;
7399   __kmp_nested_nth.size = 0;
7400   __kmp_nested_nth.used = 0;
7401   KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7402   __kmp_nested_proc_bind.bind_types = NULL;
7403   __kmp_nested_proc_bind.size = 0;
7404   __kmp_nested_proc_bind.used = 0;
7405 
7406   __kmp_i18n_catclose();
7407 
7408 #if KMP_STATS_ENABLED
7409   __kmp_stats_fini();
7410 #endif
7411 
7412   KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7413 }
7414 
7415 /* ------------------------------------------------------------------------ */
7416 
7417 int __kmp_ignore_mppbeg(void) {
7418   char *env;
7419 
7420   if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7421     if (__kmp_str_match_false(env))
7422       return FALSE;
7423   }
  // By default __kmpc_begin() is a no-op.
7425   return TRUE;
7426 }
7427 
7428 int __kmp_ignore_mppend(void) {
7429   char *env;
7430 
7431   if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7432     if (__kmp_str_match_false(env))
7433       return FALSE;
7434   }
  // By default __kmpc_end() is a no-op.
7436   return TRUE;
7437 }
7438 
7439 void __kmp_internal_begin(void) {
7440   int gtid;
7441   kmp_root_t *root;
7442 
7443   /* this is a very important step as it will register new sibling threads
7444      and assign these new uber threads a new gtid */
7445   gtid = __kmp_entry_gtid();
7446   root = __kmp_threads[gtid]->th.th_root;
7447   KMP_ASSERT(KMP_UBER_GTID(gtid));
7448 
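  // Double-checked pattern: r_begin is tested without the lock first, then
  // re-tested under r_begin_lock so that only one thread marks this root as
  // begun.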
7449   if (root->r.r_begin)
7450     return;
7451   __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7452   if (root->r.r_begin) {
7453     __kmp_release_lock(&root->r.r_begin_lock, gtid);
7454     return;
7455   }
7456 
7457   root->r.r_begin = TRUE;
7458 
7459   __kmp_release_lock(&root->r.r_begin_lock, gtid);
7460 }
7461 
7462 /* ------------------------------------------------------------------------ */
7463 
7464 void __kmp_user_set_library(enum library_type arg) {
7465   int gtid;
7466   kmp_root_t *root;
7467   kmp_info_t *thread;
7468 
7469   /* first, make sure we are initialized so we can get our gtid */
7470 
7471   gtid = __kmp_entry_gtid();
7472   thread = __kmp_threads[gtid];
7473 
7474   root = thread->th.th_root;
7475 
7476   KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7477                 library_serial));
7478   if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7479                                   thread */
7480     KMP_WARNING(SetLibraryIncorrectCall);
7481     return;
7482   }
7483 
7484   switch (arg) {
7485   case library_serial:
7486     thread->th.th_set_nproc = 0;
7487     set__nproc(thread, 1);
7488     break;
7489   case library_turnaround:
7490     thread->th.th_set_nproc = 0;
7491     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7492                                            : __kmp_dflt_team_nth_ub);
7493     break;
7494   case library_throughput:
7495     thread->th.th_set_nproc = 0;
7496     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7497                                            : __kmp_dflt_team_nth_ub);
7498     break;
7499   default:
7500     KMP_FATAL(UnknownLibraryType, arg);
7501   }
7502 
7503   __kmp_aux_set_library(arg);
7504 }
7505 
7506 void __kmp_aux_set_stacksize(size_t arg) {
7507   if (!__kmp_init_serial)
7508     __kmp_serial_initialize();
7509 
7510 #if KMP_OS_DARWIN
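  // Round a request that is not page-aligned up to the next 0x1000-byte (4 KB)
  // boundary; the "arg + 0x1000" test guards against wrap-around on overflow.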
7511   if (arg & (0x1000 - 1)) {
7512     arg &= ~(0x1000 - 1);
7513     if (arg + 0x1000) /* check for overflow if we round up */
7514       arg += 0x1000;
7515   }
7516 #endif
7517   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7518 
7519   /* only change the default stacksize before the first parallel region */
7520   if (!TCR_4(__kmp_init_parallel)) {
7521     size_t value = arg; /* argument is in bytes */
7522 
7523     if (value < __kmp_sys_min_stksize)
7524       value = __kmp_sys_min_stksize;
7525     else if (value > KMP_MAX_STKSIZE)
7526       value = KMP_MAX_STKSIZE;
7527 
7528     __kmp_stksize = value;
7529 
7530     __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
7531   }
7532 
7533   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7534 }
7535 
7536 /* set the behaviour of the runtime library */
7537 /* TODO this can cause some odd behaviour with sibling parallelism... */
7538 void __kmp_aux_set_library(enum library_type arg) {
7539   __kmp_library = arg;
7540 
7541   switch (__kmp_library) {
7542   case library_serial: {
7543     KMP_INFORM(LibraryIsSerial);
7544     (void)__kmp_change_library(TRUE);
7545   } break;
7546   case library_turnaround:
7547     (void)__kmp_change_library(TRUE);
7548     break;
7549   case library_throughput:
7550     (void)__kmp_change_library(FALSE);
7551     break;
7552   default:
7553     KMP_FATAL(UnknownLibraryType, arg);
7554   }
7555 }
7556 
7557 /* ------------------------------------------------------------------------ */
7558 
7559 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
7560   int blocktime = arg; /* argument is in milliseconds */
7561 #if KMP_USE_MONITOR
7562   int bt_intervals;
7563 #endif
7564   int bt_set;
7565 
7566   __kmp_save_internal_controls(thread);
7567 
7568   /* Normalize and set blocktime for the teams */
7569   if (blocktime < KMP_MIN_BLOCKTIME)
7570     blocktime = KMP_MIN_BLOCKTIME;
7571   else if (blocktime > KMP_MAX_BLOCKTIME)
7572     blocktime = KMP_MAX_BLOCKTIME;
7573 
7574   set__blocktime_team(thread->th.th_team, tid, blocktime);
7575   set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
7576 
7577 #if KMP_USE_MONITOR
7578   /* Calculate and set blocktime intervals for the teams */
7579   bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
7580 
7581   set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
7582   set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
7583 #endif
7584 
  /* Record that the blocktime was explicitly set */
7586   bt_set = TRUE;
7587 
7588   set__bt_set_team(thread->th.th_team, tid, bt_set);
7589   set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
7590 #if KMP_USE_MONITOR
7591   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
7592                 "bt_intervals=%d, monitor_updates=%d\n",
7593                 __kmp_gtid_from_tid(tid, thread->th.th_team),
7594                 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
7595                 __kmp_monitor_wakeups));
7596 #else
7597   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
7598                 __kmp_gtid_from_tid(tid, thread->th.th_team),
7599                 thread->th.th_team->t.t_id, tid, blocktime));
7600 #endif
7601 }
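// Illustrative sketch only (the public wrappers are defined elsewhere in the
// runtime): a call such as
//
//   kmp_set_blocktime(200); // spin for ~200 ms before worker threads sleep
//
// is expected to reach this routine for the calling thread, while the
// KMP_BLOCKTIME environment variable establishes the corresponding default at
// startup.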
7602 
7603 void __kmp_aux_set_defaults(char const *str, int len) {
7604   if (!__kmp_init_serial) {
7605     __kmp_serial_initialize();
7606   }
7607   __kmp_env_initialize(str);
7608 
7609   if (__kmp_settings
7610 #if OMP_40_ENABLED
7611       || __kmp_display_env || __kmp_display_env_verbose
7612 #endif // OMP_40_ENABLED
7613       ) {
7614     __kmp_env_print();
7615   }
7616 } // __kmp_aux_set_defaults
7617 
7618 /* ------------------------------------------------------------------------ */
7619 /* internal fast reduction routines */
7620 
7621 PACKED_REDUCTION_METHOD_T
7622 __kmp_determine_reduction_method(
7623     ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
7624     void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
7625     kmp_critical_name *lck) {
7626 
  // Default reduction method: critical construct (lck != NULL, as in current
  // PAROPT).
  // If (reduce_data != NULL && reduce_func != NULL): the tree-reduction method
  // can be selected by the RTL.
  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE: the atomic reduce method
  // can be selected by the RTL.
  // Finally, it is up to the OpenMP RTL to decide which method to select among
  // those generated by PAROPT.
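  // Worked example (illustrative, absent a KMP_FORCE_REDUCTION override): on a
  // non-MIC x86_64 Linux host (teamsize_cutoff == 4), a team of 16 threads for
  // which both the atomic and the tree variants were generated selects
  // TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER, while a team of 2 to 4 threads
  // selects atomic_reduce_block (a serialized team of one uses
  // empty_reduce_block).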
7635 
7636   PACKED_REDUCTION_METHOD_T retval;
7637 
7638   int team_size;
7639 
7640   KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
7641   KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
7642 
7643 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
7644   ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
7645 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
7646 
7647   retval = critical_reduce_block;
7648 
  // An alternative way of getting the team size (with one dynamic
  // dereference) is slower.
7650   team_size = __kmp_get_team_num_threads(global_tid);
7651   if (team_size == 1) {
7652 
7653     retval = empty_reduce_block;
7654 
7655   } else {
7656 
7657     int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
7658     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
7659 
7660 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64
7661 
7662 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_WINDOWS ||       \
7663     KMP_OS_DARWIN
7664 
7665     int teamsize_cutoff = 4;
7666 
7667 #if KMP_MIC_SUPPORTED
7668     if (__kmp_mic_type != non_mic) {
7669       teamsize_cutoff = 8;
7670     }
7671 #endif
7672     if (tree_available) {
7673       if (team_size <= teamsize_cutoff) {
7674         if (atomic_available) {
7675           retval = atomic_reduce_block;
7676         }
7677       } else {
7678         retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
7679       }
7680     } else if (atomic_available) {
7681       retval = atomic_reduce_block;
7682     }
7683 #else
7684 #error "Unknown or unsupported OS"
7685 #endif // KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_WINDOWS ||
7686 // KMP_OS_DARWIN
7687 
7688 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
7689 
7690 #if KMP_OS_LINUX || KMP_OS_WINDOWS
7691 
7692     // basic tuning
7693 
7694     if (atomic_available) {
7695       if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
7696         retval = atomic_reduce_block;
7697       }
7698     } // otherwise: use critical section
7699 
7700 #elif KMP_OS_DARWIN
7701 
7702     if (atomic_available && (num_vars <= 3)) {
7703       retval = atomic_reduce_block;
7704     } else if (tree_available) {
7705       if ((reduce_size > (9 * sizeof(kmp_real64))) &&
7706           (reduce_size < (2000 * sizeof(kmp_real64)))) {
7707         retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
7708       }
7709     } // otherwise: use critical section
7710 
7711 #else
7712 #error "Unknown or unsupported OS"
7713 #endif
7714 
7715 #else
7716 #error "Unknown or unsupported architecture"
7717 #endif
7718   }
7719 
7720   // KMP_FORCE_REDUCTION
7721 
7722   // If the team is serialized (team_size == 1), ignore the forced reduction
7723   // method and stay with the unsynchronized method (empty_reduce_block)
7724   if (__kmp_force_reduction_method != reduction_method_not_defined &&
7725       team_size != 1) {
7726 
7727     PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
7728 
7729     int atomic_available, tree_available;
7730 
7731     switch ((forced_retval = __kmp_force_reduction_method)) {
7732     case critical_reduce_block:
7733       KMP_ASSERT(lck); // lck should be != 0
7734       break;
7735 
7736     case atomic_reduce_block:
7737       atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
7738       if (!atomic_available) {
7739         KMP_WARNING(RedMethodNotSupported, "atomic");
7740         forced_retval = critical_reduce_block;
7741       }
7742       break;
7743 
7744     case tree_reduce_block:
7745       tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
7746       if (!tree_available) {
7747         KMP_WARNING(RedMethodNotSupported, "tree");
7748         forced_retval = critical_reduce_block;
7749       } else {
7750 #if KMP_FAST_REDUCTION_BARRIER
7751         forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
7752 #endif
7753       }
7754       break;
7755 
7756     default:
7757       KMP_ASSERT(0); // "unsupported method specified"
7758     }
7759 
7760     retval = forced_retval;
7761   }
7762 
7763   KA_TRACE(10, ("reduction method selected=%08x\n", retval));
7764 
7765 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
7766 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
7767 
7768   return (retval);
7769 }
7770 
// This function is used for testing the set/get/determine reduce method
// machinery.
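// The low 8 bits of the packed value are assumed (per the packing scheme in
// kmp.h) to carry the barrier type used by tree reductions; shifting them away
// returns just the reduction-method id.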
7772 kmp_int32 __kmp_get_reduce_method(void) {
7773   return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
7774 }
7775