1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 //                     The LLVM Compiler Infrastructure
8 //
9 // This file is dual licensed under the MIT and the University of Illinois Open
10 // Source Licenses. See LICENSE.txt for details.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "kmp.h"
15 #include "kmp_affinity.h"
16 #include "kmp_atomic.h"
17 #include "kmp_environment.h"
18 #include "kmp_error.h"
19 #include "kmp_i18n.h"
20 #include "kmp_io.h"
21 #include "kmp_itt.h"
22 #include "kmp_settings.h"
23 #include "kmp_stats.h"
24 #include "kmp_str.h"
25 #include "kmp_wait_release.h"
26 #include "kmp_wrapper_getpid.h"
27 
28 #if OMPT_SUPPORT
29 #include "ompt-specific.h"
30 #endif
31 
32 /* these are temporary issues to be dealt with */
33 #define KMP_USE_PRCTL 0
34 
35 #if KMP_OS_WINDOWS
36 #include <process.h>
37 #endif
38 
39 #include "tsan_annotations.h"
40 
41 #if defined(KMP_GOMP_COMPAT)
42 char const __kmp_version_alt_comp[] =
43     KMP_VERSION_PREFIX "alternative compiler support: yes";
44 #endif /* defined(KMP_GOMP_COMPAT) */
45 
46 char const __kmp_version_omp_api[] = KMP_VERSION_PREFIX "API version: "
47 #if OMP_50_ENABLED
48                                                         "5.0 (201611)";
49 #elif OMP_45_ENABLED
50                                                         "4.5 (201511)";
51 #elif OMP_40_ENABLED
52                                                         "4.0 (201307)";
53 #else
54                                                         "3.1 (201107)";
55 #endif
56 
57 #ifdef KMP_DEBUG
58 char const __kmp_version_lock[] =
59     KMP_VERSION_PREFIX "lock type: run time selectable";
60 #endif /* KMP_DEBUG */
61 
62 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
63 
64 /* ------------------------------------------------------------------------ */
65 
66 kmp_info_t __kmp_monitor;
67 
68 /* Forward declarations */
69 
70 void __kmp_cleanup(void);
71 
72 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
73                                   int gtid);
74 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
75                                   kmp_internal_control_t *new_icvs,
76                                   ident_t *loc);
77 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
78 static void __kmp_partition_places(kmp_team_t *team,
79                                    int update_master_only = 0);
80 #endif
81 static void __kmp_do_serial_initialize(void);
82 void __kmp_fork_barrier(int gtid, int tid);
83 void __kmp_join_barrier(int gtid);
84 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
85                           kmp_internal_control_t *new_icvs, ident_t *loc);
86 
87 #ifdef USE_LOAD_BALANCE
88 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
89 #endif
90 
91 static int __kmp_expand_threads(int nNeed);
92 #if KMP_OS_WINDOWS
93 static int __kmp_unregister_root_other_thread(int gtid);
94 #endif
95 static void __kmp_unregister_library(void); // called by __kmp_internal_end()
96 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
97 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
98 
/* Calculate the identifier of the current thread */
/* Fast (and somewhat portable) way to get a unique identifier for the
   executing thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
102 int __kmp_get_global_thread_id() {
103   int i;
104   kmp_info_t **other_threads;
105   size_t stack_data;
106   char *stack_addr;
107   size_t stack_size;
108   char *stack_base;
109 
110   KA_TRACE(
111       1000,
112       ("*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
113        __kmp_nth, __kmp_all_nth));
114 
  /* JPH - To handle the case where __kmpc_end(0) is called immediately prior
     to a parallel region, this returns KMP_GTID_DNE to force serial_initialize
     by the caller. KMP_GTID_DNE then has to be handled at all call sites, or
     else __kmp_init_gtid must be guaranteed for this to work. */
119 
120   if (!TCR_4(__kmp_init_gtid))
121     return KMP_GTID_DNE;
122 
123 #ifdef KMP_TDATA_GTID
124   if (TCR_4(__kmp_gtid_mode) >= 3) {
125     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
126     return __kmp_gtid;
127   }
128 #endif
129   if (TCR_4(__kmp_gtid_mode) >= 2) {
130     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
131     return __kmp_gtid_get_specific();
132   }
133   KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
134 
135   stack_addr = (char *)&stack_data;
136   other_threads = __kmp_threads;
137 
138   /* ATT: The code below is a source of potential bugs due to unsynchronized
139      access to __kmp_threads array. For example:
140      1. Current thread loads other_threads[i] to thr and checks it, it is
141         non-NULL.
142      2. Current thread is suspended by OS.
143      3. Another thread unregisters and finishes (debug versions of free()
144         may fill memory with something like 0xEF).
145      4. Current thread is resumed.
146      5. Current thread reads junk from *thr.
147      TODO: Fix it.  --ln  */
148 
149   for (i = 0; i < __kmp_threads_capacity; i++) {
150 
151     kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
152     if (!thr)
153       continue;
154 
155     stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
156     stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
157 
158     /* stack grows down -- search through all of the active threads */
159 
160     if (stack_addr <= stack_base) {
161       size_t stack_diff = stack_base - stack_addr;
162 
163       if (stack_diff <= stack_size) {
164         /* The only way we can be closer than the allocated */
165         /* stack size is if we are running on this thread. */
166         KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
167         return i;
168       }
169     }
170   }
171 
  /* use the thread-specific (keyed TLS) value to try to determine our gtid */
173   KA_TRACE(1000,
174            ("*** __kmp_get_global_thread_id: internal alg. failed to find "
175             "thread, using TLS\n"));
176   i = __kmp_gtid_get_specific();
177 
178   /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */
179 
  /* if we haven't been assigned a gtid, then return the error code */
181   if (i < 0)
182     return i;
183 
184   /* dynamically updated stack window for uber threads to avoid get_specific
185      call */
186   if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
187     KMP_FATAL(StackOverflow, i);
188   }
189 
190   stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
191   if (stack_addr > stack_base) {
192     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
193     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
194             other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
195                 stack_base);
196   } else {
197     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
198             stack_base - stack_addr);
199   }
200 
201   /* Reprint stack bounds for ubermaster since they have been refined */
202   if (__kmp_storage_map) {
203     char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
204     char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
205     __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
206                                  other_threads[i]->th.th_info.ds.ds_stacksize,
207                                  "th_%d stack (refinement)", i);
208   }
209   return i;
210 }
211 
212 int __kmp_get_global_thread_id_reg() {
213   int gtid;
214 
215   if (!__kmp_init_serial) {
216     gtid = KMP_GTID_DNE;
217   } else
218 #ifdef KMP_TDATA_GTID
219       if (TCR_4(__kmp_gtid_mode) >= 3) {
220     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
221     gtid = __kmp_gtid;
222   } else
223 #endif
224       if (TCR_4(__kmp_gtid_mode) >= 2) {
225     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
226     gtid = __kmp_gtid_get_specific();
227   } else {
228     KA_TRACE(1000,
229              ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
230     gtid = __kmp_get_global_thread_id();
231   }
232 
233   /* we must be a new uber master sibling thread */
234   if (gtid == KMP_GTID_DNE) {
235     KA_TRACE(10,
236              ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
237               "Registering a new gtid.\n"));
238     __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
239     if (!__kmp_init_serial) {
240       __kmp_do_serial_initialize();
241       gtid = __kmp_gtid_get_specific();
242     } else {
243       gtid = __kmp_register_root(FALSE);
244     }
245     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
246     /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
247   }
248 
249   KMP_DEBUG_ASSERT(gtid >= 0);
250 
251   return gtid;
252 }
253 
254 /* caller must hold forkjoin_lock */
255 void __kmp_check_stack_overlap(kmp_info_t *th) {
256   int f;
257   char *stack_beg = NULL;
258   char *stack_end = NULL;
259   int gtid;
260 
261   KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
262   if (__kmp_storage_map) {
263     stack_end = (char *)th->th.th_info.ds.ds_stackbase;
264     stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
265 
266     gtid = __kmp_gtid_from_thread(th);
267 
268     if (gtid == KMP_GTID_MONITOR) {
269       __kmp_print_storage_map_gtid(
270           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
271           "th_%s stack (%s)", "mon",
272           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
273     } else {
274       __kmp_print_storage_map_gtid(
275           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
276           "th_%d stack (%s)", gtid,
277           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
278     }
279   }
280 
281   /* No point in checking ubermaster threads since they use refinement and
282    * cannot overlap */
283   gtid = __kmp_gtid_from_thread(th);
284   if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
285     KA_TRACE(10,
286              ("__kmp_check_stack_overlap: performing extensive checking\n"));
287     if (stack_beg == NULL) {
288       stack_end = (char *)th->th.th_info.ds.ds_stackbase;
289       stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
290     }
291 
292     for (f = 0; f < __kmp_threads_capacity; f++) {
293       kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
294 
295       if (f_th && f_th != th) {
296         char *other_stack_end =
297             (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
298         char *other_stack_beg =
299             other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
300         if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
301             (stack_end > other_stack_beg && stack_end < other_stack_end)) {
302 
303           /* Print the other stack values before the abort */
304           if (__kmp_storage_map)
305             __kmp_print_storage_map_gtid(
306                 -1, other_stack_beg, other_stack_end,
307                 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
308                 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
309 
310           __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
311                       __kmp_msg_null);
312         }
313       }
314     }
315   }
316   KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
317 }
318 
319 /* ------------------------------------------------------------------------ */
320 
321 void __kmp_infinite_loop(void) {
322   static int done = FALSE;
323 
324   while (!done) {
325     KMP_YIELD(1);
326   }
327 }
328 
329 #define MAX_MESSAGE 512
330 
331 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
332                                   char const *format, ...) {
333   char buffer[MAX_MESSAGE];
334   va_list ap;
335 
336   va_start(ap, format);
337   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
338                p2, (unsigned long)size, format);
339   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
340   __kmp_vprintf(kmp_err, buffer, ap);
341 #if KMP_PRINT_DATA_PLACEMENT
342   int node;
343   if (gtid >= 0) {
344     if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
345       if (__kmp_storage_map_verbose) {
346         node = __kmp_get_host_node(p1);
347         if (node < 0) /* doesn't work, so don't try this next time */
348           __kmp_storage_map_verbose = FALSE;
349         else {
350           char *last;
351           int lastNode;
352           int localProc = __kmp_get_cpu_from_gtid(gtid);
353 
354           const int page_size = KMP_GET_PAGE_SIZE();
355 
356           p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
357           p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
358           if (localProc >= 0)
359             __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
360                                  localProc >> 1);
361           else
362             __kmp_printf_no_lock("  GTID %d\n", gtid);
363 #if KMP_USE_PRCTL
364           /* The more elaborate format is disabled for now because of the prctl
365            * hanging bug. */
366           do {
367             last = p1;
368             lastNode = node;
369             /* This loop collates adjacent pages with the same host node. */
370             do {
              p1 = (char *)p1 + page_size;
372             } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
373             __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
374                                  lastNode);
375           } while (p1 <= p2);
376 #else
377           __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
378                                (char *)p1 + (page_size - 1),
379                                __kmp_get_host_node(p1));
380           if (p1 < p2) {
381             __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
382                                  (char *)p2 + (page_size - 1),
383                                  __kmp_get_host_node(p2));
384           }
385 #endif
386         }
387       }
388     } else
389       __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
390   }
391 #endif /* KMP_PRINT_DATA_PLACEMENT */
392   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
393 }
394 
395 void __kmp_warn(char const *format, ...) {
396   char buffer[MAX_MESSAGE];
397   va_list ap;
398 
399   if (__kmp_generate_warnings == kmp_warnings_off) {
400     return;
401   }
402 
403   va_start(ap, format);
404 
405   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
406   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
407   __kmp_vprintf(kmp_err, buffer, ap);
408   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
409 
410   va_end(ap);
411 }
412 
413 void __kmp_abort_process() {
414   // Later threads may stall here, but that's ok because abort() will kill them.
415   __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
416 
417   if (__kmp_debug_buf) {
418     __kmp_dump_debug_buffer();
419   }
420 
421   if (KMP_OS_WINDOWS) {
422     // Let other threads know of abnormal termination and prevent deadlock
423     // if abort happened during library initialization or shutdown
424     __kmp_global.g.g_abort = SIGABRT;
425 
    /* On Windows* OS, abort() by default causes a pop-up error box, which
       stalls nightly testing. Unfortunately, we cannot reliably suppress the
       pop-up error boxes. _set_abort_behavior() works well, but this function
       is not available in VS7 (this is not a problem for a DLL, but it is a
       problem for the static OpenMP RTL). SetErrorMode (and so, the timelimit
       utility) does not help, at least in some versions of the MS C RTL.

       It seems the following sequence is the only way to simulate abort() and
       avoid the pop-up error box. */
435     raise(SIGABRT);
436     _exit(3); // Just in case, if signal ignored, exit anyway.
437   } else {
438     abort();
439   }
440 
441   __kmp_infinite_loop();
442   __kmp_release_bootstrap_lock(&__kmp_exit_lock);
443 
444 } // __kmp_abort_process
445 
446 void __kmp_abort_thread(void) {
  // TODO: Eliminate the g_abort global variable and this function.
  // In case of abort, just call abort(); it will kill all the threads.
449   __kmp_infinite_loop();
450 } // __kmp_abort_thread
451 
452 /* Print out the storage map for the major kmp_info_t thread data structures
453    that are allocated together. */
454 
455 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
456   __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
457                                gtid);
458 
459   __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
460                                sizeof(kmp_desc_t), "th_%d.th_info", gtid);
461 
462   __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
463                                sizeof(kmp_local_t), "th_%d.th_local", gtid);
464 
465   __kmp_print_storage_map_gtid(
466       gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
467       sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
468 
469   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
470                                &thr->th.th_bar[bs_plain_barrier + 1],
471                                sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
472                                gtid);
473 
474   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
475                                &thr->th.th_bar[bs_forkjoin_barrier + 1],
476                                sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
477                                gtid);
478 
479 #if KMP_FAST_REDUCTION_BARRIER
480   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
481                                &thr->th.th_bar[bs_reduction_barrier + 1],
482                                sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
483                                gtid);
484 #endif // KMP_FAST_REDUCTION_BARRIER
485 }
486 
487 /* Print out the storage map for the major kmp_team_t team data structures
488    that are allocated together. */
489 
490 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
491                                          int team_id, int num_thr) {
492   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
493   __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
494                                header, team_id);
495 
496   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
497                                &team->t.t_bar[bs_last_barrier],
498                                sizeof(kmp_balign_team_t) * bs_last_barrier,
499                                "%s_%d.t_bar", header, team_id);
500 
501   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
502                                &team->t.t_bar[bs_plain_barrier + 1],
503                                sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
504                                header, team_id);
505 
506   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
507                                &team->t.t_bar[bs_forkjoin_barrier + 1],
508                                sizeof(kmp_balign_team_t),
509                                "%s_%d.t_bar[forkjoin]", header, team_id);
510 
511 #if KMP_FAST_REDUCTION_BARRIER
512   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
513                                &team->t.t_bar[bs_reduction_barrier + 1],
514                                sizeof(kmp_balign_team_t),
515                                "%s_%d.t_bar[reduction]", header, team_id);
516 #endif // KMP_FAST_REDUCTION_BARRIER
517 
518   __kmp_print_storage_map_gtid(
519       -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
520       sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
521 
522   __kmp_print_storage_map_gtid(
523       -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
524       sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
525 
526   __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
527                                &team->t.t_disp_buffer[num_disp_buff],
528                                sizeof(dispatch_shared_info_t) * num_disp_buff,
529                                "%s_%d.t_disp_buffer", header, team_id);
530 
531   __kmp_print_storage_map_gtid(-1, &team->t.t_taskq, &team->t.t_copypriv_data,
532                                sizeof(kmp_taskq_t), "%s_%d.t_taskq", header,
533                                team_id);
534 }
535 
536 static void __kmp_init_allocator() {}
537 static void __kmp_fini_allocator() {}
538 
539 /* ------------------------------------------------------------------------ */
540 
541 #ifdef KMP_DYNAMIC_LIB
542 #if KMP_OS_WINDOWS
543 
544 static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
545   // TODO: Change to __kmp_break_bootstrap_lock().
  __kmp_init_bootstrap_lock(lck); // re-initialize so the lock is released
547 }
548 
549 static void __kmp_reset_locks_on_process_detach(int gtid_req) {
550   int i;
551   int thread_count;
552 
  // PROCESS_DETACH is expected to be called by a thread that executes
  // ProcessExit() or FreeLibrary(). The OS terminates the other threads (except
  // the one calling ProcessExit or FreeLibrary), so it might be safe to access
  // __kmp_threads[] without taking the forkjoin_lock. However, in fact some
  // threads can still be alive here, although they are about to be terminated.
  // The threads in the array with ds_thread==0 are the most suspicious, so it
  // may actually be unsafe to access __kmp_threads[].
560 
561   // TODO: does it make sense to check __kmp_roots[] ?
562 
  // Check that there are no other live threads registered with the OpenMP
  // library.
565   while (1) {
566     thread_count = 0;
567     for (i = 0; i < __kmp_threads_capacity; ++i) {
568       if (!__kmp_threads)
569         continue;
570       kmp_info_t *th = __kmp_threads[i];
571       if (th == NULL)
572         continue;
573       int gtid = th->th.th_info.ds.ds_gtid;
574       if (gtid == gtid_req)
575         continue;
576       if (gtid < 0)
577         continue;
578       DWORD exit_val;
579       int alive = __kmp_is_thread_alive(th, &exit_val);
580       if (alive) {
581         ++thread_count;
582       }
583     }
584     if (thread_count == 0)
585       break; // success
586   }
587 
588   // Assume that I'm alone. Now it might be safe to check and reset locks.
589   // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
590   __kmp_reset_lock(&__kmp_forkjoin_lock);
591 #ifdef KMP_DEBUG
592   __kmp_reset_lock(&__kmp_stdio_lock);
593 #endif // KMP_DEBUG
594 }
595 
596 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
597   //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
598 
599   switch (fdwReason) {
600 
601   case DLL_PROCESS_ATTACH:
602     KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
603 
604     return TRUE;
605 
606   case DLL_PROCESS_DETACH:
607     KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
608 
609     if (lpReserved != NULL) {
610       // lpReserved is used for telling the difference:
611       //   lpReserved == NULL when FreeLibrary() was called,
612       //   lpReserved != NULL when the process terminates.
613       // When FreeLibrary() is called, worker threads remain alive. So they will
614       // release the forkjoin lock by themselves. When the process terminates,
615       // worker threads disappear triggering the problem of unreleased forkjoin
616       // lock as described below.
617 
618       // A worker thread can take the forkjoin lock. The problem comes up if
619       // that worker thread becomes dead before it releases the forkjoin lock.
620       // The forkjoin lock remains taken, while the thread executing
621       // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try
622       // to take the forkjoin lock and will always fail, so that the application
      // will never finish [normally]. This scenario is possible if
      // __kmpc_end() has not been executed. These are not corner cases but
      // rather common ones:
626       // - the main function was compiled by an alternative compiler;
627       // - the main function was compiled by icl but without /Qopenmp
628       //   (application with plugins);
629       // - application terminates by calling C exit(), Fortran CALL EXIT() or
630       //   Fortran STOP.
631       // - alive foreign thread prevented __kmpc_end from doing cleanup.
632       //
633       // This is a hack to work around the problem.
634       // TODO: !!! figure out something better.
635       __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
636     }
637 
638     __kmp_internal_end_library(__kmp_gtid_get_specific());
639 
640     return TRUE;
641 
642   case DLL_THREAD_ATTACH:
643     KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
644 
    /* if we want to register new sibling threads every time, call
     * __kmp_get_gtid() here */
647     return TRUE;
648 
649   case DLL_THREAD_DETACH:
650     KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
651 
652     __kmp_internal_end_thread(__kmp_gtid_get_specific());
653     return TRUE;
654   }
655 
656   return TRUE;
657 }
658 
659 #endif /* KMP_OS_WINDOWS */
660 #endif /* KMP_DYNAMIC_LIB */
661 
662 /* Change the library type to "status" and return the old type */
663 /* called from within initialization routines where __kmp_initz_lock is held */
664 int __kmp_change_library(int status) {
665   int old_status;
666 
  // check whether KMP_LIBRARY=throughput (even init count)
  old_status = __kmp_yield_init & 1;
669 
670   if (status) {
671     __kmp_yield_init |= 1; // throughput => turnaround (odd init count)
672   } else {
673     __kmp_yield_init &= ~1; // turnaround => throughput (even init count)
674   }
675 
676   return old_status; // return previous setting of whether
677   // KMP_LIBRARY=throughput
678 }
679 
680 /* __kmp_parallel_deo -- Wait until it's our turn. */
681 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
682   int gtid = *gtid_ref;
683 #ifdef BUILD_PARALLEL_ORDERED
684   kmp_team_t *team = __kmp_team_from_gtid(gtid);
685 #endif /* BUILD_PARALLEL_ORDERED */
686 
687   if (__kmp_env_consistency_check) {
688     if (__kmp_threads[gtid]->th.th_root->r.r_active)
689 #if KMP_USE_DYNAMIC_LOCK
690       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
691 #else
692       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
693 #endif
694   }
695 #ifdef BUILD_PARALLEL_ORDERED
696   if (!team->t.t_serialized) {
697     KMP_MB();
698     KMP_WAIT_YIELD(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid),
699                    KMP_EQ, NULL);
700     KMP_MB();
701   }
702 #endif /* BUILD_PARALLEL_ORDERED */
703 }
704 
705 /* __kmp_parallel_dxo -- Signal the next task. */
706 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
707   int gtid = *gtid_ref;
708 #ifdef BUILD_PARALLEL_ORDERED
709   int tid = __kmp_tid_from_gtid(gtid);
710   kmp_team_t *team = __kmp_team_from_gtid(gtid);
711 #endif /* BUILD_PARALLEL_ORDERED */
712 
713   if (__kmp_env_consistency_check) {
714     if (__kmp_threads[gtid]->th.th_root->r.r_active)
715       __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
716   }
717 #ifdef BUILD_PARALLEL_ORDERED
718   if (!team->t.t_serialized) {
719     KMP_MB(); /* Flush all pending memory write invalidates.  */
720 
721     /* use the tid of the next thread in this team */
722     /* TODO replace with general release procedure */
723     team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
724 
725     KMP_MB(); /* Flush all pending memory write invalidates.  */
726   }
727 #endif /* BUILD_PARALLEL_ORDERED */
728 }
729 
730 /* ------------------------------------------------------------------------ */
/* The BARRIER for a SINGLE construct is always explicit */
732 
733 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
734   int status;
735   kmp_info_t *th;
736   kmp_team_t *team;
737 
738   if (!TCR_4(__kmp_init_parallel))
739     __kmp_parallel_initialize();
740 
741   th = __kmp_threads[gtid];
742   team = th->th.th_team;
743   status = 0;
744 
745   th->th.th_ident = id_ref;
746 
747   if (team->t.t_serialized) {
748     status = 1;
749   } else {
750     kmp_int32 old_this = th->th.th_local.this_construct;
751 
752     ++th->th.th_local.this_construct;
753     /* try to set team count to thread count--success means thread got the
754        single block */
755     /* TODO: Should this be acquire or release? */
756     if (team->t.t_construct == old_this) {
757       status = KMP_COMPARE_AND_STORE_ACQ32(&team->t.t_construct, old_this,
758                                            th->th.th_local.this_construct);
759     }
760 #if USE_ITT_BUILD
761     if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
762         KMP_MASTER_GTID(gtid) &&
763 #if OMP_40_ENABLED
764         th->th.th_teams_microtask == NULL &&
765 #endif
766         team->t.t_active_level ==
767             1) { // Only report metadata by master of active team at level 1
768       __kmp_itt_metadata_single(id_ref);
769     }
770 #endif /* USE_ITT_BUILD */
771   }
772 
773   if (__kmp_env_consistency_check) {
774     if (status && push_ws) {
775       __kmp_push_workshare(gtid, ct_psingle, id_ref);
776     } else {
777       __kmp_check_workshare(gtid, ct_psingle, id_ref);
778     }
779   }
780 #if USE_ITT_BUILD
781   if (status) {
782     __kmp_itt_single_start(gtid);
783   }
784 #endif /* USE_ITT_BUILD */
785   return status;
786 }
787 
788 void __kmp_exit_single(int gtid) {
789 #if USE_ITT_BUILD
790   __kmp_itt_single_end(gtid);
791 #endif /* USE_ITT_BUILD */
792   if (__kmp_env_consistency_check)
793     __kmp_pop_workshare(gtid, ct_psingle, NULL);
794 }
795 
/* Determine whether we can go parallel or must use a serialized parallel
 * region, and how many threads we can use.
 * set_nthreads is the number of threads requested for the team.
 * Returns 1 if we should serialize or use only one thread,
 * otherwise the number of threads to use.
 * The forkjoin lock is held by the caller. */
802 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
803                                  int master_tid, int set_nthreads
804 #if OMP_40_ENABLED
805                                  ,
806                                  int enter_teams
807 #endif /* OMP_40_ENABLED */
808                                  ) {
809   int capacity;
810   int new_nthreads;
811   KMP_DEBUG_ASSERT(__kmp_init_serial);
812   KMP_DEBUG_ASSERT(root && parent_team);
813 
814   // If dyn-var is set, dynamically adjust the number of desired threads,
815   // according to the method specified by dynamic_mode.
816   new_nthreads = set_nthreads;
817   if (!get__dynamic_2(parent_team, master_tid)) {
818     ;
819   }
820 #ifdef USE_LOAD_BALANCE
821   else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
822     new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
823     if (new_nthreads == 1) {
824       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
825                     "reservation to 1 thread\n",
826                     master_tid));
827       return 1;
828     }
829     if (new_nthreads < set_nthreads) {
830       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
831                     "reservation to %d threads\n",
832                     master_tid, new_nthreads));
833     }
834   }
835 #endif /* USE_LOAD_BALANCE */
836   else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
837     new_nthreads = __kmp_avail_proc - __kmp_nth +
838                    (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
839     if (new_nthreads <= 1) {
840       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
841                     "reservation to 1 thread\n",
842                     master_tid));
843       return 1;
844     }
845     if (new_nthreads < set_nthreads) {
846       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
847                     "reservation to %d threads\n",
848                     master_tid, new_nthreads));
849     } else {
850       new_nthreads = set_nthreads;
851     }
852   } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
853     if (set_nthreads > 2) {
854       new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
855       new_nthreads = (new_nthreads % set_nthreads) + 1;
856       if (new_nthreads == 1) {
857         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
858                       "reservation to 1 thread\n",
859                       master_tid));
860         return 1;
861       }
862       if (new_nthreads < set_nthreads) {
863         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
864                       "reservation to %d threads\n",
865                       master_tid, new_nthreads));
866       }
867     }
868   } else {
869     KMP_ASSERT(0);
870   }
871 
872   // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
873   if (__kmp_nth + new_nthreads -
874           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
875       __kmp_max_nth) {
876     int tl_nthreads = __kmp_max_nth - __kmp_nth +
877                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
878     if (tl_nthreads <= 0) {
879       tl_nthreads = 1;
880     }
881 
882     // If dyn-var is false, emit a 1-time warning.
883     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
884       __kmp_reserve_warn = 1;
885       __kmp_msg(kmp_ms_warning,
886                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
887                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
888     }
889     if (tl_nthreads == 1) {
890       KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
891                     "reduced reservation to 1 thread\n",
892                     master_tid));
893       return 1;
894     }
895     KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
896                   "reservation to %d threads\n",
897                   master_tid, tl_nthreads));
898     new_nthreads = tl_nthreads;
899   }
900 
901   // Respect OMP_THREAD_LIMIT
902   if (root->r.r_cg_nthreads + new_nthreads -
903           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
904       __kmp_cg_max_nth) {
905     int tl_nthreads = __kmp_cg_max_nth - root->r.r_cg_nthreads +
906                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
907     if (tl_nthreads <= 0) {
908       tl_nthreads = 1;
909     }
910 
911     // If dyn-var is false, emit a 1-time warning.
912     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
913       __kmp_reserve_warn = 1;
914       __kmp_msg(kmp_ms_warning,
915                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
916                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
917     }
918     if (tl_nthreads == 1) {
919       KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
920                     "reduced reservation to 1 thread\n",
921                     master_tid));
922       return 1;
923     }
924     KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
925                   "reservation to %d threads\n",
926                   master_tid, tl_nthreads));
927     new_nthreads = tl_nthreads;
928   }
929 
930   // Check if the threads array is large enough, or needs expanding.
931   // See comment in __kmp_register_root() about the adjustment if
932   // __kmp_threads[0] == NULL.
933   capacity = __kmp_threads_capacity;
934   if (TCR_PTR(__kmp_threads[0]) == NULL) {
935     --capacity;
936   }
937   if (__kmp_nth + new_nthreads -
938           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
939       capacity) {
940     // Expand the threads array.
941     int slotsRequired = __kmp_nth + new_nthreads -
942                         (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
943                         capacity;
944     int slotsAdded = __kmp_expand_threads(slotsRequired);
945     if (slotsAdded < slotsRequired) {
946       // The threads array was not expanded enough.
947       new_nthreads -= (slotsRequired - slotsAdded);
948       KMP_ASSERT(new_nthreads >= 1);
949 
950       // If dyn-var is false, emit a 1-time warning.
951       if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
952         __kmp_reserve_warn = 1;
953         if (__kmp_tp_cached) {
954           __kmp_msg(kmp_ms_warning,
955                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
956                     KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
957                     KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
958         } else {
959           __kmp_msg(kmp_ms_warning,
960                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
961                     KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
962         }
963       }
964     }
965   }
966 
967 #ifdef KMP_DEBUG
968   if (new_nthreads == 1) {
969     KC_TRACE(10,
970              ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
971               "dead roots and rechecking; requested %d threads\n",
972               __kmp_get_gtid(), set_nthreads));
973   } else {
974     KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
975                   " %d threads\n",
976                   __kmp_get_gtid(), new_nthreads, set_nthreads));
977   }
978 #endif // KMP_DEBUG
979   return new_nthreads;
980 }
981 
/* Allocate threads from the thread pool and assign them to the new team. We are
   assured that there are enough threads available, because we checked that
   earlier while holding the forkjoin critical section. */
985 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
986                                     kmp_info_t *master_th, int master_gtid) {
987   int i;
988   int use_hot_team;
989 
990   KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
991   KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
992   KMP_MB();
993 
994   /* first, let's setup the master thread */
995   master_th->th.th_info.ds.ds_tid = 0;
996   master_th->th.th_team = team;
997   master_th->th.th_team_nproc = team->t.t_nproc;
998   master_th->th.th_team_master = master_th;
999   master_th->th.th_team_serialized = FALSE;
1000   master_th->th.th_dispatch = &team->t.t_dispatch[0];
1001 
1002 /* make sure we are not the optimized hot team */
1003 #if KMP_NESTED_HOT_TEAMS
1004   use_hot_team = 0;
1005   kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
1006   if (hot_teams) { // hot teams array is not allocated if
1007     // KMP_HOT_TEAMS_MAX_LEVEL=0
1008     int level = team->t.t_active_level - 1; // index in array of hot teams
1009     if (master_th->th.th_teams_microtask) { // are we inside the teams?
1010       if (master_th->th.th_teams_size.nteams > 1) {
1011         ++level; // level was not increased in teams construct for
1012         // team_of_masters
1013       }
1014       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1015           master_th->th.th_teams_level == team->t.t_level) {
1016         ++level; // level was not increased in teams construct for
1017         // team_of_workers before the parallel
1018       } // team->t.t_level will be increased inside parallel
1019     }
1020     if (level < __kmp_hot_teams_max_level) {
1021       if (hot_teams[level].hot_team) {
1022         // hot team has already been allocated for given level
1023         KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
1024         use_hot_team = 1; // the team is ready to use
1025       } else {
1026         use_hot_team = 0; // AC: threads are not allocated yet
1027         hot_teams[level].hot_team = team; // remember new hot team
1028         hot_teams[level].hot_team_nth = team->t.t_nproc;
1029       }
1030     } else {
1031       use_hot_team = 0;
1032     }
1033   }
1034 #else
1035   use_hot_team = team == root->r.r_hot_team;
1036 #endif
1037   if (!use_hot_team) {
1038 
1039     /* install the master thread */
1040     team->t.t_threads[0] = master_th;
1041     __kmp_initialize_info(master_th, team, 0, master_gtid);
1042 
1043     /* now, install the worker threads */
1044     for (i = 1; i < team->t.t_nproc; i++) {
1045 
1046       /* fork or reallocate a new thread and install it in team */
1047       kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1048       team->t.t_threads[i] = thr;
1049       KMP_DEBUG_ASSERT(thr);
1050       KMP_DEBUG_ASSERT(thr->th.th_team == team);
1051       /* align team and thread arrived states */
1052       KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1053                     "T#%d(%d:%d) join =%llu, plain=%llu\n",
1054                     __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1055                     __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1056                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1057                     team->t.t_bar[bs_plain_barrier].b_arrived));
1058 #if OMP_40_ENABLED
1059       thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1060       thr->th.th_teams_level = master_th->th.th_teams_level;
1061       thr->th.th_teams_size = master_th->th.th_teams_size;
1062 #endif
1063       { // Initialize threads' barrier data.
1064         int b;
1065         kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1066         for (b = 0; b < bs_last_barrier; ++b) {
1067           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1068           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1069 #if USE_DEBUGGER
1070           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1071 #endif
1072         }
1073       }
1074     }
1075 
1076 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1077     __kmp_partition_places(team);
1078 #endif
1079   }
1080 
1081   KMP_MB();
1082 }
1083 
1084 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
// Propagate any changes to the floating point control registers out to the
// team. We try to avoid unnecessary writes to the relevant cache line in the
// team structure, so we don't make changes unless they are needed.
1088 inline static void propagateFPControl(kmp_team_t *team) {
1089   if (__kmp_inherit_fp_control) {
1090     kmp_int16 x87_fpu_control_word;
1091     kmp_uint32 mxcsr;
1092 
1093     // Get master values of FPU control flags (both X87 and vector)
1094     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1095     __kmp_store_mxcsr(&mxcsr);
1096     mxcsr &= KMP_X86_MXCSR_MASK;
1097 
1098     // There is no point looking at t_fp_control_saved here.
1099     // If it is TRUE, we still have to update the values if they are different
1100     // from those we now have. If it is FALSE we didn't save anything yet, but
1101     // our objective is the same. We have to ensure that the values in the team
1102     // are the same as those we have.
1103     // So, this code achieves what we need whether or not t_fp_control_saved is
1104     // true. By checking whether the value needs updating we avoid unnecessary
1105     // writes that would put the cache-line into a written state, causing all
1106     // threads in the team to have to read it again.
1107     KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1108     KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1109     // Although we don't use this value, other code in the runtime wants to know
1110     // whether it should restore them. So we must ensure it is correct.
1111     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1112   } else {
1113     // Similarly here. Don't write to this cache-line in the team structure
1114     // unless we have to.
1115     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1116   }
1117 }
1118 
1119 // Do the opposite, setting the hardware registers to the updated values from
1120 // the team.
1121 inline static void updateHWFPControl(kmp_team_t *team) {
1122   if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
    // Only reset the FP control regs if they have been changed in the team by
    // the parallel region that we are exiting.
1125     kmp_int16 x87_fpu_control_word;
1126     kmp_uint32 mxcsr;
1127     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1128     __kmp_store_mxcsr(&mxcsr);
1129     mxcsr &= KMP_X86_MXCSR_MASK;
1130 
1131     if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1132       __kmp_clear_x87_fpu_status_word();
1133       __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1134     }
1135 
1136     if (team->t.t_mxcsr != mxcsr) {
1137       __kmp_load_mxcsr(&team->t.t_mxcsr);
1138     }
1139   }
1140 }
1141 #else
1142 #define propagateFPControl(x) ((void)0)
1143 #define updateHWFPControl(x) ((void)0)
1144 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
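
// Hedged usage note: propagateFPControl() is intended to run on the master at
// fork time so that the team records the master's x87/MXCSR settings, and
// updateHWFPControl() when leaving the region, restoring the recorded values
// if the region changed them (see the fork/join paths later in this file). On
// other architectures both collapse to the no-op macros above.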
1145 
1146 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1147                                      int realloc); // forward declaration
1148 
1149 /* Run a parallel region that has been serialized, so runs only in a team of the
1150    single master thread. */
1151 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1152   kmp_info_t *this_thr;
1153   kmp_team_t *serial_team;
1154 
1155   KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1156 
1157   /* Skip all this code for autopar serialized loops since it results in
1158      unacceptable overhead */
1159   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1160     return;
1161 
1162   if (!TCR_4(__kmp_init_parallel))
1163     __kmp_parallel_initialize();
1164 
1165   this_thr = __kmp_threads[global_tid];
1166   serial_team = this_thr->th.th_serial_team;
1167 
1168   /* utilize the serialized team held by this thread */
1169   KMP_DEBUG_ASSERT(serial_team);
1170   KMP_MB();
1171 
1172   if (__kmp_tasking_mode != tskm_immediate_exec) {
1173     KMP_DEBUG_ASSERT(
1174         this_thr->th.th_task_team ==
1175         this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1176     KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1177                      NULL);
1178     KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1179                   "team %p, new task_team = NULL\n",
1180                   global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1181     this_thr->th.th_task_team = NULL;
1182   }
1183 
1184 #if OMP_40_ENABLED
1185   kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1186   if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1187     proc_bind = proc_bind_false;
1188   } else if (proc_bind == proc_bind_default) {
1189     // No proc_bind clause was specified, so use the current value
1190     // of proc-bind-var for this parallel region.
1191     proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1192   }
1193   // Reset for next parallel region
1194   this_thr->th.th_set_proc_bind = proc_bind_default;
1195 #endif /* OMP_40_ENABLED */
1196 
1197 #if OMPT_SUPPORT
1198   ompt_data_t ompt_parallel_data;
1199   ompt_parallel_data.ptr = NULL;
1200   ompt_data_t *implicit_task_data;
1201   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1202   if (ompt_enabled.enabled &&
1203       this_thr->th.ompt_thread_info.state != omp_state_overhead) {
1204 
1205     ompt_task_info_t *parent_task_info;
1206     parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1207 
1208     parent_task_info->frame.enter_frame = OMPT_GET_FRAME_ADDRESS(1);
1209     if (ompt_enabled.ompt_callback_parallel_begin) {
1210       int team_size = 1;
1211 
1212       ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1213           &(parent_task_info->task_data), &(parent_task_info->frame),
1214           &ompt_parallel_data, team_size, ompt_invoker_program, codeptr);
1215     }
1216   }
1217 #endif // OMPT_SUPPORT
1218 
1219   if (this_thr->th.th_team != serial_team) {
1220     // Nested level will be an index in the nested nthreads array
1221     int level = this_thr->th.th_team->t.t_level;
1222 
1223     if (serial_team->t.t_serialized) {
      /* this serial team was already used
         TODO: increase performance by making these locks more specific */
1226       kmp_team_t *new_team;
1227 
1228       __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1229 
1230       new_team = __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1231 #if OMPT_SUPPORT
1232                                      ompt_parallel_data,
1233 #endif
1234 #if OMP_40_ENABLED
1235                                      proc_bind,
1236 #endif
1237                                      &this_thr->th.th_current_task->td_icvs,
1238                                      0 USE_NESTED_HOT_ARG(NULL));
1239       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1240       KMP_ASSERT(new_team);
1241 
1242       /* setup new serialized team and install it */
1243       new_team->t.t_threads[0] = this_thr;
1244       new_team->t.t_parent = this_thr->th.th_team;
1245       serial_team = new_team;
1246       this_thr->th.th_serial_team = serial_team;
1247 
1248       KF_TRACE(
1249           10,
1250           ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1251            global_tid, serial_team));
1252 
1253       /* TODO the above breaks the requirement that if we run out of resources,
1254          then we can still guarantee that serialized teams are ok, since we may
1255          need to allocate a new one */
1256     } else {
1257       KF_TRACE(
1258           10,
1259           ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1260            global_tid, serial_team));
1261     }
1262 
1263     /* we have to initialize this serial team */
1264     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1265     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1266     KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1267     serial_team->t.t_ident = loc;
1268     serial_team->t.t_serialized = 1;
1269     serial_team->t.t_nproc = 1;
1270     serial_team->t.t_parent = this_thr->th.th_team;
1271     serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1272     this_thr->th.th_team = serial_team;
1273     serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1274 
    KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1276                   this_thr->th.th_current_task));
1277     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1278     this_thr->th.th_current_task->td_flags.executing = 0;
1279 
1280     __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1281 
1282     /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1283        implicit task for each serialized task represented by
1284        team->t.t_serialized? */
1285     copy_icvs(&this_thr->th.th_current_task->td_icvs,
1286               &this_thr->th.th_current_task->td_parent->td_icvs);
1287 
1288     // Thread value exists in the nested nthreads array for the next nested
1289     // level
1290     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1291       this_thr->th.th_current_task->td_icvs.nproc =
1292           __kmp_nested_nth.nth[level + 1];
1293     }
1294 
1295 #if OMP_40_ENABLED
1296     if (__kmp_nested_proc_bind.used &&
1297         (level + 1 < __kmp_nested_proc_bind.used)) {
1298       this_thr->th.th_current_task->td_icvs.proc_bind =
1299           __kmp_nested_proc_bind.bind_types[level + 1];
1300     }
1301 #endif /* OMP_40_ENABLED */
1302 
1303 #if USE_DEBUGGER
1304     serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1305 #endif
1306     this_thr->th.th_info.ds.ds_tid = 0;
1307 
1308     /* set thread cache values */
1309     this_thr->th.th_team_nproc = 1;
1310     this_thr->th.th_team_master = this_thr;
1311     this_thr->th.th_team_serialized = 1;
1312 
1313     serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1314     serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1315 
1316     propagateFPControl(serial_team);
1317 
1318     /* check if we need to allocate dispatch buffers stack */
1319     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1320     if (!serial_team->t.t_dispatch->th_disp_buffer) {
1321       serial_team->t.t_dispatch->th_disp_buffer =
1322           (dispatch_private_info_t *)__kmp_allocate(
1323               sizeof(dispatch_private_info_t));
1324     }
1325     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1326 
1327     KMP_MB();
1328 
1329   } else {
1330     /* this serialized team is already being used,
1331      * that's fine, just add another nested level */
1332     KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1333     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1334     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1335     ++serial_team->t.t_serialized;
1336     this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1337 
1338     // Nested level will be an index in the nested nthreads array
1339     int level = this_thr->th.th_team->t.t_level;
1340     // Thread value exists in the nested nthreads array for the next nested
1341     // level
1342     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1343       this_thr->th.th_current_task->td_icvs.nproc =
1344           __kmp_nested_nth.nth[level + 1];
1345     }
1346     serial_team->t.t_level++;
1347     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1348                   "of serial team %p to %d\n",
1349                   global_tid, serial_team, serial_team->t.t_level));
1350 
1351     /* allocate/push dispatch buffers stack */
1352     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1353     {
1354       dispatch_private_info_t *disp_buffer =
1355           (dispatch_private_info_t *)__kmp_allocate(
1356               sizeof(dispatch_private_info_t));
1357       disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1358       serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1359     }
1360     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1361 
1362     KMP_MB();
1363   }
1364 #if OMP_40_ENABLED
1365   KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1366 #endif
1367 
1368   if (__kmp_env_consistency_check)
1369     __kmp_push_parallel(global_tid, NULL);
1370 #if OMPT_SUPPORT
1371   serial_team->t.ompt_team_info.master_return_address = codeptr;
1372   if (ompt_enabled.enabled &&
1373       this_thr->th.ompt_thread_info.state != omp_state_overhead) {
1374     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = OMPT_GET_FRAME_ADDRESS(1);
1375 
1376     ompt_lw_taskteam_t lw_taskteam;
1377     __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1378                             &ompt_parallel_data, codeptr);
1379 
1380     __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
    // don't use lw_taskteam after linking. content was swapped
1382 
1383     /* OMPT implicit task begin */
1384     implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
1385     if (ompt_enabled.ompt_callback_implicit_task) {
1386       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1387           ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1388           OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid));
1389       OMPT_CUR_TASK_INFO(this_thr)
1390           ->thread_num = __kmp_tid_from_gtid(global_tid);
1391     }
1392 
1393     /* OMPT state */
1394     this_thr->th.ompt_thread_info.state = omp_state_work_parallel;
1395     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = OMPT_GET_FRAME_ADDRESS(1);
1396   }
1397 #endif
1398 }
1399 
1400 /* most of the work for a fork */
1401 /* return true if we really went parallel, false if serialized */
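/* Illustrative note (an assumption about typical compiler lowering, not a
   contract of this routine): user code such as

       #pragma omp parallel num_threads(4)
       { do_work(); }                       // do_work() is hypothetical

   is normally outlined into a microtask and lowered to a call into the
   runtime (e.g. __kmpc_fork_call), which reaches __kmp_fork_call below; a
   serialized region instead flows through __kmpc_serialized_parallel. */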
1402 int __kmp_fork_call(ident_t *loc, int gtid,
1403                     enum fork_context_e call_context, // Intel, GNU, ...
1404                     kmp_int32 argc, microtask_t microtask, launch_t invoker,
1405 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1406 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1407                     va_list *ap
1408 #else
1409                     va_list ap
1410 #endif
1411                     ) {
1412   void **argv;
1413   int i;
1414   int master_tid;
1415   int master_this_cons;
1416   kmp_team_t *team;
1417   kmp_team_t *parent_team;
1418   kmp_info_t *master_th;
1419   kmp_root_t *root;
1420   int nthreads;
1421   int master_active;
1422   int master_set_numthreads;
1423   int level;
1424 #if OMP_40_ENABLED
1425   int active_level;
1426   int teams_level;
1427 #endif
1428 #if KMP_NESTED_HOT_TEAMS
1429   kmp_hot_team_ptr_t **p_hot_teams;
1430 #endif
1431   { // KMP_TIME_BLOCK
1432     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1433     KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1434 
1435     KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1436     if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
      /* Some systems prefer the stack for the root thread(s) to start with
         some gap from the parent stack to prevent false sharing. */
1439       void *dummy = KMP_ALLOCA(__kmp_stkpadding);
      /* These two lines below exist so the alloca above does not get optimized out */
1441       if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1442         __kmp_stkpadding += (short)((kmp_int64)dummy);
1443     }
1444 
1445     /* initialize if needed */
1446     KMP_DEBUG_ASSERT(
1447         __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1448     if (!TCR_4(__kmp_init_parallel))
1449       __kmp_parallel_initialize();
1450 
1451     /* setup current data */
1452     master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1453     // shutdown
1454     parent_team = master_th->th.th_team;
1455     master_tid = master_th->th.th_info.ds.ds_tid;
1456     master_this_cons = master_th->th.th_local.this_construct;
1457     root = master_th->th.th_root;
1458     master_active = root->r.r_active;
1459     master_set_numthreads = master_th->th.th_set_nproc;
1460 
1461 #if OMPT_SUPPORT
1462     ompt_data_t ompt_parallel_data;
1463     ompt_parallel_data.ptr = NULL;
1464     ompt_data_t *parent_task_data;
1465     omp_frame_t *ompt_frame;
1466     ompt_data_t *implicit_task_data;
1467     void *return_address = NULL;
1468 
1469     if (ompt_enabled.enabled) {
1470       __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1471                                     NULL, NULL);
1472       return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1473     }
1474 #endif
1475 
1476     // Nested level will be an index in the nested nthreads array
1477     level = parent_team->t.t_level;
1478     // used to launch non-serial teams even if nested is not allowed
1479     active_level = parent_team->t.t_active_level;
1480 #if OMP_40_ENABLED
1481     // needed to check nesting inside the teams
1482     teams_level = master_th->th.th_teams_level;
1483 #endif
1484 #if KMP_NESTED_HOT_TEAMS
1485     p_hot_teams = &master_th->th.th_hot_teams;
1486     if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1487       *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1488           sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1489       (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
      // it is either the actual hot team or not needed (when active_level > 0)
1491       (*p_hot_teams)[0].hot_team_nth = 1;
1492     }
1493 #endif
1494 
1495 #if OMPT_SUPPORT
1496     if (ompt_enabled.enabled) {
1497       if (ompt_enabled.ompt_callback_parallel_begin) {
1498         int team_size = master_set_numthreads
1499                             ? master_set_numthreads
1500                             : get__nproc_2(parent_team, master_tid);
1501         ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1502             parent_task_data, ompt_frame, &ompt_parallel_data, team_size,
1503             OMPT_INVOKER(call_context), return_address);
1504       }
1505       master_th->th.ompt_thread_info.state = omp_state_overhead;
1506     }
1507 #endif
1508 
1509     master_th->th.th_ident = loc;
1510 
1511 #if OMP_40_ENABLED
1512     if (master_th->th.th_teams_microtask && ap &&
1513         microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
      // AC: This is the start of a parallel region nested inside a teams
      // construct. The team is actual (hot); all workers are ready at the fork
      // barrier. No lock is needed to initialize the team a bit, then release
      // the workers.
1517       parent_team->t.t_ident = loc;
1518       __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1519       parent_team->t.t_argc = argc;
1520       argv = (void **)parent_team->t.t_argv;
1521       for (i = argc - 1; i >= 0; --i)
1522 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1523 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1524         *argv++ = va_arg(*ap, void *);
1525 #else
1526         *argv++ = va_arg(ap, void *);
1527 #endif
      // Increment our nested depth level, but do not increase the serialization
1529       if (parent_team == master_th->th.th_serial_team) {
1530         // AC: we are in serialized parallel
1531         __kmpc_serialized_parallel(loc, gtid);
1532         KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
        // AC: need this in order for enquiry functions to work
        // correctly; will restore at join time
1535         parent_team->t.t_serialized--;
1536 #if OMPT_SUPPORT
1537         void *dummy;
1538         void **exit_runtime_p;
1539 
1540         ompt_lw_taskteam_t lw_taskteam;
1541 
1542         if (ompt_enabled.enabled) {
1543           __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1544                                   &ompt_parallel_data, return_address);
1545           exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_frame);
1546 
1547           __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
          // don't use lw_taskteam after linking. Content was swapped.
1549 
1550           /* OMPT implicit task begin */
1551           implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1552           if (ompt_enabled.ompt_callback_implicit_task) {
1553             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1554                 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1555                 implicit_task_data, 1, __kmp_tid_from_gtid(gtid));
1556             OMPT_CUR_TASK_INFO(master_th)
1557                 ->thread_num = __kmp_tid_from_gtid(gtid);
1558           }
1559 
1560           /* OMPT state */
1561           master_th->th.ompt_thread_info.state = omp_state_work_parallel;
1562         } else {
1563           exit_runtime_p = &dummy;
1564         }
1565 #endif
1566 
1567         {
1568           KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1569           KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1570           __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1571 #if OMPT_SUPPORT
1572                                  ,
1573                                  exit_runtime_p
1574 #endif
1575                                  );
1576         }
1577 
1578 #if OMPT_SUPPORT
1579         *exit_runtime_p = NULL;
1580         if (ompt_enabled.enabled) {
1581           OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = NULL;
1582           if (ompt_enabled.ompt_callback_implicit_task) {
1583             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1584                 ompt_scope_end, NULL, implicit_task_data, 1,
1585                 OMPT_CUR_TASK_INFO(master_th)->thread_num);
1586           }
1587           __ompt_lw_taskteam_unlink(master_th);
1588 
1589           if (ompt_enabled.ompt_callback_parallel_end) {
1590             ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1591                 OMPT_CUR_TEAM_DATA(master_th), OMPT_CUR_TASK_DATA(master_th),
1592                 OMPT_INVOKER(call_context), return_address);
1593           }
1594           master_th->th.ompt_thread_info.state = omp_state_overhead;
1595         }
1596 #endif
1597         return TRUE;
1598       }
1599 
1600       parent_team->t.t_pkfn = microtask;
1601       parent_team->t.t_invoke = invoker;
1602       KMP_TEST_THEN_INC32((kmp_int32 *)&root->r.r_in_parallel);
1603       parent_team->t.t_active_level++;
1604       parent_team->t.t_level++;
1605 
1606       /* Change number of threads in the team if requested */
1607       if (master_set_numthreads) { // The parallel has num_threads clause
1608         if (master_set_numthreads < master_th->th.th_teams_size.nth) {
          // AC: can only reduce the number of threads dynamically; can't increase
1610           kmp_info_t **other_threads = parent_team->t.t_threads;
1611           parent_team->t.t_nproc = master_set_numthreads;
1612           for (i = 0; i < master_set_numthreads; ++i) {
1613             other_threads[i]->th.th_team_nproc = master_set_numthreads;
1614           }
1615           // Keep extra threads hot in the team for possible next parallels
1616         }
1617         master_th->th.th_set_nproc = 0;
1618       }
1619 
1620 #if USE_DEBUGGER
1621       if (__kmp_debugging) { // Let debugger override number of threads.
1622         int nth = __kmp_omp_num_threads(loc);
1623         if (nth > 0) { // 0 means debugger doesn't want to change num threads
1624           master_set_numthreads = nth;
1625         }
1626       }
1627 #endif
1628 
1629       KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1630                     "master_th=%p, gtid=%d\n",
1631                     root, parent_team, master_th, gtid));
1632       __kmp_internal_fork(loc, gtid, parent_team);
1633       KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1634                     "master_th=%p, gtid=%d\n",
1635                     root, parent_team, master_th, gtid));
1636 
1637       /* Invoke microtask for MASTER thread */
1638       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1639                     parent_team->t.t_id, parent_team->t.t_pkfn));
1640 
1641       {
1642         KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1643         KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1644         if (!parent_team->t.t_invoke(gtid)) {
1645           KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
1646         }
1647       }
1648       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1649                     parent_team->t.t_id, parent_team->t.t_pkfn));
1650       KMP_MB(); /* Flush all pending memory write invalidates.  */
1651 
1652       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1653 
1654       return TRUE;
1655     } // Parallel closely nested in teams construct
1656 #endif /* OMP_40_ENABLED */
1657 
1658 #if KMP_DEBUG
1659     if (__kmp_tasking_mode != tskm_immediate_exec) {
1660       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1661                        parent_team->t.t_task_team[master_th->th.th_task_state]);
1662     }
1663 #endif
1664 
1665     if (parent_team->t.t_active_level >=
1666         master_th->th.th_current_task->td_icvs.max_active_levels) {
1667       nthreads = 1;
1668     } else {
1669 #if OMP_40_ENABLED
1670       int enter_teams = ((ap == NULL && active_level == 0) ||
1671                          (ap && teams_level > 0 && teams_level == level));
1672 #endif
1673       nthreads =
1674           master_set_numthreads
1675               ? master_set_numthreads
1676               : get__nproc_2(
1677                     parent_team,
1678                     master_tid); // TODO: get nproc directly from current task
1679 
      // Check if we need to take the forkjoin lock (no need for a serialized
      // parallel outside of a teams construct). This code was moved here from
      // __kmp_reserve_threads() to speed up nested serialized parallels.
1683       if (nthreads > 1) {
1684         if ((!get__nested(master_th) && (root->r.r_in_parallel
1685 #if OMP_40_ENABLED
1686                                          && !enter_teams
1687 #endif /* OMP_40_ENABLED */
1688                                          )) ||
1689             (__kmp_library == library_serial)) {
1690           KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1691                         " threads\n",
1692                         gtid, nthreads));
1693           nthreads = 1;
1694         }
1695       }
1696       if (nthreads > 1) {
1697         /* determine how many new threads we can use */
1698         __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1699         nthreads = __kmp_reserve_threads(
1700             root, parent_team, master_tid, nthreads
1701 #if OMP_40_ENABLED
            /* AC: If we execute teams from a parallel region (on the host),
               then the teams should be created, but each can have only 1
               thread if nesting is disabled. If teams is called from a serial
               region, then the teams and their threads should be created
               regardless of the nesting setting. */
1707             ,
1708             enter_teams
1709 #endif /* OMP_40_ENABLED */
1710             );
1711         if (nthreads == 1) {
          // Free the lock for single-thread execution here; for multi-thread
          // execution it will be freed later, after the team of threads has
          // been created and initialized
1715           __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1716         }
1717       }
1718     }
1719     KMP_DEBUG_ASSERT(nthreads > 0);
1720 
1721     // If we temporarily changed the set number of threads then restore it now
1722     master_th->th.th_set_nproc = 0;
1723 
1724     /* create a serialized parallel region? */
1725     if (nthreads == 1) {
1726 /* josh todo: hypothetical question: what do we do for OS X*? */
1727 #if KMP_OS_LINUX &&                                                            \
1728     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1729       void *args[argc];
1730 #else
1731       void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1732 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1733           KMP_ARCH_AARCH64) */
1734 
1735       KA_TRACE(20,
1736                ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1737 
1738       __kmpc_serialized_parallel(loc, gtid);
1739 
1740       if (call_context == fork_context_intel) {
1741         /* TODO this sucks, use the compiler itself to pass args! :) */
1742         master_th->th.th_serial_team->t.t_ident = loc;
1743 #if OMP_40_ENABLED
1744         if (!ap) {
1745           // revert change made in __kmpc_serialized_parallel()
1746           master_th->th.th_serial_team->t.t_level--;
1747 // Get args from parent team for teams construct
1748 
1749 #if OMPT_SUPPORT
1750           void *dummy;
1751           void **exit_runtime_p;
1752           ompt_task_info_t *task_info;
1753 
1754           ompt_lw_taskteam_t lw_taskteam;
1755 
1756           if (ompt_enabled.enabled) {
1757             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1758                                     &ompt_parallel_data, return_address);
1759 
1760             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
            // don't use lw_taskteam after linking. Content was swapped.
1762 
1763             task_info = OMPT_CUR_TASK_INFO(master_th);
1764             exit_runtime_p = &(task_info->frame.exit_frame);
1765             if (ompt_enabled.ompt_callback_implicit_task) {
1766               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1767                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1768                   &(task_info->task_data), 1, __kmp_tid_from_gtid(gtid));
1769               OMPT_CUR_TASK_INFO(master_th)
1770                   ->thread_num = __kmp_tid_from_gtid(gtid);
1771             }
1772 
1773             /* OMPT state */
1774             master_th->th.ompt_thread_info.state = omp_state_work_parallel;
1775           } else {
1776             exit_runtime_p = &dummy;
1777           }
1778 #endif
1779 
1780           {
1781             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1782             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1783             __kmp_invoke_microtask(microtask, gtid, 0, argc,
1784                                    parent_team->t.t_argv
1785 #if OMPT_SUPPORT
1786                                    ,
1787                                    exit_runtime_p
1788 #endif
1789                                    );
1790           }
1791 
1792 #if OMPT_SUPPORT
1793           if (ompt_enabled.enabled) {
1794             exit_runtime_p = NULL;
1795             if (ompt_enabled.ompt_callback_implicit_task) {
1796               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1797                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1798                   OMPT_CUR_TASK_INFO(master_th)->thread_num);
1799             }
1800 
1801             __ompt_lw_taskteam_unlink(master_th);
1802             if (ompt_enabled.ompt_callback_parallel_end) {
1803               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1804                   OMPT_CUR_TEAM_DATA(master_th), parent_task_data,
1805                   OMPT_INVOKER(call_context), return_address);
1806             }
1807             master_th->th.ompt_thread_info.state = omp_state_overhead;
1808           }
1809 #endif
1810         } else if (microtask == (microtask_t)__kmp_teams_master) {
1811           KMP_DEBUG_ASSERT(master_th->th.th_team ==
1812                            master_th->th.th_serial_team);
1813           team = master_th->th.th_team;
1814           // team->t.t_pkfn = microtask;
1815           team->t.t_invoke = invoker;
1816           __kmp_alloc_argv_entries(argc, team, TRUE);
1817           team->t.t_argc = argc;
1818           argv = (void **)team->t.t_argv;
1819           if (ap) {
1820             for (i = argc - 1; i >= 0; --i)
1821 // TODO: revert workaround for Intel(R) 64 tracker #96
1822 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1823               *argv++ = va_arg(*ap, void *);
1824 #else
1825               *argv++ = va_arg(ap, void *);
1826 #endif
1827           } else {
1828             for (i = 0; i < argc; ++i)
1829               // Get args from parent team for teams construct
1830               argv[i] = parent_team->t.t_argv[i];
1831           }
1832           // AC: revert change made in __kmpc_serialized_parallel()
1833           //     because initial code in teams should have level=0
1834           team->t.t_level--;
1835           // AC: call special invoker for outer "parallel" of teams construct
1836           {
1837             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1838             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1839             invoker(gtid);
1840           }
1841         } else {
1842 #endif /* OMP_40_ENABLED */
1843           argv = args;
1844           for (i = argc - 1; i >= 0; --i)
1845 // TODO: revert workaround for Intel(R) 64 tracker #96
1846 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1847             *argv++ = va_arg(*ap, void *);
1848 #else
1849           *argv++ = va_arg(ap, void *);
1850 #endif
1851           KMP_MB();
1852 
1853 #if OMPT_SUPPORT
1854           void *dummy;
1855           void **exit_runtime_p;
1856           ompt_task_info_t *task_info;
1857 
1858           ompt_lw_taskteam_t lw_taskteam;
1859 
1860           if (ompt_enabled.enabled) {
1861             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1862                                     &ompt_parallel_data, return_address);
1863             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
            // don't use lw_taskteam after linking. Content was swapped.
1865             task_info = OMPT_CUR_TASK_INFO(master_th);
1866             exit_runtime_p = &(task_info->frame.exit_frame);
1867 
1868             /* OMPT implicit task begin */
1869             implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1870             if (ompt_enabled.ompt_callback_implicit_task) {
1871               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1872                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1873                   implicit_task_data, 1, __kmp_tid_from_gtid(gtid));
1874               OMPT_CUR_TASK_INFO(master_th)
1875                   ->thread_num = __kmp_tid_from_gtid(gtid);
1876             }
1877 
1878             /* OMPT state */
1879             master_th->th.ompt_thread_info.state = omp_state_work_parallel;
1880           } else {
1881             exit_runtime_p = &dummy;
1882           }
1883 #endif
1884 
1885           {
1886             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1887             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1888             __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1889 #if OMPT_SUPPORT
1890                                    ,
1891                                    exit_runtime_p
1892 #endif
1893                                    );
1894           }
1895 
1896 #if OMPT_SUPPORT
1897           if (ompt_enabled.enabled) {
1898             *exit_runtime_p = NULL;
1899             if (ompt_enabled.ompt_callback_implicit_task) {
1900               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1901                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1902                   OMPT_CUR_TASK_INFO(master_th)->thread_num);
1903             }
1904 
1905             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1906             __ompt_lw_taskteam_unlink(master_th);
1907             if (ompt_enabled.ompt_callback_parallel_end) {
1908               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1909                   &ompt_parallel_data, parent_task_data,
1910                   OMPT_INVOKER(call_context), return_address);
1911             }
1912             master_th->th.ompt_thread_info.state = omp_state_overhead;
1913           }
1914 #endif
1915 #if OMP_40_ENABLED
1916         }
1917 #endif /* OMP_40_ENABLED */
1918       } else if (call_context == fork_context_gnu) {
1919 #if OMPT_SUPPORT
1920         ompt_lw_taskteam_t lwt;
1921         __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1922                                 return_address);
1923 
1924         lwt.ompt_task_info.frame.exit_frame = NULL;
1925         __ompt_lw_taskteam_link(&lwt, master_th, 1);
// don't use lw_taskteam after linking. Content was swapped.
1927 #endif
1928 
1929         // we were called from GNU native code
1930         KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1931         return FALSE;
1932       } else {
1933         KMP_ASSERT2(call_context < fork_context_last,
1934                     "__kmp_fork_call: unknown fork_context parameter");
1935       }
1936 
1937       KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1938       KMP_MB();
1939       return FALSE;
1940     }
1941 
1942     // GEH: only modify the executing flag in the case when not serialized
1943     //      serialized case is handled in kmpc_serialized_parallel
1944     KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1945                   "curtask=%p, curtask_max_aclevel=%d\n",
1946                   parent_team->t.t_active_level, master_th,
1947                   master_th->th.th_current_task,
1948                   master_th->th.th_current_task->td_icvs.max_active_levels));
1949     // TODO: GEH - cannot do this assertion because root thread not set up as
1950     // executing
1951     // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1952     master_th->th.th_current_task->td_flags.executing = 0;
1953 
1954 #if OMP_40_ENABLED
1955     if (!master_th->th.th_teams_microtask || level > teams_level)
1956 #endif /* OMP_40_ENABLED */
1957     {
1958       /* Increment our nested depth level */
1959       KMP_TEST_THEN_INC32((kmp_int32 *)&root->r.r_in_parallel);
1960     }
1961 
1962     // See if we need to make a copy of the ICVs.
1963     int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1964     if ((level + 1 < __kmp_nested_nth.used) &&
1965         (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1966       nthreads_icv = __kmp_nested_nth.nth[level + 1];
1967     } else {
1968       nthreads_icv = 0; // don't update
1969     }
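    // A hedged sketch of the intent (assuming the list form of
    // OMP_NUM_THREADS): with OMP_NUM_THREADS="4,3,2" the nested nthreads
    // array holds {4, 3, 2}; the current fork uses the inherited nproc ICV,
    // while the entry for level+1, if present and different, is copied into
    // the new team's ICVs so that the next nesting level picks it up.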
1970 
1971 #if OMP_40_ENABLED
1972     // Figure out the proc_bind_policy for the new team.
1973     kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1974     kmp_proc_bind_t proc_bind_icv =
1975         proc_bind_default; // proc_bind_default means don't update
1976     if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1977       proc_bind = proc_bind_false;
1978     } else {
1979       if (proc_bind == proc_bind_default) {
1980         // No proc_bind clause specified; use current proc-bind-var for this
1981         // parallel region
1982         proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1983       }
1984       /* else: The proc_bind policy was specified explicitly on parallel clause.
1985          This overrides proc-bind-var for this parallel region, but does not
1986          change proc-bind-var. */
1987       // Figure the value of proc-bind-var for the child threads.
1988       if ((level + 1 < __kmp_nested_proc_bind.used) &&
1989           (__kmp_nested_proc_bind.bind_types[level + 1] !=
1990            master_th->th.th_current_task->td_icvs.proc_bind)) {
1991         proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1992       }
1993     }
1994 
1995     // Reset for next parallel region
1996     master_th->th.th_set_proc_bind = proc_bind_default;
1997 #endif /* OMP_40_ENABLED */
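    // For reference (illustrative, hedged): a clause such as
    //     #pragma omp parallel proc_bind(spread)
    // arrives here via th_set_proc_bind and overrides proc-bind-var for this
    // region only, while a nesting list like OMP_PROC_BIND="spread,close" is
    // consulted through __kmp_nested_proc_bind for the child threads' ICV.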
1998 
1999     if ((nthreads_icv > 0)
2000 #if OMP_40_ENABLED
2001         || (proc_bind_icv != proc_bind_default)
2002 #endif /* OMP_40_ENABLED */
2003             ) {
2004       kmp_internal_control_t new_icvs;
2005       copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2006       new_icvs.next = NULL;
2007       if (nthreads_icv > 0) {
2008         new_icvs.nproc = nthreads_icv;
2009       }
2010 
2011 #if OMP_40_ENABLED
2012       if (proc_bind_icv != proc_bind_default) {
2013         new_icvs.proc_bind = proc_bind_icv;
2014       }
2015 #endif /* OMP_40_ENABLED */
2016 
2017       /* allocate a new parallel team */
2018       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2019       team = __kmp_allocate_team(root, nthreads, nthreads,
2020 #if OMPT_SUPPORT
2021                                  ompt_parallel_data,
2022 #endif
2023 #if OMP_40_ENABLED
2024                                  proc_bind,
2025 #endif
2026                                  &new_icvs, argc USE_NESTED_HOT_ARG(master_th));
2027     } else {
2028       /* allocate a new parallel team */
2029       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2030       team = __kmp_allocate_team(root, nthreads, nthreads,
2031 #if OMPT_SUPPORT
2032                                  ompt_parallel_data,
2033 #endif
2034 #if OMP_40_ENABLED
2035                                  proc_bind,
2036 #endif
2037                                  &master_th->th.th_current_task->td_icvs,
2038                                  argc USE_NESTED_HOT_ARG(master_th));
2039     }
2040     KF_TRACE(
2041         10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2042 
2043     /* setup the new team */
2044     KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2045     KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2046     KMP_CHECK_UPDATE(team->t.t_ident, loc);
2047     KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2048     KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2049 #if OMPT_SUPPORT
2050     KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2051                           return_address);
2052 #endif
2053     KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2054 // TODO: parent_team->t.t_level == INT_MAX ???
2055 #if OMP_40_ENABLED
2056     if (!master_th->th.th_teams_microtask || level > teams_level) {
2057 #endif /* OMP_40_ENABLED */
2058       int new_level = parent_team->t.t_level + 1;
2059       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2060       new_level = parent_team->t.t_active_level + 1;
2061       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2062 #if OMP_40_ENABLED
2063     } else {
2064       // AC: Do not increase parallel level at start of the teams construct
2065       int new_level = parent_team->t.t_level;
2066       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2067       new_level = parent_team->t.t_active_level;
2068       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2069     }
2070 #endif /* OMP_40_ENABLED */
2071     kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2072     // set master's schedule as new run-time schedule
2073     KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2074 
2075 #if OMP_40_ENABLED
2076     KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2077 #endif
2078 
2079     // Update the floating point rounding in the team if required.
2080     propagateFPControl(team);
2081 
2082     if (__kmp_tasking_mode != tskm_immediate_exec) {
      // Set master's task team to the team's task team. Unless this is a hot
      // team, it should be NULL.
2085       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2086                        parent_team->t.t_task_team[master_th->th.th_task_state]);
2087       KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
2088                     "%p, new task_team %p / team %p\n",
2089                     __kmp_gtid_from_thread(master_th),
2090                     master_th->th.th_task_team, parent_team,
2091                     team->t.t_task_team[master_th->th.th_task_state], team));
2092 
2093       if (active_level || master_th->th.th_task_team) {
2094         // Take a memo of master's task_state
2095         KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2096         if (master_th->th.th_task_state_top >=
2097             master_th->th.th_task_state_stack_sz) { // increase size
2098           kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2099           kmp_uint8 *old_stack, *new_stack;
2100           kmp_uint32 i;
2101           new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2102           for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2103             new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2104           }
2105           for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2106                ++i) { // zero-init rest of stack
2107             new_stack[i] = 0;
2108           }
2109           old_stack = master_th->th.th_task_state_memo_stack;
2110           master_th->th.th_task_state_memo_stack = new_stack;
2111           master_th->th.th_task_state_stack_sz = new_size;
2112           __kmp_free(old_stack);
2113         }
2114         // Store master's task_state on stack
2115         master_th->th
2116             .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2117             master_th->th.th_task_state;
2118         master_th->th.th_task_state_top++;
2119 #if KMP_NESTED_HOT_TEAMS
2120         if (team == master_th->th.th_hot_teams[active_level].hot_team) {
2121           // Restore master's nested state if nested hot team
2122           master_th->th.th_task_state =
2123               master_th->th
2124                   .th_task_state_memo_stack[master_th->th.th_task_state_top];
2125         } else {
2126 #endif
2127           master_th->th.th_task_state = 0;
2128 #if KMP_NESTED_HOT_TEAMS
2129         }
2130 #endif
2131       }
2132 #if !KMP_NESTED_HOT_TEAMS
2133       KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2134                        (team == root->r.r_hot_team));
2135 #endif
2136     }
2137 
2138     KA_TRACE(
2139         20,
2140         ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2141          gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2142          team->t.t_nproc));
2143     KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2144                      (team->t.t_master_tid == 0 &&
2145                       (team->t.t_parent == root->r.r_root_team ||
2146                        team->t.t_parent->t.t_serialized)));
2147     KMP_MB();
2148 
2149     /* now, setup the arguments */
2150     argv = (void **)team->t.t_argv;
2151 #if OMP_40_ENABLED
2152     if (ap) {
2153 #endif /* OMP_40_ENABLED */
2154       for (i = argc - 1; i >= 0; --i) {
2155 // TODO: revert workaround for Intel(R) 64 tracker #96
2156 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
2157         void *new_argv = va_arg(*ap, void *);
2158 #else
2159       void *new_argv = va_arg(ap, void *);
2160 #endif
2161         KMP_CHECK_UPDATE(*argv, new_argv);
2162         argv++;
2163       }
2164 #if OMP_40_ENABLED
2165     } else {
2166       for (i = 0; i < argc; ++i) {
2167         // Get args from parent team for teams construct
2168         KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2169       }
2170     }
2171 #endif /* OMP_40_ENABLED */
2172 
2173     /* now actually fork the threads */
2174     KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2175     if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2176       root->r.r_active = TRUE;
2177 
2178     __kmp_fork_team_threads(root, team, master_th, gtid);
2179     __kmp_setup_icv_copy(team, nthreads,
2180                          &master_th->th.th_current_task->td_icvs, loc);
2181 
2182 #if OMPT_SUPPORT
2183     master_th->th.ompt_thread_info.state = omp_state_work_parallel;
2184 #endif
2185 
2186     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2187 
2188 #if USE_ITT_BUILD
2189     if (team->t.t_active_level == 1 // only report frames at level 1
2190 #if OMP_40_ENABLED
2191         && !master_th->th.th_teams_microtask // not in teams construct
2192 #endif /* OMP_40_ENABLED */
2193         ) {
2194 #if USE_ITT_NOTIFY
2195       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2196           (__kmp_forkjoin_frames_mode == 3 ||
2197            __kmp_forkjoin_frames_mode == 1)) {
2198         kmp_uint64 tmp_time = 0;
2199         if (__itt_get_timestamp_ptr)
2200           tmp_time = __itt_get_timestamp();
2201         // Internal fork - report frame begin
2202         master_th->th.th_frame_time = tmp_time;
2203         if (__kmp_forkjoin_frames_mode == 3)
2204           team->t.t_region_time = tmp_time;
2205       } else
2206 // only one notification scheme (either "submit" or "forking/joined", not both)
2207 #endif /* USE_ITT_NOTIFY */
2208           if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2209               __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2210         // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2211         __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2212       }
2213     }
2214 #endif /* USE_ITT_BUILD */
2215 
2216     /* now go on and do the work */
2217     KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2218     KMP_MB();
2219     KF_TRACE(10,
2220              ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2221               root, team, master_th, gtid));
2222 
2223 #if USE_ITT_BUILD
2224     if (__itt_stack_caller_create_ptr) {
2225       team->t.t_stack_id =
2226           __kmp_itt_stack_caller_create(); // create new stack stitching id
2227       // before entering fork barrier
2228     }
2229 #endif /* USE_ITT_BUILD */
2230 
2231 #if OMP_40_ENABLED
    // AC: skip __kmp_internal_fork for the teams construct; let only the
    // master threads execute
2234     if (ap)
2235 #endif /* OMP_40_ENABLED */
2236     {
2237       __kmp_internal_fork(loc, gtid, team);
2238       KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2239                     "master_th=%p, gtid=%d\n",
2240                     root, team, master_th, gtid));
2241     }
2242 
2243     if (call_context == fork_context_gnu) {
2244       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2245       return TRUE;
2246     }
2247 
2248     /* Invoke microtask for MASTER thread */
2249     KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2250                   team->t.t_id, team->t.t_pkfn));
2251   } // END of timer KMP_fork_call block
2252 
2253   {
2254     KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
2255     KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
2256     if (!team->t.t_invoke(gtid)) {
2257       KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
2258     }
2259   }
2260   KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2261                 team->t.t_id, team->t.t_pkfn));
2262   KMP_MB(); /* Flush all pending memory write invalidates.  */
2263 
2264   KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2265 
2266 #if OMPT_SUPPORT
2267   if (ompt_enabled.enabled) {
2268     master_th->th.ompt_thread_info.state = omp_state_overhead;
2269   }
2270 #endif
2271 
2272   return TRUE;
2273 }
2274 
2275 #if OMPT_SUPPORT
2276 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2277                                             kmp_team_t *team) {
2278   // restore state outside the region
2279   thread->th.ompt_thread_info.state =
2280       ((team->t.t_serialized) ? omp_state_work_serial
2281                               : omp_state_work_parallel);
2282 }
2283 
2284 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2285                                    kmp_team_t *team, ompt_data_t *parallel_data,
2286                                    fork_context_e fork_context, void *codeptr) {
2287   ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2288   if (ompt_enabled.ompt_callback_parallel_end) {
2289     ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2290         parallel_data, &(task_info->task_data), OMPT_INVOKER(fork_context),
2291         codeptr);
2292   }
2293 
2294   task_info->frame.enter_frame = NULL;
2295   __kmp_join_restore_state(thread, team);
2296 }
2297 #endif
2298 
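/* Counterpart of __kmp_fork_call: performs the join at the end of a parallel
   region. A hedged sketch of the user-level view (do_work() is hypothetical
   user code, not part of this file):

       #pragma omp parallel
       { do_work(); }   // once the outlined body returns, the runtime's
                        // join path eventually reaches __kmp_join_call

   The master waits for the workers at the join barrier, frees or shrinks the
   team as appropriate, and restores its own state in the parent team. */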
2299 void __kmp_join_call(ident_t *loc, int gtid
2300 #if OMPT_SUPPORT
2301                      ,
2302                      enum fork_context_e fork_context
2303 #endif
2304 #if OMP_40_ENABLED
2305                      ,
2306                      int exit_teams
2307 #endif /* OMP_40_ENABLED */
2308                      ) {
2309   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2310   kmp_team_t *team;
2311   kmp_team_t *parent_team;
2312   kmp_info_t *master_th;
2313   kmp_root_t *root;
2314   int master_active;
2315   int i;
2316 
2317   KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2318 
2319   /* setup current data */
2320   master_th = __kmp_threads[gtid];
2321   root = master_th->th.th_root;
2322   team = master_th->th.th_team;
2323   parent_team = team->t.t_parent;
2324 
2325   master_th->th.th_ident = loc;
2326 
2327 #if OMPT_SUPPORT
2328   if (ompt_enabled.enabled) {
2329     master_th->th.ompt_thread_info.state = omp_state_overhead;
2330   }
2331 #endif
2332 
2333 #if KMP_DEBUG
2334   if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2335     KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2336                   "th_task_team = %p\n",
2337                   __kmp_gtid_from_thread(master_th), team,
2338                   team->t.t_task_team[master_th->th.th_task_state],
2339                   master_th->th.th_task_team));
2340     KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2341                      team->t.t_task_team[master_th->th.th_task_state]);
2342   }
2343 #endif
2344 
2345   if (team->t.t_serialized) {
2346 #if OMP_40_ENABLED
2347     if (master_th->th.th_teams_microtask) {
2348       // We are in teams construct
2349       int level = team->t.t_level;
2350       int tlevel = master_th->th.th_teams_level;
2351       if (level == tlevel) {
2352         // AC: we haven't incremented it earlier at start of teams construct,
2353         //     so do it here - at the end of teams construct
2354         team->t.t_level++;
2355       } else if (level == tlevel + 1) {
2356         // AC: we are exiting parallel inside teams, need to increment
2357         // serialization in order to restore it in the next call to
2358         // __kmpc_end_serialized_parallel
2359         team->t.t_serialized++;
2360       }
2361     }
2362 #endif /* OMP_40_ENABLED */
2363     __kmpc_end_serialized_parallel(loc, gtid);
2364 
2365 #if OMPT_SUPPORT
2366     if (ompt_enabled.enabled) {
2367       __kmp_join_restore_state(master_th, parent_team);
2368     }
2369 #endif
2370 
2371     return;
2372   }
2373 
2374   master_active = team->t.t_master_active;
2375 
2376 #if OMP_40_ENABLED
2377   if (!exit_teams)
2378 #endif /* OMP_40_ENABLED */
2379   {
    // AC: No barrier for internal teams at exit from the teams construct,
    //     but there is a barrier for the external team (league).
2382     __kmp_internal_join(loc, gtid, team);
2383   }
2384 #if OMP_40_ENABLED
2385   else {
2386     master_th->th.th_task_state =
2387         0; // AC: no tasking in teams (out of any parallel)
2388   }
2389 #endif /* OMP_40_ENABLED */
2390 
2391   KMP_MB();
2392 
2393 #if OMPT_SUPPORT
2394   ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2395   void *codeptr = team->t.ompt_team_info.master_return_address;
2396 #endif
2397 
2398 #if USE_ITT_BUILD
2399   if (__itt_stack_caller_create_ptr) {
2400     __kmp_itt_stack_caller_destroy(
2401         (__itt_caller)team->t
2402             .t_stack_id); // destroy the stack stitching id after join barrier
2403   }
2404 
2405   // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2406   if (team->t.t_active_level == 1
2407 #if OMP_40_ENABLED
2408       && !master_th->th.th_teams_microtask /* not in teams construct */
2409 #endif /* OMP_40_ENABLED */
2410       ) {
2411     master_th->th.th_ident = loc;
2412     // only one notification scheme (either "submit" or "forking/joined", not
2413     // both)
2414     if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2415         __kmp_forkjoin_frames_mode == 3)
2416       __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2417                              master_th->th.th_frame_time, 0, loc,
2418                              master_th->th.th_team_nproc, 1);
2419     else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2420              !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2421       __kmp_itt_region_joined(gtid);
2422   } // active_level == 1
2423 #endif /* USE_ITT_BUILD */
2424 
2425 #if OMP_40_ENABLED
2426   if (master_th->th.th_teams_microtask && !exit_teams &&
2427       team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2428       team->t.t_level == master_th->th.th_teams_level + 1) {
    // AC: We need to leave the team structure intact at the end of a parallel
    // inside the teams construct, so that the same (hot) team works at the
    // next parallel; only adjust the nesting levels
2432 
2433     /* Decrement our nested depth level */
2434     team->t.t_level--;
2435     team->t.t_active_level--;
2436     KMP_TEST_THEN_DEC32((kmp_int32 *)&root->r.r_in_parallel);
2437 
2438     /* Restore number of threads in the team if needed */
2439     if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2440       int old_num = master_th->th.th_team_nproc;
2441       int new_num = master_th->th.th_teams_size.nth;
2442       kmp_info_t **other_threads = team->t.t_threads;
2443       team->t.t_nproc = new_num;
2444       for (i = 0; i < old_num; ++i) {
2445         other_threads[i]->th.th_team_nproc = new_num;
2446       }
      // Adjust the state of the unused threads of the team
2448       for (i = old_num; i < new_num; ++i) {
2449         // Re-initialize thread's barrier data.
2450         int b;
2451         kmp_balign_t *balign = other_threads[i]->th.th_bar;
2452         for (b = 0; b < bs_last_barrier; ++b) {
2453           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2454           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2455 #if USE_DEBUGGER
2456           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2457 #endif
2458         }
2459         if (__kmp_tasking_mode != tskm_immediate_exec) {
2460           // Synchronize thread's task state
2461           other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2462         }
2463       }
2464     }
2465 
2466 #if OMPT_SUPPORT
2467     if (ompt_enabled.enabled) {
2468       __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
2469                       codeptr);
2470     }
2471 #endif
2472 
2473     return;
2474   }
2475 #endif /* OMP_40_ENABLED */
2476 
2477   /* do cleanup and restore the parent team */
2478   master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2479   master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2480 
2481   master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2482 
2483   /* jc: The following lock has instructions with REL and ACQ semantics,
2484      separating the parallel user code called in this parallel region
2485      from the serial user code called after this function returns. */
2486   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2487 
2488 #if OMP_40_ENABLED
2489   if (!master_th->th.th_teams_microtask ||
2490       team->t.t_level > master_th->th.th_teams_level)
2491 #endif /* OMP_40_ENABLED */
2492   {
2493     /* Decrement our nested depth level */
2494     KMP_TEST_THEN_DEC32((kmp_int32 *)&root->r.r_in_parallel);
2495   }
2496   KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2497 
2498 #if OMPT_SUPPORT
2499   if (ompt_enabled.enabled) {
2500     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2501     if (ompt_enabled.ompt_callback_implicit_task) {
2502       int ompt_team_size = team->t.t_nproc;
2503       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2504           ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2505           OMPT_CUR_TASK_INFO(master_th)->thread_num);
2506     }
2507 
2508     task_info->frame.exit_frame = NULL;
2509     task_info->task_data = ompt_data_none;
2510   }
2511 #endif
2512 
2513   KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2514                 master_th, team));
2515   __kmp_pop_current_task_from_thread(master_th);
2516 
2517 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
2518   // Restore master thread's partition.
2519   master_th->th.th_first_place = team->t.t_first_place;
2520   master_th->th.th_last_place = team->t.t_last_place;
2521 #endif /* OMP_40_ENABLED */
2522 
2523   updateHWFPControl(team);
2524 
2525   if (root->r.r_active != master_active)
2526     root->r.r_active = master_active;
2527 
2528   __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2529                             master_th)); // this will free worker threads
2530 
  /* This race was fun to find. Make sure the following is in the critical
     region; otherwise assertions may fail occasionally since the old team may
     be reallocated and the hierarchy appears inconsistent. It is actually safe
     to run and won't cause any bugs, but will cause those assertion failures.
     It's only one deref & assign, so it might as well go in the critical
     region. */
2536   master_th->th.th_team = parent_team;
2537   master_th->th.th_team_nproc = parent_team->t.t_nproc;
2538   master_th->th.th_team_master = parent_team->t.t_threads[0];
2539   master_th->th.th_team_serialized = parent_team->t.t_serialized;
2540 
2541   /* restore serialized team, if need be */
2542   if (parent_team->t.t_serialized &&
2543       parent_team != master_th->th.th_serial_team &&
2544       parent_team != root->r.r_root_team) {
2545     __kmp_free_team(root,
2546                     master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2547     master_th->th.th_serial_team = parent_team;
2548   }
2549 
2550   if (__kmp_tasking_mode != tskm_immediate_exec) {
2551     if (master_th->th.th_task_state_top >
2552         0) { // Restore task state from memo stack
2553       KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2554       // Remember master's state if we re-use this nested hot team
2555       master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2556           master_th->th.th_task_state;
2557       --master_th->th.th_task_state_top; // pop
2558       // Now restore state at this level
2559       master_th->th.th_task_state =
2560           master_th->th
2561               .th_task_state_memo_stack[master_th->th.th_task_state_top];
2562     }
2563     // Copy the task team from the parent team to the master thread
2564     master_th->th.th_task_team =
2565         parent_team->t.t_task_team[master_th->th.th_task_state];
2566     KA_TRACE(20,
2567              ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2568               __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2569               parent_team));
2570   }
2571 
2572   // TODO: GEH - cannot do this assertion because root thread not set up as
2573   // executing
2574   // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2575   master_th->th.th_current_task->td_flags.executing = 1;
2576 
2577   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2578 
2579 #if OMPT_SUPPORT
2580   if (ompt_enabled.enabled) {
2581     __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
2582                     codeptr);
2583   }
2584 #endif
2585 
2586   KMP_MB();
2587   KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2588 }
2589 
2590 /* Check whether we should push an internal control record onto the
2591    serial team stack.  If so, do it.  */
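/* Hedged illustration (the user code is hypothetical): inside a serialized
   nested region, calls such as

       omp_set_num_threads(8);
       omp_set_max_active_levels(2);

   first push the pre-change ICVs here, once per serialization level, so that
   __kmpc_end_serialized_parallel can restore them on exit. */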
2592 void __kmp_save_internal_controls(kmp_info_t *thread) {
2593 
2594   if (thread->th.th_team != thread->th.th_serial_team) {
2595     return;
2596   }
2597   if (thread->th.th_team->t.t_serialized > 1) {
2598     int push = 0;
2599 
2600     if (thread->th.th_team->t.t_control_stack_top == NULL) {
2601       push = 1;
2602     } else {
2603       if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2604           thread->th.th_team->t.t_serialized) {
2605         push = 1;
2606       }
2607     }
2608     if (push) { /* push a record on the serial team's stack */
2609       kmp_internal_control_t *control =
2610           (kmp_internal_control_t *)__kmp_allocate(
2611               sizeof(kmp_internal_control_t));
2612 
2613       copy_icvs(control, &thread->th.th_current_task->td_icvs);
2614 
2615       control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2616 
2617       control->next = thread->th.th_team->t.t_control_stack_top;
2618       thread->th.th_team->t.t_control_stack_top = control;
2619     }
2620   }
2621 }
2622 
2623 /* Changes set_nproc */
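/* Typically reached from omp_set_num_threads(); a minimal usage sketch
   (illustrative only):

       omp_set_num_threads(4);   // requests 4 threads for subsequent
       #pragma omp parallel      // parallel regions of this thread
       { ... }
*/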
2624 void __kmp_set_num_threads(int new_nth, int gtid) {
2625   kmp_info_t *thread;
2626   kmp_root_t *root;
2627 
2628   KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2629   KMP_DEBUG_ASSERT(__kmp_init_serial);
2630 
2631   if (new_nth < 1)
2632     new_nth = 1;
2633   else if (new_nth > __kmp_max_nth)
2634     new_nth = __kmp_max_nth;
2635 
2636   KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2637   thread = __kmp_threads[gtid];
2638 
2639   __kmp_save_internal_controls(thread);
2640 
2641   set__nproc(thread, new_nth);
2642 
2643   // If this omp_set_num_threads() call will cause the hot team size to be
2644   // reduced (in the absence of a num_threads clause), then reduce it now,
2645   // rather than waiting for the next parallel region.
2646   root = thread->th.th_root;
2647   if (__kmp_init_parallel && (!root->r.r_active) &&
2648       (root->r.r_hot_team->t.t_nproc > new_nth)
2649 #if KMP_NESTED_HOT_TEAMS
2650       && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2651 #endif
2652       ) {
2653     kmp_team_t *hot_team = root->r.r_hot_team;
2654     int f;
2655 
2656     __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2657 
2658     // Release the extra threads we don't need any more.
2659     for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2660       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2661       if (__kmp_tasking_mode != tskm_immediate_exec) {
        // When decreasing the team size, threads no longer in the team should
        // unreference the task team.
2664         hot_team->t.t_threads[f]->th.th_task_team = NULL;
2665       }
2666       __kmp_free_thread(hot_team->t.t_threads[f]);
2667       hot_team->t.t_threads[f] = NULL;
2668     }
2669     hot_team->t.t_nproc = new_nth;
2670 #if KMP_NESTED_HOT_TEAMS
2671     if (thread->th.th_hot_teams) {
2672       KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2673       thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2674     }
2675 #endif
2676 
2677     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2678 
2679     // Update the t_nproc field in the threads that are still active.
2680     for (f = 0; f < new_nth; f++) {
2681       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2682       hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2683     }
    // Special flag to indicate an omp_set_num_threads() call
2685     hot_team->t.t_size_changed = -1;
2686   }
2687 }
2688 
2689 /* Changes max_active_levels */
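/* Typically reached from omp_set_max_active_levels(); e.g. (illustrative):
       omp_set_max_active_levels(2);  // allow at most two active nested levels
   Negative values are ignored with a warning and values above
   KMP_MAX_ACTIVE_LEVELS_LIMIT are clamped, as validated below. */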
2690 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2691   kmp_info_t *thread;
2692 
2693   KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2694                 "%d = (%d)\n",
2695                 gtid, max_active_levels));
2696   KMP_DEBUG_ASSERT(__kmp_init_serial);
2697 
2698   // validate max_active_levels
2699   if (max_active_levels < 0) {
2700     KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2701     // We ignore this call if the user has specified a negative value.
2702     // The current setting won't be changed. The last valid setting will be
2703     // used. A warning will be issued (if warnings are allowed as controlled by
2704     // the KMP_WARNINGS env var).
2705     KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2706                   "max_active_levels for thread %d = (%d)\n",
2707                   gtid, max_active_levels));
2708     return;
2709   }
2710   if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2711     // it's OK, the max_active_levels is within the valid range: [ 0;
2712     // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2713     // We allow a zero value. (implementation defined behavior)
2714   } else {
2715     KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2716                 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2717     max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2718     // Current upper limit is MAX_INT. (implementation defined behavior)
2719     // If the input exceeds the upper limit, we correct the input to be the
2720     // upper limit. (implementation defined behavior)
    // Actually, the flow should never get here while the limit is MAX_INT.
2722   }
2723   KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2724                 "max_active_levels for thread %d = (%d)\n",
2725                 gtid, max_active_levels));
2726 
2727   thread = __kmp_threads[gtid];
2728 
2729   __kmp_save_internal_controls(thread);
2730 
2731   set__max_active_levels(thread, max_active_levels);
2732 }
2733 
2734 /* Gets max_active_levels */
2735 int __kmp_get_max_active_levels(int gtid) {
2736   kmp_info_t *thread;
2737 
2738   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2739   KMP_DEBUG_ASSERT(__kmp_init_serial);
2740 
2741   thread = __kmp_threads[gtid];
2742   KMP_DEBUG_ASSERT(thread->th.th_current_task);
2743   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2744                 "curtask_maxaclevel=%d\n",
2745                 gtid, thread->th.th_current_task,
2746                 thread->th.th_current_task->td_icvs.max_active_levels));
2747   return thread->th.th_current_task->td_icvs.max_active_levels;
2748 }
2749 
2750 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2751 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2752   kmp_info_t *thread;
2753   //    kmp_team_t *team;
2754 
2755   KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2756                 gtid, (int)kind, chunk));
2757   KMP_DEBUG_ASSERT(__kmp_init_serial);
2758 
2759   // Check if the kind parameter is valid, correct if needed.
2760   // Valid parameters should fit in one of two intervals - standard or extended:
2761   //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2762   // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
2763   if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2764       (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2765     // TODO: Hint needs attention in case we change the default schedule.
2766     __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2767               KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2768               __kmp_msg_null);
2769     kind = kmp_sched_default;
2770     chunk = 0; // ignore chunk value in case of bad kind
2771   }
2772 
2773   thread = __kmp_threads[gtid];
2774 
2775   __kmp_save_internal_controls(thread);
2776 
2777   if (kind < kmp_sched_upper_std) {
2778     if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
      // distinguish static chunked vs. unchunked: an invalid chunk indicates
      // the unchunked schedule (which is the default)
2781       thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2782     } else {
2783       thread->th.th_current_task->td_icvs.sched.r_sched_type =
2784           __kmp_sch_map[kind - kmp_sched_lower - 1];
2785     }
2786   } else {
2787     //    __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2788     //    kmp_sched_lower - 2 ];
2789     thread->th.th_current_task->td_icvs.sched.r_sched_type =
2790         __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2791                       kmp_sched_lower - 2];
2792   }
2793   if (kind == kmp_sched_auto || chunk < 1) {
2794     // ignore parameter chunk for schedule auto
2795     thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2796   } else {
2797     thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2798   }
2799 }
2800 
2801 /* Gets def_sched_var ICV values */
2802 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2803   kmp_info_t *thread;
2804   enum sched_type th_type;
2805 
2806   KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2807   KMP_DEBUG_ASSERT(__kmp_init_serial);
2808 
2809   thread = __kmp_threads[gtid];
2810 
2811   th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2812 
2813   switch (th_type) {
2814   case kmp_sch_static:
2815   case kmp_sch_static_greedy:
2816   case kmp_sch_static_balanced:
2817     *kind = kmp_sched_static;
    *chunk = 0; // chunk was not set; report zero to indicate that
2819     return;
2820   case kmp_sch_static_chunked:
2821     *kind = kmp_sched_static;
2822     break;
2823   case kmp_sch_dynamic_chunked:
2824     *kind = kmp_sched_dynamic;
2825     break;
2826   case kmp_sch_guided_chunked:
2827   case kmp_sch_guided_iterative_chunked:
2828   case kmp_sch_guided_analytical_chunked:
2829     *kind = kmp_sched_guided;
2830     break;
2831   case kmp_sch_auto:
2832     *kind = kmp_sched_auto;
2833     break;
2834   case kmp_sch_trapezoidal:
2835     *kind = kmp_sched_trapezoidal;
2836     break;
2837 #if KMP_STATIC_STEAL_ENABLED
2838   case kmp_sch_static_steal:
2839     *kind = kmp_sched_static_steal;
2840     break;
2841 #endif
2842   default:
2843     KMP_FATAL(UnknownSchedulingType, th_type);
2844   }
2845 
2846   *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2847 }
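
// Illustrative sketch (compiled out): how the ICV written by
// __kmp_set_schedule round-trips through __kmp_get_schedule, assuming the
// standard omp_set_schedule()/omp_get_schedule() entry points forward to
// these routines.
#if 0
#include <omp.h>
#include <stdio.h>

int main(void) {
  omp_sched_t kind;
  int chunk;

  omp_set_schedule(omp_sched_dynamic, 4); // maps to kmp_sch_dynamic_chunked
  omp_get_schedule(&kind, &chunk); // kind == omp_sched_dynamic, chunk == 4

  omp_set_schedule(omp_sched_static, 0); // chunk < 1: unchunked static
  omp_get_schedule(&kind, &chunk); // kind == omp_sched_static, chunk == 0

  printf("kind=%d chunk=%d\n", (int)kind, chunk);
  return 0;
}
#endif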
2848 
2849 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2850 
2851   int ii, dd;
2852   kmp_team_t *team;
2853   kmp_info_t *thr;
2854 
2855   KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2856   KMP_DEBUG_ASSERT(__kmp_init_serial);
2857 
2858   // validate level
2859   if (level == 0)
2860     return 0;
2861   if (level < 0)
2862     return -1;
2863   thr = __kmp_threads[gtid];
2864   team = thr->th.th_team;
2865   ii = team->t.t_level;
2866   if (level > ii)
2867     return -1;
2868 
2869 #if OMP_40_ENABLED
2870   if (thr->th.th_teams_microtask) {
    // AC: we are in a teams region where multiple nested teams have the same
    // level
    int tlevel = thr->th.th_teams_level; // the level of the teams construct
    if (level <= tlevel) {
      // otherwise the usual algorithm works (it does not touch the teams)
      KMP_DEBUG_ASSERT(ii >= tlevel);
      // AC: since we need to pass through the teams league, artificially
      // increase ii
      if (ii == tlevel) {
        ii += 2; // three teams have the same level
      } else {
        ii++; // two teams have the same level
2882       }
2883     }
2884   }
2885 #endif
2886 
2887   if (ii == level)
2888     return __kmp_tid_from_gtid(gtid);
2889 
2890   dd = team->t.t_serialized;
2891   level++;
2892   while (ii > level) {
2893     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2894     }
2895     if ((team->t.t_serialized) && (!dd)) {
2896       team = team->t.t_parent;
2897       continue;
2898     }
2899     if (ii > level) {
2900       team = team->t.t_parent;
2901       dd = team->t.t_serialized;
2902       ii--;
2903     }
2904   }
2905 
2906   return (dd > 1) ? (0) : (team->t.t_master_tid);
2907 }
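
// Illustrative sketch (compiled out): what the team walk above yields through
// the standard omp_get_ancestor_thread_num() entry point in a nested region
// (assuming that entry point forwards to __kmp_get_ancestor_thread_num).
#if 0
#include <omp.h>
#include <stdio.h>

int main(void) {
  omp_set_nested(1);
#pragma omp parallel num_threads(2)
  {
#pragma omp parallel num_threads(2)
    {
      // Level 0 is the initial (implicit) team, so its ancestor id is 0.
      // Level 1 is the outer region; level 2 is this innermost region.
      printf("outer tid=%d inner tid=%d\n",
             omp_get_ancestor_thread_num(1),
             omp_get_ancestor_thread_num(2)); // == omp_get_thread_num()
    }
  }
  return 0;
}
#endif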
2908 
2909 int __kmp_get_team_size(int gtid, int level) {
2910 
2911   int ii, dd;
2912   kmp_team_t *team;
2913   kmp_info_t *thr;
2914 
2915   KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2916   KMP_DEBUG_ASSERT(__kmp_init_serial);
2917 
2918   // validate level
2919   if (level == 0)
2920     return 1;
2921   if (level < 0)
2922     return -1;
2923   thr = __kmp_threads[gtid];
2924   team = thr->th.th_team;
2925   ii = team->t.t_level;
2926   if (level > ii)
2927     return -1;
2928 
2929 #if OMP_40_ENABLED
2930   if (thr->th.th_teams_microtask) {
    // AC: we are in a teams region where multiple nested teams have the same
    // level
    int tlevel = thr->th.th_teams_level; // the level of the teams construct
    if (level <= tlevel) {
      // otherwise the usual algorithm works (it does not touch the teams)
      KMP_DEBUG_ASSERT(ii >= tlevel);
      // AC: since we need to pass through the teams league, artificially
      // increase ii
      if (ii == tlevel) {
        ii += 2; // three teams have the same level
      } else {
        ii++; // two teams have the same level
2942       }
2943     }
2944   }
2945 #endif
2946 
2947   while (ii > level) {
2948     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2949     }
2950     if (team->t.t_serialized && (!dd)) {
2951       team = team->t.t_parent;
2952       continue;
2953     }
2954     if (ii > level) {
2955       team = team->t.t_parent;
2956       ii--;
2957     }
2958   }
2959 
2960   return team->t.t_nproc;
2961 }
2962 
2963 kmp_r_sched_t __kmp_get_schedule_global() {
  // This routine exists because the pairs (__kmp_sched, __kmp_chunk) and
  // (__kmp_static, __kmp_guided) may be changed independently by
  // kmp_set_defaults(), so one can obtain the up-to-date schedule here.
2967 
2968   kmp_r_sched_t r_sched;
2969 
  // Create the schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static
  // and __kmp_guided. __kmp_sched should keep its original value, so that the
  // user can set KMP_SCHEDULE multiple times and thus have different run-time
  // schedules in different roots (even in OMP 2.5).
2974   if (__kmp_sched == kmp_sch_static) {
2975     // replace STATIC with more detailed schedule (balanced or greedy)
2976     r_sched.r_sched_type = __kmp_static;
2977   } else if (__kmp_sched == kmp_sch_guided_chunked) {
2978     // replace GUIDED with more detailed schedule (iterative or analytical)
2979     r_sched.r_sched_type = __kmp_guided;
2980   } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2981     r_sched.r_sched_type = __kmp_sched;
2982   }
2983 
2984   if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
    // __kmp_chunk may be wrong here (if it was never set)
2986     r_sched.chunk = KMP_DEFAULT_CHUNK;
2987   } else {
2988     r_sched.chunk = __kmp_chunk;
2989   }
2990 
2991   return r_sched;
2992 }
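
// Illustrative sketch (compiled out): the refinement performed above, in
// isolation -- the generic STATIC/GUIDED kinds are replaced by the more
// detailed variants currently selected in __kmp_static/__kmp_guided, while
// any other kind passes through unchanged.
#if 0
static enum sched_type example_refine(enum sched_type sched,
                                      enum sched_type detailed_static,
                                      enum sched_type detailed_guided) {
  if (sched == kmp_sch_static)
    return detailed_static; // e.g. balanced or greedy
  if (sched == kmp_sch_guided_chunked)
    return detailed_guided; // e.g. iterative or analytical
  return sched;
}
#endif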
2993 
/* Allocate (realloc == FALSE) or reallocate (realloc == TRUE) at least argc
   entries in the requested team's t_argv array. */
2996 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
2997 
2998   KMP_DEBUG_ASSERT(team);
2999   if (!realloc || argc > team->t.t_max_argc) {
3000 
3001     KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3002                    "current entries=%d\n",
3003                    team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3004     /* if previously allocated heap space for args, free them */
3005     if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3006       __kmp_free((void *)team->t.t_argv);
3007 
3008     if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3009       /* use unused space in the cache line for arguments */
3010       team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3011       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3012                      "argv entries\n",
3013                      team->t.t_id, team->t.t_max_argc));
3014       team->t.t_argv = &team->t.t_inline_argv[0];
3015       if (__kmp_storage_map) {
3016         __kmp_print_storage_map_gtid(
3017             -1, &team->t.t_inline_argv[0],
3018             &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3019             (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3020             team->t.t_id);
3021       }
3022     } else {
3023       /* allocate space for arguments in the heap */
3024       team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3025                                ? KMP_MIN_MALLOC_ARGV_ENTRIES
3026                                : 2 * argc;
3027       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3028                      "argv entries\n",
3029                      team->t.t_id, team->t.t_max_argc));
3030       team->t.t_argv =
3031           (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3032       if (__kmp_storage_map) {
3033         __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3034                                      &team->t.t_argv[team->t.t_max_argc],
3035                                      sizeof(void *) * team->t.t_max_argc,
3036                                      "team_%d.t_argv", team->t.t_id);
3037       }
3038     }
3039   }
3040 }
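
// Illustrative sketch (compiled out): the capacity policy implemented above,
// in isolation. INLINE_ARGV_ENTRIES and MIN_MALLOC_ARGV_ENTRIES are
// placeholders; the real limits (KMP_INLINE_ARGV_ENTRIES,
// KMP_MIN_MALLOC_ARGV_ENTRIES) come from kmp.h and depend on cache-line size
// and pointer width.
#if 0
enum { INLINE_ARGV_ENTRIES = 10, MIN_MALLOC_ARGV_ENTRIES = 100 };

static int example_argv_capacity(int argc) {
  if (argc <= INLINE_ARGV_ENTRIES)
    return INLINE_ARGV_ENTRIES; // reuse spare space inside the team structure
  // Heap case: never go below the minimum, and grow geometrically (2 * argc)
  // so that repeated small increases do not cause repeated reallocation.
  return (argc <= (MIN_MALLOC_ARGV_ENTRIES >> 1)) ? MIN_MALLOC_ARGV_ENTRIES
                                                  : 2 * argc;
}
#endif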
3041 
3042 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3043   int i;
3044   int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3045   team->t.t_threads =
3046       (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3047   team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3048       sizeof(dispatch_shared_info_t) * num_disp_buff);
3049   team->t.t_dispatch =
3050       (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3051   team->t.t_implicit_task_taskdata =
3052       (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3053   team->t.t_max_nproc = max_nth;
3054 
3055   /* setup dispatch buffers */
3056   for (i = 0; i < num_disp_buff; ++i) {
3057     team->t.t_disp_buffer[i].buffer_index = i;
3058 #if OMP_45_ENABLED
3059     team->t.t_disp_buffer[i].doacross_buf_idx = i;
3060 #endif
3061   }
3062 }
3063 
3064 static void __kmp_free_team_arrays(kmp_team_t *team) {
3065   /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3066   int i;
3067   for (i = 0; i < team->t.t_max_nproc; ++i) {
3068     if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3069       __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3070       team->t.t_dispatch[i].th_disp_buffer = NULL;
3071     }
3072   }
3073   __kmp_free(team->t.t_threads);
3074   __kmp_free(team->t.t_disp_buffer);
3075   __kmp_free(team->t.t_dispatch);
3076   __kmp_free(team->t.t_implicit_task_taskdata);
3077   team->t.t_threads = NULL;
3078   team->t.t_disp_buffer = NULL;
3079   team->t.t_dispatch = NULL;
3080   team->t.t_implicit_task_taskdata = 0;
3081 }
3082 
3083 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3084   kmp_info_t **oldThreads = team->t.t_threads;
3085 
3086   __kmp_free(team->t.t_disp_buffer);
3087   __kmp_free(team->t.t_dispatch);
3088   __kmp_free(team->t.t_implicit_task_taskdata);
3089   __kmp_allocate_team_arrays(team, max_nth);
3090 
3091   KMP_MEMCPY(team->t.t_threads, oldThreads,
3092              team->t.t_nproc * sizeof(kmp_info_t *));
3093 
3094   __kmp_free(oldThreads);
3095 }
3096 
3097 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3098 
3099   kmp_r_sched_t r_sched =
3100       __kmp_get_schedule_global(); // get current state of scheduling globals
3101 
3102 #if OMP_40_ENABLED
3103   KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3104 #endif /* OMP_40_ENABLED */
3105 
3106   kmp_internal_control_t g_icvs = {
3107     0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3108     (kmp_int8)__kmp_dflt_nested, // int nested; //internal control
3109     // for nested parallelism (per thread)
3110     (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3111     // adjustment of threads (per thread)
3112     (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3113     // whether blocktime is explicitly set
3114     __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3115 #if KMP_USE_MONITOR
3116     __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3117 // intervals
3118 #endif
3119     __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3120     // next parallel region (per thread)
3121     // (use a max ub on value if __kmp_parallel_initialize not called yet)
3122     __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3123     // for max_active_levels
3124     r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3125 // {sched,chunk} pair
3126 #if OMP_40_ENABLED
3127     __kmp_nested_proc_bind.bind_types[0],
3128     __kmp_default_device,
3129 #endif /* OMP_40_ENABLED */
3130     NULL // struct kmp_internal_control *next;
3131   };
3132 
3133   return g_icvs;
3134 }
3135 
3136 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3137 
3138   kmp_internal_control_t gx_icvs;
3139   gx_icvs.serial_nesting_level =
3140       0; // probably =team->t.t_serial like in save_inter_controls
3141   copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3142   gx_icvs.next = NULL;
3143 
3144   return gx_icvs;
3145 }
3146 
3147 static void __kmp_initialize_root(kmp_root_t *root) {
3148   int f;
3149   kmp_team_t *root_team;
3150   kmp_team_t *hot_team;
3151   int hot_team_max_nth;
3152   kmp_r_sched_t r_sched =
3153       __kmp_get_schedule_global(); // get current state of scheduling globals
3154   kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3155   KMP_DEBUG_ASSERT(root);
3156   KMP_ASSERT(!root->r.r_begin);
3157 
3158   /* setup the root state structure */
3159   __kmp_init_lock(&root->r.r_begin_lock);
3160   root->r.r_begin = FALSE;
3161   root->r.r_active = FALSE;
3162   root->r.r_in_parallel = 0;
3163   root->r.r_blocktime = __kmp_dflt_blocktime;
3164   root->r.r_nested = __kmp_dflt_nested;
3165   root->r.r_cg_nthreads = 1;
3166 
3167   /* setup the root team for this task */
3168   /* allocate the root team structure */
3169   KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3170 
3171   root_team =
3172       __kmp_allocate_team(root,
3173                           1, // new_nproc
3174                           1, // max_nproc
3175 #if OMPT_SUPPORT
3176                           ompt_data_none, // root parallel id
3177 #endif
3178 #if OMP_40_ENABLED
3179                           __kmp_nested_proc_bind.bind_types[0],
3180 #endif
3181                           &r_icvs,
3182                           0 // argc
3183                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3184                           );
3185 #if USE_DEBUGGER
3186   // Non-NULL value should be assigned to make the debugger display the root
3187   // team.
3188   TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3189 #endif
3190 
3191   KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3192 
3193   root->r.r_root_team = root_team;
3194   root_team->t.t_control_stack_top = NULL;
3195 
3196   /* initialize root team */
3197   root_team->t.t_threads[0] = NULL;
3198   root_team->t.t_nproc = 1;
3199   root_team->t.t_serialized = 1;
3200   // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3201   root_team->t.t_sched.sched = r_sched.sched;
3202   KA_TRACE(
3203       20,
3204       ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3205        root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3206 
3207   /* setup the  hot team for this task */
3208   /* allocate the hot team structure */
3209   KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3210 
3211   hot_team =
3212       __kmp_allocate_team(root,
3213                           1, // new_nproc
3214                           __kmp_dflt_team_nth_ub * 2, // max_nproc
3215 #if OMPT_SUPPORT
3216                           ompt_data_none, // root parallel id
3217 #endif
3218 #if OMP_40_ENABLED
3219                           __kmp_nested_proc_bind.bind_types[0],
3220 #endif
3221                           &r_icvs,
3222                           0 // argc
3223                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3224                           );
3225   KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3226 
3227   root->r.r_hot_team = hot_team;
3228   root_team->t.t_control_stack_top = NULL;
3229 
3230   /* first-time initialization */
3231   hot_team->t.t_parent = root_team;
3232 
3233   /* initialize hot team */
3234   hot_team_max_nth = hot_team->t.t_max_nproc;
3235   for (f = 0; f < hot_team_max_nth; ++f) {
3236     hot_team->t.t_threads[f] = NULL;
3237   }
3238   hot_team->t.t_nproc = 1;
3239   // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3240   hot_team->t.t_sched.sched = r_sched.sched;
3241   hot_team->t.t_size_changed = 0;
3242 }
3243 
3244 #ifdef KMP_DEBUG
3245 
3246 typedef struct kmp_team_list_item {
3247   kmp_team_p const *entry;
3248   struct kmp_team_list_item *next;
3249 } kmp_team_list_item_t;
3250 typedef kmp_team_list_item_t *kmp_team_list_t;
3251 
3252 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3253     kmp_team_list_t list, // List of teams.
3254     kmp_team_p const *team // Team to add.
3255     ) {
3256 
3257   // List must terminate with item where both entry and next are NULL.
3258   // Team is added to the list only once.
3259   // List is sorted in ascending order by team id.
3260   // Team id is *not* a key.
3261 
3262   kmp_team_list_t l;
3263 
3264   KMP_DEBUG_ASSERT(list != NULL);
3265   if (team == NULL) {
3266     return;
3267   }
3268 
3269   __kmp_print_structure_team_accum(list, team->t.t_parent);
3270   __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3271 
3272   // Search list for the team.
3273   l = list;
3274   while (l->next != NULL && l->entry != team) {
3275     l = l->next;
3276   }
3277   if (l->next != NULL) {
3278     return; // Team has been added before, exit.
3279   }
3280 
3281   // Team is not found. Search list again for insertion point.
3282   l = list;
3283   while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3284     l = l->next;
3285   }
3286 
3287   // Insert team.
3288   {
3289     kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3290         sizeof(kmp_team_list_item_t));
3291     *item = *l;
3292     l->entry = team;
3293     l->next = item;
3294   }
3295 }
3296 
static void __kmp_print_structure_team(char const *title,
                                       kmp_team_p const *team) {
3300   __kmp_printf("%s", title);
3301   if (team != NULL) {
3302     __kmp_printf("%2x %p\n", team->t.t_id, team);
3303   } else {
3304     __kmp_printf(" - (nil)\n");
3305   }
3306 }
3307 
3308 static void __kmp_print_structure_thread(char const *title,
3309                                          kmp_info_p const *thread) {
3310   __kmp_printf("%s", title);
3311   if (thread != NULL) {
3312     __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3313   } else {
3314     __kmp_printf(" - (nil)\n");
3315   }
3316 }
3317 
3318 void __kmp_print_structure(void) {
3319 
3320   kmp_team_list_t list;
3321 
3322   // Initialize list of teams.
3323   list =
3324       (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3325   list->entry = NULL;
3326   list->next = NULL;
3327 
3328   __kmp_printf("\n------------------------------\nGlobal Thread "
3329                "Table\n------------------------------\n");
3330   {
3331     int gtid;
3332     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3333       __kmp_printf("%2d", gtid);
3334       if (__kmp_threads != NULL) {
3335         __kmp_printf(" %p", __kmp_threads[gtid]);
3336       }
3337       if (__kmp_root != NULL) {
3338         __kmp_printf(" %p", __kmp_root[gtid]);
3339       }
3340       __kmp_printf("\n");
3341     }
3342   }
3343 
3344   // Print out __kmp_threads array.
3345   __kmp_printf("\n------------------------------\nThreads\n--------------------"
3346                "----------\n");
3347   if (__kmp_threads != NULL) {
3348     int gtid;
3349     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3350       kmp_info_t const *thread = __kmp_threads[gtid];
3351       if (thread != NULL) {
3352         __kmp_printf("GTID %2d %p:\n", gtid, thread);
3353         __kmp_printf("    Our Root:        %p\n", thread->th.th_root);
3354         __kmp_print_structure_team("    Our Team:     ", thread->th.th_team);
3355         __kmp_print_structure_team("    Serial Team:  ",
3356                                    thread->th.th_serial_team);
3357         __kmp_printf("    Threads:      %2d\n", thread->th.th_team_nproc);
3358         __kmp_print_structure_thread("    Master:       ",
3359                                      thread->th.th_team_master);
3360         __kmp_printf("    Serialized?:  %2d\n", thread->th.th_team_serialized);
3361         __kmp_printf("    Set NProc:    %2d\n", thread->th.th_set_nproc);
3362 #if OMP_40_ENABLED
3363         __kmp_printf("    Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3364 #endif
3365         __kmp_print_structure_thread("    Next in pool: ",
3366                                      thread->th.th_next_pool);
3367         __kmp_printf("\n");
3368         __kmp_print_structure_team_accum(list, thread->th.th_team);
3369         __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3370       }
3371     }
3372   } else {
3373     __kmp_printf("Threads array is not allocated.\n");
3374   }
3375 
3376   // Print out __kmp_root array.
3377   __kmp_printf("\n------------------------------\nUbers\n----------------------"
3378                "--------\n");
3379   if (__kmp_root != NULL) {
3380     int gtid;
3381     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3382       kmp_root_t const *root = __kmp_root[gtid];
3383       if (root != NULL) {
3384         __kmp_printf("GTID %2d %p:\n", gtid, root);
3385         __kmp_print_structure_team("    Root Team:    ", root->r.r_root_team);
3386         __kmp_print_structure_team("    Hot Team:     ", root->r.r_hot_team);
3387         __kmp_print_structure_thread("    Uber Thread:  ",
3388                                      root->r.r_uber_thread);
3389         __kmp_printf("    Active?:      %2d\n", root->r.r_active);
3390         __kmp_printf("    Nested?:      %2d\n", root->r.r_nested);
3391         __kmp_printf("    In Parallel:  %2d\n", root->r.r_in_parallel);
3392         __kmp_printf("\n");
3393         __kmp_print_structure_team_accum(list, root->r.r_root_team);
3394         __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3395       }
3396     }
3397   } else {
3398     __kmp_printf("Ubers array is not allocated.\n");
3399   }
3400 
3401   __kmp_printf("\n------------------------------\nTeams\n----------------------"
3402                "--------\n");
3403   while (list->next != NULL) {
3404     kmp_team_p const *team = list->entry;
3405     int i;
3406     __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3407     __kmp_print_structure_team("    Parent Team:      ", team->t.t_parent);
3408     __kmp_printf("    Master TID:       %2d\n", team->t.t_master_tid);
3409     __kmp_printf("    Max threads:      %2d\n", team->t.t_max_nproc);
3410     __kmp_printf("    Levels of serial: %2d\n", team->t.t_serialized);
3411     __kmp_printf("    Number threads:   %2d\n", team->t.t_nproc);
3412     for (i = 0; i < team->t.t_nproc; ++i) {
3413       __kmp_printf("    Thread %2d:      ", i);
3414       __kmp_print_structure_thread("", team->t.t_threads[i]);
3415     }
3416     __kmp_print_structure_team("    Next in pool:     ", team->t.t_next_pool);
3417     __kmp_printf("\n");
3418     list = list->next;
3419   }
3420 
3421   // Print out __kmp_thread_pool and __kmp_team_pool.
3422   __kmp_printf("\n------------------------------\nPools\n----------------------"
3423                "--------\n");
3424   __kmp_print_structure_thread("Thread pool:          ",
3425                                CCAST(kmp_info_t *, __kmp_thread_pool));
3426   __kmp_print_structure_team("Team pool:            ",
3427                              CCAST(kmp_team_t *, __kmp_team_pool));
3428   __kmp_printf("\n");
3429 
3430   // Free team list.
3431   while (list != NULL) {
3432     kmp_team_list_item_t *item = list;
3433     list = list->next;
3434     KMP_INTERNAL_FREE(item);
3435   }
3436 }
3437 
3438 #endif
3439 
3440 //---------------------------------------------------------------------------
3441 //  Stuff for per-thread fast random number generator
3442 //  Table of primes
3443 static const unsigned __kmp_primes[] = {
3444     0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3445     0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3446     0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3447     0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3448     0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3449     0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3450     0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3451     0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3452     0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3453     0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3454     0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3455 
3456 //---------------------------------------------------------------------------
3457 //  __kmp_get_random: Get a random number using a linear congruential method.
3458 unsigned short __kmp_get_random(kmp_info_t *thread) {
3459   unsigned x = thread->th.th_x;
3460   unsigned short r = x >> 16;
3461 
3462   thread->th.th_x = x * thread->th.th_a + 1;
3463 
3464   KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3465                 thread->th.th_info.ds.ds_tid, r));
3466 
3467   return r;
3468 }
3469 //--------------------------------------------------------
3470 // __kmp_init_random: Initialize a random number generator
3471 void __kmp_init_random(kmp_info_t *thread) {
3472   unsigned seed = thread->th.th_info.ds.ds_tid;
3473 
3474   thread->th.th_a =
3475       __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3476   thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3477   KA_TRACE(30,
3478            ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3479 }
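
// Illustrative sketch (compiled out): a stand-alone model of the per-thread
// generator above -- the multiplier is picked from __kmp_primes by thread id,
// the 32-bit state advances as x = a * x + 1 (mod 2^32), and each draw returns
// the high 16 bits of the state before the step.
#if 0
static unsigned example_a, example_x;

static void example_init_random(unsigned tid) {
  example_a =
      __kmp_primes[tid % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
  example_x = (tid + 1) * example_a + 1;
}

static unsigned short example_get_random(void) {
  unsigned short r = (unsigned short)(example_x >> 16); // high 16 bits
  example_x = example_x * example_a + 1; // linear congruential step
  return r;
}
#endif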
3480 
3481 #if KMP_OS_WINDOWS
3482 /* reclaim array entries for root threads that are already dead, returns number
3483  * reclaimed */
3484 static int __kmp_reclaim_dead_roots(void) {
3485   int i, r = 0;
3486 
3487   for (i = 0; i < __kmp_threads_capacity; ++i) {
3488     if (KMP_UBER_GTID(i) &&
3489         !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3490         !__kmp_root[i]
3491              ->r.r_active) { // AC: reclaim only roots died in non-active state
3492       r += __kmp_unregister_root_other_thread(i);
3493     }
3494   }
3495   return r;
3496 }
3497 #endif
3498 
3499 /* This function attempts to create free entries in __kmp_threads and
3500    __kmp_root, and returns the number of free entries generated.
3501 
3502    For Windows* OS static library, the first mechanism used is to reclaim array
3503    entries for root threads that are already dead.
3504 
   On all platforms, expansion is attempted on the arrays __kmp_threads and
3506    __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3507    capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3508    threadprivate cache array has been created. Synchronization with
3509    __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3510 
   After any dead root reclamation, if the clipping value allows an expansion
   that yields a total of nNeed free slots, the function performs that
   expansion. Otherwise, nothing is done beyond the possible initial root
   thread reclamation.
3515 
3516    If any argument is negative, the behavior is undefined. */
3517 static int __kmp_expand_threads(int nNeed) {
3518   int added = 0;
3519   int minimumRequiredCapacity;
3520   int newCapacity;
3521   kmp_info_t **newThreads;
3522   kmp_root_t **newRoot;
3523 
3524 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3525 // resizing __kmp_threads does not need additional protection if foreign
3526 // threads are present
3527 
3528 #if KMP_OS_WINDOWS && !defined KMP_DYNAMIC_LIB
3529   /* only for Windows static library */
3530   /* reclaim array entries for root threads that are already dead */
3531   added = __kmp_reclaim_dead_roots();
3532 
3533   if (nNeed) {
3534     nNeed -= added;
3535     if (nNeed < 0)
3536       nNeed = 0;
3537   }
3538 #endif
3539   if (nNeed <= 0)
3540     return added;
3541 
3542   // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3543   // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3544   // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3545   // > __kmp_max_nth in one of two ways:
3546   //
3547   // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
  //    may not be reused by another thread, so we may need to increase
3549   //    __kmp_threads_capacity to __kmp_max_nth + 1.
3550   //
3551   // 2) New foreign root(s) are encountered.  We always register new foreign
3552   //    roots. This may cause a smaller # of threads to be allocated at
3553   //    subsequent parallel regions, but the worker threads hang around (and
3554   //    eventually go to sleep) and need slots in the __kmp_threads[] array.
3555   //
3556   // Anyway, that is the reason for moving the check to see if
3557   // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3558   // instead of having it performed here. -BB
3559 
3560   KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3561 
3562   /* compute expansion headroom to check if we can expand */
3563   if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3564     /* possible expansion too small -- give up */
3565     return added;
3566   }
3567   minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3568 
3569   newCapacity = __kmp_threads_capacity;
3570   do {
3571     newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3572                                                           : __kmp_sys_max_nth;
3573   } while (newCapacity < minimumRequiredCapacity);
3574   newThreads = (kmp_info_t **)__kmp_allocate(
3575       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3576   newRoot =
3577       (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3578   KMP_MEMCPY(newThreads, __kmp_threads,
3579              __kmp_threads_capacity * sizeof(kmp_info_t *));
3580   KMP_MEMCPY(newRoot, __kmp_root,
3581              __kmp_threads_capacity * sizeof(kmp_root_t *));
3582 
3583   kmp_info_t **temp_threads = __kmp_threads;
3584   *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3585   *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3586   __kmp_free(temp_threads);
3587   added += newCapacity - __kmp_threads_capacity;
3588   *(volatile int *)&__kmp_threads_capacity = newCapacity;
3589 
3590   if (newCapacity > __kmp_tp_capacity) {
3591     __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3592     if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3593       __kmp_threadprivate_resize_cache(newCapacity);
3594     } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3595       *(volatile int *)&__kmp_tp_capacity = newCapacity;
3596     }
3597     __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3598   }
3599 
3600   return added;
3601 }
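
// Illustrative sketch (compiled out): the growth rule used above, in
// isolation -- double the capacity until the requested headroom fits,
// clipping at sys_max_nth. It assumes the caller has already checked that
// sys_max_nth - current >= needed, as __kmp_expand_threads does.
#if 0
static int example_new_capacity(int current, int needed, int sys_max_nth) {
  int required = current + needed;
  int capacity = current;
  do {
    capacity =
        (capacity <= (sys_max_nth >> 1)) ? (capacity << 1) : sys_max_nth;
  } while (capacity < required);
  return capacity;
}
#endif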
3602 
3603 /* Register the current thread as a root thread and obtain our gtid. We must
   have the __kmp_initz_lock held at this point. The argument is TRUE only if
   we are the thread that calls from __kmp_do_serial_initialize(). */
3606 int __kmp_register_root(int initial_thread) {
3607   kmp_info_t *root_thread;
3608   kmp_root_t *root;
3609   int gtid;
3610   int capacity;
3611   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3612   KA_TRACE(20, ("__kmp_register_root: entered\n"));
3613   KMP_MB();
3614 
  /* 2007-03-02:
     If the initial thread has not invoked the OpenMP RTL yet, and this thread
     is not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity"
     condition does not work as expected -- it may return false (meaning there
     is at least one empty slot in the __kmp_threads array), while it is
     possible that the only free slot is #0, which is reserved for the initial
     thread and so cannot be used for this one. The following code works around
     this bug.

     However, the right solution seems to be not reserving slot #0 for the
     initial thread because:
     (1) there is no magic in slot #0,
     (2) we cannot detect the initial thread reliably (the first thread that
         performs serial initialization may not be a real initial thread).
  */
3629   capacity = __kmp_threads_capacity;
3630   if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3631     --capacity;
3632   }
3633 
3634   /* see if there are too many threads */
3635   if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3636     if (__kmp_tp_cached) {
3637       __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3638                   KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3639                   KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3640     } else {
3641       __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3642                   __kmp_msg_null);
3643     }
3644   }
3645 
3646   /* find an available thread slot */
3647   /* Don't reassign the zero slot since we need that to only be used by initial
3648      thread */
3649   for (gtid = (initial_thread ? 0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL;
3650        gtid++)
3651     ;
3652   KA_TRACE(1,
3653            ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3654   KMP_ASSERT(gtid < __kmp_threads_capacity);
3655 
3656   /* update global accounting */
3657   __kmp_all_nth++;
3658   TCW_4(__kmp_nth, __kmp_nth + 1);
3659 
3660   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3661   // numbers of procs, and method #2 (keyed API call) for higher numbers.
3662   if (__kmp_adjust_gtid_mode) {
3663     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3664       if (TCR_4(__kmp_gtid_mode) != 2) {
3665         TCW_4(__kmp_gtid_mode, 2);
3666       }
3667     } else {
3668       if (TCR_4(__kmp_gtid_mode) != 1) {
3669         TCW_4(__kmp_gtid_mode, 1);
3670       }
3671     }
3672   }
3673 
3674 #ifdef KMP_ADJUST_BLOCKTIME
3675   /* Adjust blocktime to zero if necessary            */
3676   /* Middle initialization might not have occurred yet */
3677   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3678     if (__kmp_nth > __kmp_avail_proc) {
3679       __kmp_zero_bt = TRUE;
3680     }
3681   }
3682 #endif /* KMP_ADJUST_BLOCKTIME */
3683 
3684   /* setup this new hierarchy */
3685   if (!(root = __kmp_root[gtid])) {
3686     root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3687     KMP_DEBUG_ASSERT(!root->r.r_root_team);
3688   }
3689 
3690 #if KMP_STATS_ENABLED
3691   // Initialize stats as soon as possible (right after gtid assignment).
3692   __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3693   KMP_START_EXPLICIT_TIMER(OMP_worker_thread_life);
3694   KMP_SET_THREAD_STATE(SERIAL_REGION);
3695   KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3696 #endif
3697   __kmp_initialize_root(root);
3698 
3699   /* setup new root thread structure */
3700   if (root->r.r_uber_thread) {
3701     root_thread = root->r.r_uber_thread;
3702   } else {
3703     root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3704     if (__kmp_storage_map) {
3705       __kmp_print_thread_storage_map(root_thread, gtid);
3706     }
3707     root_thread->th.th_info.ds.ds_gtid = gtid;
3708 #if OMPT_SUPPORT
3709     root_thread->th.ompt_thread_info.thread_data.ptr = NULL;
3710 #endif
3711     root_thread->th.th_root = root;
3712     if (__kmp_env_consistency_check) {
3713       root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3714     }
3715 #if USE_FAST_MEMORY
3716     __kmp_initialize_fast_memory(root_thread);
3717 #endif /* USE_FAST_MEMORY */
3718 
3719 #if KMP_USE_BGET
3720     KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3721     __kmp_initialize_bget(root_thread);
3722 #endif
3723     __kmp_init_random(root_thread); // Initialize random number generator
3724   }
3725 
3726   /* setup the serial team held in reserve by the root thread */
3727   if (!root_thread->th.th_serial_team) {
3728     kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3729     KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3730     root_thread->th.th_serial_team =
3731         __kmp_allocate_team(root, 1, 1,
3732 #if OMPT_SUPPORT
3733                             ompt_data_none, // root parallel id
3734 #endif
3735 #if OMP_40_ENABLED
3736                             proc_bind_default,
3737 #endif
3738                             &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3739   }
3740   KMP_ASSERT(root_thread->th.th_serial_team);
3741   KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3742                 root_thread->th.th_serial_team));
3743 
3744   /* drop root_thread into place */
3745   TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3746 
3747   root->r.r_root_team->t.t_threads[0] = root_thread;
3748   root->r.r_hot_team->t.t_threads[0] = root_thread;
3749   root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
  // AC: the team is created in reserve, not for execution (it is unused now).
3751   root_thread->th.th_serial_team->t.t_serialized = 0;
3752   root->r.r_uber_thread = root_thread;
3753 
3754   /* initialize the thread, get it ready to go */
3755   __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3756   TCW_4(__kmp_init_gtid, TRUE);
3757 
3758   /* prepare the master thread for get_gtid() */
3759   __kmp_gtid_set_specific(gtid);
3760 
3761 #if USE_ITT_BUILD
3762   __kmp_itt_thread_name(gtid);
3763 #endif /* USE_ITT_BUILD */
3764 
3765 #ifdef KMP_TDATA_GTID
3766   __kmp_gtid = gtid;
3767 #endif
3768   __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3769   KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3770 
3771   KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3772                 "plain=%u\n",
3773                 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3774                 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3775                 KMP_INIT_BARRIER_STATE));
3776   { // Initialize barrier data.
3777     int b;
3778     for (b = 0; b < bs_last_barrier; ++b) {
3779       root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3780 #if USE_DEBUGGER
3781       root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3782 #endif
3783     }
3784   }
3785   KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3786                    KMP_INIT_BARRIER_STATE);
3787 
3788 #if KMP_AFFINITY_SUPPORTED
3789 #if OMP_40_ENABLED
3790   root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3791   root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3792   root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3793   root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3794 #endif
3795 
3796   if (TCR_4(__kmp_init_middle)) {
3797     __kmp_affinity_set_init_mask(gtid, TRUE);
3798   }
3799 #endif /* KMP_AFFINITY_SUPPORTED */
3800 
3801   __kmp_root_counter++;
3802 
3803 #if OMPT_SUPPORT
3804   if (!initial_thread && ompt_enabled.enabled) {
3805 
3806     ompt_thread_t *root_thread = ompt_get_thread();
3807 
3808     ompt_set_thread_state(root_thread, omp_state_overhead);
3809 
3810     if (ompt_enabled.ompt_callback_thread_begin) {
3811       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3812           ompt_thread_initial, __ompt_get_thread_data_internal());
3813     }
3814     ompt_data_t *task_data;
3815     __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
3816     if (ompt_enabled.ompt_callback_task_create) {
3817       ompt_callbacks.ompt_callback(ompt_callback_task_create)(
3818           NULL, NULL, task_data, ompt_task_initial, 0, NULL);
3819       // initial task has nothing to return to
3820     }
3821 
3822     ompt_set_thread_state(root_thread, omp_state_work_serial);
3823   }
3824 #endif
3825 
3826   KMP_MB();
3827   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3828 
3829   return gtid;
3830 }
3831 
3832 #if KMP_NESTED_HOT_TEAMS
3833 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3834                                 const int max_level) {
3835   int i, n, nth;
3836   kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3837   if (!hot_teams || !hot_teams[level].hot_team) {
3838     return 0;
3839   }
3840   KMP_DEBUG_ASSERT(level < max_level);
3841   kmp_team_t *team = hot_teams[level].hot_team;
3842   nth = hot_teams[level].hot_team_nth;
3843   n = nth - 1; // master is not freed
3844   if (level < max_level - 1) {
3845     for (i = 0; i < nth; ++i) {
3846       kmp_info_t *th = team->t.t_threads[i];
3847       n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3848       if (i > 0 && th->th.th_hot_teams) {
3849         __kmp_free(th->th.th_hot_teams);
3850         th->th.th_hot_teams = NULL;
3851       }
3852     }
3853   }
3854   __kmp_free_team(root, team, NULL);
3855   return n;
3856 }
3857 #endif
3858 
// Resets a root thread and clears its root and hot teams.
3860 // Returns the number of __kmp_threads entries directly and indirectly freed.
3861 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3862   kmp_team_t *root_team = root->r.r_root_team;
3863   kmp_team_t *hot_team = root->r.r_hot_team;
3864   int n = hot_team->t.t_nproc;
3865   int i;
3866 
3867   KMP_DEBUG_ASSERT(!root->r.r_active);
3868 
3869   root->r.r_root_team = NULL;
3870   root->r.r_hot_team = NULL;
3871   // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3872   // before call to __kmp_free_team().
3873   __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3874 #if KMP_NESTED_HOT_TEAMS
3875   if (__kmp_hot_teams_max_level >
3876       0) { // need to free nested hot teams and their threads if any
3877     for (i = 0; i < hot_team->t.t_nproc; ++i) {
3878       kmp_info_t *th = hot_team->t.t_threads[i];
3879       if (__kmp_hot_teams_max_level > 1) {
3880         n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3881       }
3882       if (th->th.th_hot_teams) {
3883         __kmp_free(th->th.th_hot_teams);
3884         th->th.th_hot_teams = NULL;
3885       }
3886     }
3887   }
3888 #endif
3889   __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3890 
3891   // Before we can reap the thread, we need to make certain that all other
3892   // threads in the teams that had this root as ancestor have stopped trying to
3893   // steal tasks.
3894   if (__kmp_tasking_mode != tskm_immediate_exec) {
3895     __kmp_wait_to_unref_task_teams();
3896   }
3897 
3898 #if KMP_OS_WINDOWS
3899   /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3900   KA_TRACE(
3901       10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3902            "\n",
3903            (LPVOID) & (root->r.r_uber_thread->th),
3904            root->r.r_uber_thread->th.th_info.ds.ds_thread));
3905   __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3906 #endif /* KMP_OS_WINDOWS */
3907 
3908 #if OMPT_SUPPORT
3909   if (ompt_enabled.ompt_callback_thread_end) {
3910     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3911         &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3912   }
3913 #endif
3914 
3915   TCW_4(__kmp_nth,
3916         __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3917   root->r.r_cg_nthreads--;
3918 
3919   __kmp_reap_thread(root->r.r_uber_thread, 1);
3920 
  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
  // it instead of freeing it.
3923   root->r.r_uber_thread = NULL;
3924   /* mark root as no longer in use */
3925   root->r.r_begin = FALSE;
3926 
3927   return n;
3928 }
3929 
3930 void __kmp_unregister_root_current_thread(int gtid) {
3931   KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
3932   /* this lock should be ok, since unregister_root_current_thread is never
3933      called during an abort, only during a normal close. furthermore, if you
3934      have the forkjoin lock, you should never try to get the initz lock */
3935   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3936   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
3937     KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
3938                   "exiting T#%d\n",
3939                   gtid));
3940     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3941     return;
3942   }
3943   kmp_root_t *root = __kmp_root[gtid];
3944 
3945   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3946   KMP_ASSERT(KMP_UBER_GTID(gtid));
3947   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3948   KMP_ASSERT(root->r.r_active == FALSE);
3949 
3950   KMP_MB();
3951 
3952 #if OMP_45_ENABLED
3953   kmp_info_t *thread = __kmp_threads[gtid];
3954   kmp_team_t *team = thread->th.th_team;
3955   kmp_task_team_t *task_team = thread->th.th_task_team;
3956 
3957   // we need to wait for the proxy tasks before finishing the thread
3958   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
3959 #if OMPT_SUPPORT
3960     // the runtime is shutting down so we won't report any events
3961     thread->th.ompt_thread_info.state = omp_state_undefined;
3962 #endif
3963     __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
3964   }
3965 #endif
3966 
3967   __kmp_reset_root(gtid, root);
3968 
3969   /* free up this thread slot */
3970   __kmp_gtid_set_specific(KMP_GTID_DNE);
3971 #ifdef KMP_TDATA_GTID
3972   __kmp_gtid = KMP_GTID_DNE;
3973 #endif
3974 
3975   KMP_MB();
3976   KC_TRACE(10,
3977            ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
3978 
3979   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3980 }
3981 
3982 #if KMP_OS_WINDOWS
/* __kmp_forkjoin_lock must already be held.
   Unregisters a root thread that is not the current thread. Returns the number
3985    of __kmp_threads entries freed as a result. */
3986 static int __kmp_unregister_root_other_thread(int gtid) {
3987   kmp_root_t *root = __kmp_root[gtid];
3988   int r;
3989 
3990   KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
3991   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3992   KMP_ASSERT(KMP_UBER_GTID(gtid));
3993   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3994   KMP_ASSERT(root->r.r_active == FALSE);
3995 
3996   r = __kmp_reset_root(gtid, root);
3997   KC_TRACE(10,
3998            ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
3999   return r;
4000 }
4001 #endif
4002 
4003 #if KMP_DEBUG
4004 void __kmp_task_info() {
4005 
4006   kmp_int32 gtid = __kmp_entry_gtid();
4007   kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4008   kmp_info_t *this_thr = __kmp_threads[gtid];
4009   kmp_team_t *steam = this_thr->th.th_serial_team;
4010   kmp_team_t *team = this_thr->th.th_team;
4011 
4012   __kmp_printf("__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p curtask=%p "
4013                "ptask=%p\n",
4014                gtid, tid, this_thr, team, this_thr->th.th_current_task,
4015                team->t.t_implicit_task_taskdata[tid].td_parent);
4016 }
4017 #endif // KMP_DEBUG
4018 
4019 /* TODO optimize with one big memclr, take out what isn't needed, split
4020    responsibility to workers as much as possible, and delay initialization of
4021    features as much as possible  */
4022 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4023                                   int tid, int gtid) {
4024   /* this_thr->th.th_info.ds.ds_gtid is setup in
4025      kmp_allocate_thread/create_worker.
4026      this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4027   kmp_info_t *master = team->t.t_threads[0];
4028   KMP_DEBUG_ASSERT(this_thr != NULL);
4029   KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4030   KMP_DEBUG_ASSERT(team);
4031   KMP_DEBUG_ASSERT(team->t.t_threads);
4032   KMP_DEBUG_ASSERT(team->t.t_dispatch);
4033   KMP_DEBUG_ASSERT(master);
4034   KMP_DEBUG_ASSERT(master->th.th_root);
4035 
4036   KMP_MB();
4037 
4038   TCW_SYNC_PTR(this_thr->th.th_team, team);
4039 
4040   this_thr->th.th_info.ds.ds_tid = tid;
4041   this_thr->th.th_set_nproc = 0;
4042   if (__kmp_tasking_mode != tskm_immediate_exec)
4043     // When tasking is possible, threads are not safe to reap until they are
4044     // done tasking; this will be set when tasking code is exited in wait
4045     this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4046   else // no tasking --> always safe to reap
4047     this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4048 #if OMP_40_ENABLED
4049   this_thr->th.th_set_proc_bind = proc_bind_default;
4050 #if KMP_AFFINITY_SUPPORTED
4051   this_thr->th.th_new_place = this_thr->th.th_current_place;
4052 #endif
4053 #endif
4054   this_thr->th.th_root = master->th.th_root;
4055 
4056   /* setup the thread's cache of the team structure */
4057   this_thr->th.th_team_nproc = team->t.t_nproc;
4058   this_thr->th.th_team_master = master;
4059   this_thr->th.th_team_serialized = team->t.t_serialized;
4060   TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4061 
4062   KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4063 
4064   KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4065                 tid, gtid, this_thr, this_thr->th.th_current_task));
4066 
4067   __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4068                            team, tid, TRUE);
4069 
4070   KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4071                 tid, gtid, this_thr, this_thr->th.th_current_task));
4072   // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4073   // __kmp_initialize_team()?
4074 
4075   /* TODO no worksharing in speculative threads */
4076   this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4077 
4078   this_thr->th.th_local.this_construct = 0;
4079 
4080   if (!this_thr->th.th_pri_common) {
4081     this_thr->th.th_pri_common =
4082         (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4083     if (__kmp_storage_map) {
4084       __kmp_print_storage_map_gtid(
4085           gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4086           sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4087     }
4088     this_thr->th.th_pri_head = NULL;
4089   }
4090 
4091   /* Initialize dynamic dispatch */
4092   {
4093     volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4094     // Use team max_nproc since this will never change for the team.
4095     size_t disp_size =
4096         sizeof(dispatch_private_info_t) *
4097         (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4098     KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4099                   team->t.t_max_nproc));
4100     KMP_ASSERT(dispatch);
4101     KMP_DEBUG_ASSERT(team->t.t_dispatch);
4102     KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4103 
4104     dispatch->th_disp_index = 0;
4105 #if OMP_45_ENABLED
4106     dispatch->th_doacross_buf_idx = 0;
4107 #endif
4108     if (!dispatch->th_disp_buffer) {
4109       dispatch->th_disp_buffer =
4110           (dispatch_private_info_t *)__kmp_allocate(disp_size);
4111 
4112       if (__kmp_storage_map) {
4113         __kmp_print_storage_map_gtid(
4114             gtid, &dispatch->th_disp_buffer[0],
4115             &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4116                                           ? 1
4117                                           : __kmp_dispatch_num_buffers],
4118             disp_size, "th_%d.th_dispatch.th_disp_buffer "
4119                        "(team_%d.t_dispatch[%d].th_disp_buffer)",
4120             gtid, team->t.t_id, gtid);
4121       }
4122     } else {
4123       memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4124     }
4125 
4126     dispatch->th_dispatch_pr_current = 0;
4127     dispatch->th_dispatch_sh_current = 0;
4128 
4129     dispatch->th_deo_fcn = 0; /* ORDERED     */
4130     dispatch->th_dxo_fcn = 0; /* END ORDERED */
4131   }
4132 
4133   this_thr->th.th_next_pool = NULL;
4134 
4135   if (!this_thr->th.th_task_state_memo_stack) {
4136     size_t i;
4137     this_thr->th.th_task_state_memo_stack =
4138         (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4139     this_thr->th.th_task_state_top = 0;
4140     this_thr->th.th_task_state_stack_sz = 4;
4141     for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4142          ++i) // zero init the stack
4143       this_thr->th.th_task_state_memo_stack[i] = 0;
4144   }
4145 
4146   KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4147   KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4148 
4149   KMP_MB();
4150 }
4151 
/* Allocate a new thread for the requesting team. This is only called from
   within a fork/join critical section. We first try to get an available thread
   from the thread pool; if none is available, we fork a new one, assuming we
   are able to create one. This should be assured, as the caller should have
   checked for that first. */
4157 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4158                                   int new_tid) {
4159   kmp_team_t *serial_team;
4160   kmp_info_t *new_thr;
4161   int new_gtid;
4162 
4163   KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4164   KMP_DEBUG_ASSERT(root && team);
4165 #if !KMP_NESTED_HOT_TEAMS
4166   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4167 #endif
4168   KMP_MB();
4169 
4170   /* first, try to get one from the thread pool */
4171   if (__kmp_thread_pool) {
4172 
4173     new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4174     __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4175     if (new_thr == __kmp_thread_pool_insert_pt) {
4176       __kmp_thread_pool_insert_pt = NULL;
4177     }
4178     TCW_4(new_thr->th.th_in_pool, FALSE);
4179     // Don't touch th_active_in_pool or th_active.
4180     // The worker thread adjusts those flags as it sleeps/awakens.
4181     __kmp_thread_pool_nth--;
4182 
4183     KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4184                   __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4185     KMP_ASSERT(!new_thr->th.th_team);
4186     KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4187     KMP_DEBUG_ASSERT(__kmp_thread_pool_nth >= 0);
4188 
4189     /* setup the thread structure */
4190     __kmp_initialize_info(new_thr, team, new_tid,
4191                           new_thr->th.th_info.ds.ds_gtid);
4192     KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4193 
4194     TCW_4(__kmp_nth, __kmp_nth + 1);
4195     root->r.r_cg_nthreads++;
4196 
4197     new_thr->th.th_task_state = 0;
4198     new_thr->th.th_task_state_top = 0;
4199     new_thr->th.th_task_state_stack_sz = 4;
4200 
4201 #ifdef KMP_ADJUST_BLOCKTIME
4202     /* Adjust blocktime back to zero if necessary */
4203     /* Middle initialization might not have occurred yet */
4204     if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4205       if (__kmp_nth > __kmp_avail_proc) {
4206         __kmp_zero_bt = TRUE;
4207       }
4208     }
4209 #endif /* KMP_ADJUST_BLOCKTIME */
4210 
4211 #if KMP_DEBUG
4212     // If the thread entered the pool via __kmp_free_thread, wait_flag should
4213     // not be KMP_BARRIER_PARENT_FLAG.
4214     int b;
4215     kmp_balign_t *balign = new_thr->th.th_bar;
4216     for (b = 0; b < bs_last_barrier; ++b)
4217       KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4218 #endif
4219 
4220     KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4221                   __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4222 
4223     KMP_MB();
4224     return new_thr;
4225   }
4226 
4227   /* No thread available in the pool, so we'll fork a new one. */
4228   KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4229   KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4230 
4231 #if KMP_USE_MONITOR
4232   // If this is the first worker thread the RTL is creating, then also
4233   // launch the monitor thread.  We try to do this as early as possible.
4234   if (!TCR_4(__kmp_init_monitor)) {
4235     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4236     if (!TCR_4(__kmp_init_monitor)) {
4237       KF_TRACE(10, ("before __kmp_create_monitor\n"));
4238       TCW_4(__kmp_init_monitor, 1);
4239       __kmp_create_monitor(&__kmp_monitor);
4240       KF_TRACE(10, ("after __kmp_create_monitor\n"));
4241 #if KMP_OS_WINDOWS
4242       // AC: wait until monitor has started. This is a fix for CQ232808.
4243       // The reason is that if the library is loaded/unloaded in a loop with
4244       // small (parallel) work in between, there is a high probability that the
4245       // monitor thread will only start after the library has shut down. At
4246       // shutdown it is too late to cope with the problem, because when the
4247       // master is in DllMain (process detach) the monitor has no chance to
4248       // start (it is blocked), and the master has no means to inform the
4249       // monitor that the library has gone, because all the memory the monitor
4250       // can access is going to be released/reset.
4251       while (TCR_4(__kmp_init_monitor) < 2) {
4252         KMP_YIELD(TRUE);
4253       }
4254       KF_TRACE(10, ("after monitor thread has started\n"));
4255 #endif
4256     }
4257     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4258   }
4259 #endif
4260 
4261   KMP_MB();
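  // Find the lowest unused gtid slot, starting at 1 (gtid 0 is never given to
  // a worker here); the capacity assert above guarantees a free slot exists.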
4262   for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) {
4263     KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4264   }
4265 
4266   /* allocate space for it. */
4267   new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4268 
4269   TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4270 
4271   if (__kmp_storage_map) {
4272     __kmp_print_thread_storage_map(new_thr, new_gtid);
4273   }
4274 
4275   // add the reserve serialized team, initialized from the team's master thread
4276   {
4277     kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4278     KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4279     new_thr->th.th_serial_team = serial_team =
4280         (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4281 #if OMPT_SUPPORT
4282                                           ompt_data_none, // root parallel id
4283 #endif
4284 #if OMP_40_ENABLED
4285                                           proc_bind_default,
4286 #endif
4287                                           &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
4288   }
4289   KMP_ASSERT(serial_team);
4290   serial_team->t.t_serialized = 0; // AC: the team is created in reserve, not
4291   // for execution (it is unused for now).
4292   serial_team->t.t_threads[0] = new_thr;
4293   KF_TRACE(10,
4294            ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4295             new_thr));
4296 
4297   /* setup the thread structures */
4298   __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4299 
4300 #if USE_FAST_MEMORY
4301   __kmp_initialize_fast_memory(new_thr);
4302 #endif /* USE_FAST_MEMORY */
4303 
4304 #if KMP_USE_BGET
4305   KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4306   __kmp_initialize_bget(new_thr);
4307 #endif
4308 
4309   __kmp_init_random(new_thr); // Initialize random number generator
4310 
4311   /* Initialize these only once when thread is grabbed for a team allocation */
4312   KA_TRACE(20,
4313            ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4314             __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4315 
4316   int b;
4317   kmp_balign_t *balign = new_thr->th.th_bar;
4318   for (b = 0; b < bs_last_barrier; ++b) {
4319     balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4320     balign[b].bb.team = NULL;
4321     balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4322     balign[b].bb.use_oncore_barrier = 0;
4323   }
4324 
4325   new_thr->th.th_spin_here = FALSE;
4326   new_thr->th.th_next_waiting = 0;
4327 
4328 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4329   new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4330   new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4331   new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4332   new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4333 #endif
4334 
4335   TCW_4(new_thr->th.th_in_pool, FALSE);
4336   new_thr->th.th_active_in_pool = FALSE;
4337   TCW_4(new_thr->th.th_active, TRUE);
4338 
4339   /* adjust the global counters */
4340   __kmp_all_nth++;
4341   __kmp_nth++;
4342 
4343   root->r.r_cg_nthreads++;
4344 
4345   // If __kmp_adjust_gtid_mode is set, we use method #1 (sp search) for low
4346   // thread counts and method #2 (keyed API call) for higher thread counts.
4347   if (__kmp_adjust_gtid_mode) {
4348     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4349       if (TCR_4(__kmp_gtid_mode) != 2) {
4350         TCW_4(__kmp_gtid_mode, 2);
4351       }
4352     } else {
4353       if (TCR_4(__kmp_gtid_mode) != 1) {
4354         TCW_4(__kmp_gtid_mode, 1);
4355       }
4356     }
4357   }
4358 
4359 #ifdef KMP_ADJUST_BLOCKTIME
4360   /* Adjust blocktime back to zero if necessary       */
4361   /* Middle initialization might not have occurred yet */
4362   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4363     if (__kmp_nth > __kmp_avail_proc) {
4364       __kmp_zero_bt = TRUE;
4365     }
4366   }
4367 #endif /* KMP_ADJUST_BLOCKTIME */
4368 
4369   /* actually fork it and create the new worker thread */
4370   KF_TRACE(
4371       10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4372   __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4373   KF_TRACE(10,
4374            ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4375 
4376   KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4377                 new_gtid));
4378   KMP_MB();
4379   return new_thr;
4380 }
4381 
4382 /* Reinitialize team for reuse.
4383    The hot team code calls this routine at every fork barrier, so EPCC barrier
4384    tests are extremely sensitive to changes in it, esp. writes to the team
4385    struct, which cause a cache invalidation in all threads.
4386    IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4387 static void __kmp_reinitialize_team(kmp_team_t *team,
4388                                     kmp_internal_control_t *new_icvs,
4389                                     ident_t *loc) {
4390   KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4391                 team->t.t_threads[0], team));
4392   KMP_DEBUG_ASSERT(team && new_icvs);
4393   KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4394   KMP_CHECK_UPDATE(team->t.t_ident, loc);
4395 
4396   KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4397   // Copy ICVs to the master thread's implicit taskdata
4398   __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4399   copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4400 
4401   KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4402                 team->t.t_threads[0], team));
4403 }
4404 
4405 /* Initialize the team data structure.
4406    This assumes the t_threads and t_max_nproc are already set.
4407    Also, we don't touch the arguments */
4408 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4409                                   kmp_internal_control_t *new_icvs,
4410                                   ident_t *loc) {
4411   KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4412 
4413   /* verify */
4414   KMP_DEBUG_ASSERT(team);
4415   KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4416   KMP_DEBUG_ASSERT(team->t.t_threads);
4417   KMP_MB();
4418 
4419   team->t.t_master_tid = 0; /* not needed */
4420   /* team->t.t_master_bar;        not needed */
4421   team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4422   team->t.t_nproc = new_nproc;
4423 
4424   /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
4425   team->t.t_next_pool = NULL;
4426   /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4427    * up hot team */
4428 
4429   TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4430   team->t.t_invoke = NULL; /* not needed */
4431 
4432   // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
4433   team->t.t_sched.sched = new_icvs->sched.sched;
4434 
4435 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4436   team->t.t_fp_control_saved = FALSE; /* not needed */
4437   team->t.t_x87_fpu_control_word = 0; /* not needed */
4438   team->t.t_mxcsr = 0; /* not needed */
4439 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4440 
4441   team->t.t_construct = 0;
4442 
4443   team->t.t_ordered.dt.t_value = 0;
4444   team->t.t_master_active = FALSE;
4445 
4446   memset(&team->t.t_taskq, '\0', sizeof(kmp_taskq_t));
4447 
4448 #ifdef KMP_DEBUG
4449   team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4450 #endif
4451   team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4452 
4453   team->t.t_control_stack_top = NULL;
4454 
4455   __kmp_reinitialize_team(team, new_icvs, loc);
4456 
4457   KMP_MB();
4458   KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4459 }
4460 
4461 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4462 /* Sets full mask for thread and saves the old mask in old_mask, no changes to structures. */
4463 static void
4464 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4465   if (KMP_AFFINITY_CAPABLE()) {
4466     int status;
4467     if (old_mask != NULL) {
4468       status = __kmp_get_system_affinity(old_mask, TRUE);
4469       int error = errno;
4470       if (status != 0) {
4471         __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4472                     __kmp_msg_null);
4473       }
4474     }
4475     __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4476   }
4477 }
4478 #endif
4479 
4480 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4481 
4482 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4483 // It calculates the worker + master thread's partition based upon the parent
4484 // thread's partition, and binds each worker to a place in its partition.
4485 // The master thread's partition should already include its current binding.
4486 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4487   // Copy the master thread's place partition to the team struct
4488   kmp_info_t *master_th = team->t.t_threads[0];
4489   KMP_DEBUG_ASSERT(master_th != NULL);
4490   kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4491   int first_place = master_th->th.th_first_place;
4492   int last_place = master_th->th.th_last_place;
4493   int masters_place = master_th->th.th_current_place;
4494   team->t.t_first_place = first_place;
4495   team->t.t_last_place = last_place;
4496 
4497   KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4498                 "bound to place %d partition = [%d,%d]\n",
4499                 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4500                 team->t.t_id, masters_place, first_place, last_place));
4501 
4502   switch (proc_bind) {
4503 
4504   case proc_bind_default:
4505     // Serial teams might have the proc_bind policy set to proc_bind_default.
4506     // It doesn't matter, as we don't rebind the master thread for any policy.
4507     KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4508     break;
4509 
4510   case proc_bind_master: {
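    // With proc_bind(master), every worker inherits the master's partition
    // [first_place, last_place] and is bound to the master's own place.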
4511     int f;
4512     int n_th = team->t.t_nproc;
4513     for (f = 1; f < n_th; f++) {
4514       kmp_info_t *th = team->t.t_threads[f];
4515       KMP_DEBUG_ASSERT(th != NULL);
4516       th->th.th_first_place = first_place;
4517       th->th.th_last_place = last_place;
4518       th->th.th_new_place = masters_place;
4519 
4520       KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
4521                      "partition = [%d,%d]\n",
4522                      __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4523                      f, masters_place, first_place, last_place));
4524     }
4525   } break;
4526 
4527   case proc_bind_close: {
4528     int f;
4529     int n_th = team->t.t_nproc;
4530     int n_places;
4531     if (first_place <= last_place) {
4532       n_places = last_place - first_place + 1;
4533     } else {
4534       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4535     }
4536     if (n_th <= n_places) {
4537       int place = masters_place;
4538       for (f = 1; f < n_th; f++) {
4539         kmp_info_t *th = team->t.t_threads[f];
4540         KMP_DEBUG_ASSERT(th != NULL);
4541 
4542         if (place == last_place) {
4543           place = first_place;
4544         } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4545           place = 0;
4546         } else {
4547           place++;
4548         }
4549         th->th.th_first_place = first_place;
4550         th->th.th_last_place = last_place;
4551         th->th.th_new_place = place;
4552 
4553         KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4554                        "partition = [%d,%d]\n",
4555                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4556                        team->t.t_id, f, place, first_place, last_place));
4557       }
4558     } else {
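      // More threads than places: give each place S = n_th / n_places threads;
      // the rem = n_th % n_places leftover threads go to every 'gap'-th place,
      // so the extras are spread evenly across the partition.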
4559       int S, rem, gap, s_count;
4560       S = n_th / n_places;
4561       s_count = 0;
4562       rem = n_th - (S * n_places);
4563       gap = rem > 0 ? n_places / rem : n_places;
4564       int place = masters_place;
4565       int gap_ct = gap;
4566       for (f = 0; f < n_th; f++) {
4567         kmp_info_t *th = team->t.t_threads[f];
4568         KMP_DEBUG_ASSERT(th != NULL);
4569 
4570         th->th.th_first_place = first_place;
4571         th->th.th_last_place = last_place;
4572         th->th.th_new_place = place;
4573         s_count++;
4574 
4575         if ((s_count == S) && rem && (gap_ct == gap)) {
4576           // do nothing, add an extra thread to place on next iteration
4577         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4578           // we added an extra thread to this place; move to next place
4579           if (place == last_place) {
4580             place = first_place;
4581           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4582             place = 0;
4583           } else {
4584             place++;
4585           }
4586           s_count = 0;
4587           gap_ct = 1;
4588           rem--;
4589         } else if (s_count == S) { // place full; don't add extra
4590           if (place == last_place) {
4591             place = first_place;
4592           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4593             place = 0;
4594           } else {
4595             place++;
4596           }
4597           gap_ct++;
4598           s_count = 0;
4599         }
4600 
4601         KA_TRACE(100,
4602                  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4603                   "partition = [%d,%d]\n",
4604                   __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4605                   th->th.th_new_place, first_place, last_place));
4606       }
4607       KMP_DEBUG_ASSERT(place == masters_place);
4608     }
4609   } break;
4610 
4611   case proc_bind_spread: {
4612     int f;
4613     int n_th = team->t.t_nproc;
4614     int n_places;
4615     int thidx;
4616     if (first_place <= last_place) {
4617       n_places = last_place - first_place + 1;
4618     } else {
4619       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4620     }
4621     if (n_th <= n_places) {
4622       int place = -1;
4623 
4624       if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4625         int S = n_places / n_th;
4626         int s_count, rem, gap, gap_ct;
4627 
4628         place = masters_place;
4629         rem = n_places - n_th * S;
4630         gap = rem ? n_th / rem : 1;
4631         gap_ct = gap;
4632         thidx = n_th;
4633         if (update_master_only == 1)
4634           thidx = 1;
4635         for (f = 0; f < thidx; f++) {
4636           kmp_info_t *th = team->t.t_threads[f];
4637           KMP_DEBUG_ASSERT(th != NULL);
4638 
4639           th->th.th_first_place = place;
4640           th->th.th_new_place = place;
4641           s_count = 1;
4642           while (s_count < S) {
4643             if (place == last_place) {
4644               place = first_place;
4645             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4646               place = 0;
4647             } else {
4648               place++;
4649             }
4650             s_count++;
4651           }
4652           if (rem && (gap_ct == gap)) {
4653             if (place == last_place) {
4654               place = first_place;
4655             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4656               place = 0;
4657             } else {
4658               place++;
4659             }
4660             rem--;
4661             gap_ct = 0;
4662           }
4663           th->th.th_last_place = place;
4664           gap_ct++;
4665 
4666           if (place == last_place) {
4667             place = first_place;
4668           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4669             place = 0;
4670           } else {
4671             place++;
4672           }
4673 
4674           KA_TRACE(100,
4675                    ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4676                     "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4677                     __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4678                     f, th->th.th_new_place, th->th.th_first_place,
4679                     th->th.th_last_place, __kmp_affinity_num_masks));
4680         }
4681       } else {
4682         /* With a uniform space of available computation places, we can create
4683            T partitions of roughly P/T places each and put each thread into the
4684            first place of its partition. */
4685         double current = static_cast<double>(masters_place);
4686         double spacing =
4687             (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
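        // Each thread f receives the contiguous sub-partition
        // [ (int)current, (int)(current + spacing) - 1 ] and is bound to its
        // first place; 'current' advances by 'spacing' for every thread.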
4688         int first, last;
4689         kmp_info_t *th;
4690 
4691         thidx = n_th + 1;
4692         if (update_master_only == 1)
4693           thidx = 1;
4694         for (f = 0; f < thidx; f++) {
4695           first = static_cast<int>(current);
4696           last = static_cast<int>(current + spacing) - 1;
4697           KMP_DEBUG_ASSERT(last >= first);
4698           if (first >= n_places) {
4699             if (masters_place) {
4700               first -= n_places;
4701               last -= n_places;
4702               if (first == (masters_place + 1)) {
4703                 KMP_DEBUG_ASSERT(f == n_th);
4704                 first--;
4705               }
4706               if (last == masters_place) {
4707                 KMP_DEBUG_ASSERT(f == (n_th - 1));
4708                 last--;
4709               }
4710             } else {
4711               KMP_DEBUG_ASSERT(f == n_th);
4712               first = 0;
4713               last = 0;
4714             }
4715           }
4716           if (last >= n_places) {
4717             last = (n_places - 1);
4718           }
4719           place = first;
4720           current += spacing;
4721           if (f < n_th) {
4722             KMP_DEBUG_ASSERT(0 <= first);
4723             KMP_DEBUG_ASSERT(n_places > first);
4724             KMP_DEBUG_ASSERT(0 <= last);
4725             KMP_DEBUG_ASSERT(n_places > last);
4726             KMP_DEBUG_ASSERT(last_place >= first_place);
4727             th = team->t.t_threads[f];
4728             KMP_DEBUG_ASSERT(th);
4729             th->th.th_first_place = first;
4730             th->th.th_new_place = place;
4731             th->th.th_last_place = last;
4732 
4733             KA_TRACE(100,
4734                      ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4735                       "partition = [%d,%d], spacing = %.4f\n",
4736                       __kmp_gtid_from_thread(team->t.t_threads[f]),
4737                       team->t.t_id, f, th->th.th_new_place,
4738                       th->th.th_first_place, th->th.th_last_place, spacing));
4739           }
4740         }
4741       }
4742       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4743     } else {
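      // More threads than places: use the same S/rem/gap distribution as in the
      // proc_bind_close case above, but narrow each thread's partition to the
      // single place it is bound to (first == last == new place).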
4744       int S, rem, gap, s_count;
4745       S = n_th / n_places;
4746       s_count = 0;
4747       rem = n_th - (S * n_places);
4748       gap = rem > 0 ? n_places / rem : n_places;
4749       int place = masters_place;
4750       int gap_ct = gap;
4751       thidx = n_th;
4752       if (update_master_only == 1)
4753         thidx = 1;
4754       for (f = 0; f < thidx; f++) {
4755         kmp_info_t *th = team->t.t_threads[f];
4756         KMP_DEBUG_ASSERT(th != NULL);
4757 
4758         th->th.th_first_place = place;
4759         th->th.th_last_place = place;
4760         th->th.th_new_place = place;
4761         s_count++;
4762 
4763         if ((s_count == S) && rem && (gap_ct == gap)) {
4764           // do nothing, add an extra thread to place on next iteration
4765         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4766           // we added an extra thread to this place; move on to next place
4767           if (place == last_place) {
4768             place = first_place;
4769           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4770             place = 0;
4771           } else {
4772             place++;
4773           }
4774           s_count = 0;
4775           gap_ct = 1;
4776           rem--;
4777         } else if (s_count == S) { // place is full; don't add extra thread
4778           if (place == last_place) {
4779             place = first_place;
4780           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4781             place = 0;
4782           } else {
4783             place++;
4784           }
4785           gap_ct++;
4786           s_count = 0;
4787         }
4788 
4789         KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4790                        "partition = [%d,%d]\n",
4791                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4792                        team->t.t_id, f, th->th.th_new_place,
4793                        th->th.th_first_place, th->th.th_last_place));
4794       }
4795       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4796     }
4797   } break;
4798 
4799   default:
4800     break;
4801   }
4802 
4803   KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4804 }
4805 
4806 #endif /* OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED */
4807 
4808 /* allocate a new team data structure to use.  take one off of the free pool if
4809    available */
4810 kmp_team_t *
4811 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4812 #if OMPT_SUPPORT
4813                     ompt_data_t ompt_parallel_data,
4814 #endif
4815 #if OMP_40_ENABLED
4816                     kmp_proc_bind_t new_proc_bind,
4817 #endif
4818                     kmp_internal_control_t *new_icvs,
4819                     int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4820   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4821   int f;
4822   kmp_team_t *team;
4823   int use_hot_team = !root->r.r_active;
4824   int level = 0;
4825 
4826   KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4827   KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4828   KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4829   KMP_MB();
4830 
4831 #if KMP_NESTED_HOT_TEAMS
4832   kmp_hot_team_ptr_t *hot_teams;
4833   if (master) {
4834     team = master->th.th_team;
4835     level = team->t.t_active_level;
4836     if (master->th.th_teams_microtask) { // in teams construct?
4837       if (master->th.th_teams_size.nteams > 1 &&
4838           ( // #teams > 1
4839               team->t.t_pkfn ==
4840                   (microtask_t)__kmp_teams_master || // inner fork of the teams
4841               master->th.th_teams_level <
4842                   team->t.t_level)) { // or nested parallel inside the teams
4843         ++level; // not increment if #teams==1, or for outer fork of the teams;
4844         // increment otherwise
4845       }
4846     }
4847     hot_teams = master->th.th_hot_teams;
4848     if (level < __kmp_hot_teams_max_level && hot_teams &&
4849         hot_teams[level]
4850             .hot_team) { // hot team has already been allocated for given level
4851       use_hot_team = 1;
4852     } else {
4853       use_hot_team = 0;
4854     }
4855   }
4856 #endif
4857   // Optimization to use a "hot" team
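  // The hot team is the persistent team kept around by the root (or, with
  // KMP_NESTED_HOT_TEAMS, per nesting level); reusing it avoids reallocating
  // the team structure and its threads on every fork.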
4858   if (use_hot_team && new_nproc > 1) {
4859     KMP_DEBUG_ASSERT(new_nproc == max_nproc);
4860 #if KMP_NESTED_HOT_TEAMS
4861     team = hot_teams[level].hot_team;
4862 #else
4863     team = root->r.r_hot_team;
4864 #endif
4865 #if KMP_DEBUG
4866     if (__kmp_tasking_mode != tskm_immediate_exec) {
4867       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
4868                     "task_team[1] = %p before reinit\n",
4869                     team->t.t_task_team[0], team->t.t_task_team[1]));
4870     }
4871 #endif
4872 
4873     // Has the number of threads changed?
4874     /* Let's assume the most common case is that the number of threads is
4875        unchanged, and put that case first. */
4876     if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
4877       KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
4878       // This case can mean that omp_set_num_threads() was called and the hot
4879       // team size was already reduced, so we check the special flag
4880       if (team->t.t_size_changed == -1) {
4881         team->t.t_size_changed = 1;
4882       } else {
4883         KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
4884       }
4885 
4886       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4887       kmp_r_sched_t new_sched = new_icvs->sched;
4888       // set master's schedule as new run-time schedule
4889       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
4890 
4891       __kmp_reinitialize_team(team, new_icvs,
4892                               root->r.r_uber_thread->th.th_ident);
4893 
4894       KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
4895                     team->t.t_threads[0], team));
4896       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
4897 
4898 #if OMP_40_ENABLED
4899 #if KMP_AFFINITY_SUPPORTED
4900       if ((team->t.t_size_changed == 0) &&
4901           (team->t.t_proc_bind == new_proc_bind)) {
4902         if (new_proc_bind == proc_bind_spread) {
4903           __kmp_partition_places(
4904               team, 1); // add flag to update only master for spread
4905         }
4906         KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
4907                        "proc_bind = %d, partition = [%d,%d]\n",
4908                        team->t.t_id, new_proc_bind, team->t.t_first_place,
4909                        team->t.t_last_place));
4910       } else {
4911         KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4912         __kmp_partition_places(team);
4913       }
4914 #else
4915       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4916 #endif /* KMP_AFFINITY_SUPPORTED */
4917 #endif /* OMP_40_ENABLED */
4918     } else if (team->t.t_nproc > new_nproc) {
4919       KA_TRACE(20,
4920                ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
4921                 new_nproc));
4922 
4923       team->t.t_size_changed = 1;
4924 #if KMP_NESTED_HOT_TEAMS
4925       if (__kmp_hot_teams_mode == 0) {
4926         // AC: in this mode the saved thread count should match the team's value;
4927         // it can be bigger in mode 1, when the hot team has threads in reserve.
4928         KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
4929         hot_teams[level].hot_team_nth = new_nproc;
4930 #endif // KMP_NESTED_HOT_TEAMS
4931         /* release the extra threads we don't need any more */
4932         for (f = new_nproc; f < team->t.t_nproc; f++) {
4933           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
4934           if (__kmp_tasking_mode != tskm_immediate_exec) {
4935             // When decreasing team size, threads no longer in the team should
4936             // unref task team.
4937             team->t.t_threads[f]->th.th_task_team = NULL;
4938           }
4939           __kmp_free_thread(team->t.t_threads[f]);
4940           team->t.t_threads[f] = NULL;
4941         }
4942 #if KMP_NESTED_HOT_TEAMS
4943       } // (__kmp_hot_teams_mode == 0)
4944       else {
4945         // When keeping extra threads in team, switch threads to wait on own
4946         // b_go flag
4947         for (f = new_nproc; f < team->t.t_nproc; ++f) {
4948           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
4949           kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
4950           for (int b = 0; b < bs_last_barrier; ++b) {
4951             if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
4952               balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
4953             }
4954             KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
4955           }
4956         }
4957       }
4958 #endif // KMP_NESTED_HOT_TEAMS
4959       team->t.t_nproc = new_nproc;
4960       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4961       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
4962       __kmp_reinitialize_team(team, new_icvs,
4963                               root->r.r_uber_thread->th.th_ident);
4964 
4965       /* update the remaining threads */
4966       for (f = 0; f < new_nproc; ++f) {
4967         team->t.t_threads[f]->th.th_team_nproc = new_nproc;
4968       }
4969       // restore the current task state of the master thread: should be the
4970       // implicit task
4971       KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
4972                     team->t.t_threads[0], team));
4973 
4974       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
4975 
4976 #ifdef KMP_DEBUG
4977       for (f = 0; f < team->t.t_nproc; f++) {
4978         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
4979                          team->t.t_threads[f]->th.th_team_nproc ==
4980                              team->t.t_nproc);
4981       }
4982 #endif
4983 
4984 #if OMP_40_ENABLED
4985       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4986 #if KMP_AFFINITY_SUPPORTED
4987       __kmp_partition_places(team);
4988 #endif
4989 #endif
4990     } else { // team->t.t_nproc < new_nproc
4991 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4992       kmp_affin_mask_t *old_mask;
4993       if (KMP_AFFINITY_CAPABLE()) {
4994         KMP_CPU_ALLOC(old_mask);
4995       }
4996 #endif
4997 
4998       KA_TRACE(20,
4999                ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5000                 new_nproc));
5001 
5002       team->t.t_size_changed = 1;
5003 
5004 #if KMP_NESTED_HOT_TEAMS
5005       int avail_threads = hot_teams[level].hot_team_nth;
5006       if (new_nproc < avail_threads)
5007         avail_threads = new_nproc;
5008       kmp_info_t **other_threads = team->t.t_threads;
5009       for (f = team->t.t_nproc; f < avail_threads; ++f) {
5010         // Adjust barrier data of reserved threads (if any) of the team
5011         // Other data will be set in __kmp_initialize_info() below.
5012         int b;
5013         kmp_balign_t *balign = other_threads[f]->th.th_bar;
5014         for (b = 0; b < bs_last_barrier; ++b) {
5015           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5016           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5017 #if USE_DEBUGGER
5018           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5019 #endif
5020         }
5021       }
5022       if (hot_teams[level].hot_team_nth >= new_nproc) {
5023         // We have all needed threads in reserve, no need to allocate any.
5024         // This is only possible in mode 1; mode 0 cannot have reserved threads.
5025         KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5026         team->t.t_nproc = new_nproc; // just get reserved threads involved
5027       } else {
5028         // we may have some threads in reserve, but not enough
5029         team->t.t_nproc =
5030             hot_teams[level]
5031                 .hot_team_nth; // get reserved threads involved if any
5032         hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5033 #endif // KMP_NESTED_HOT_TEAMS
5034         if (team->t.t_max_nproc < new_nproc) {
5035           /* reallocate larger arrays */
5036           __kmp_reallocate_team_arrays(team, new_nproc);
5037           __kmp_reinitialize_team(team, new_icvs, NULL);
5038         }
5039 
5040 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5041         /* Temporarily set full mask for master thread before creation of
5042            workers. The reason is that workers inherit the affinity from master,
5043            so if a lot of workers are created quickly on a single core, they
5044            don't get a chance to set their own affinity for a long time. */
5045         __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5046 #endif
5047 
5048         /* allocate new threads for the hot team */
5049         for (f = team->t.t_nproc; f < new_nproc; f++) {
5050           kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5051           KMP_DEBUG_ASSERT(new_worker);
5052           team->t.t_threads[f] = new_worker;
5053 
5054           KA_TRACE(20,
5055                    ("__kmp_allocate_team: team %d init T#%d arrived: "
5056                     "join=%llu, plain=%llu\n",
5057                     team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5058                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5059                     team->t.t_bar[bs_plain_barrier].b_arrived));
5060 
5061           { // Initialize barrier data for new threads.
5062             int b;
5063             kmp_balign_t *balign = new_worker->th.th_bar;
5064             for (b = 0; b < bs_last_barrier; ++b) {
5065               balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5066               KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5067                                KMP_BARRIER_PARENT_FLAG);
5068 #if USE_DEBUGGER
5069               balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5070 #endif
5071             }
5072           }
5073         }
5074 
5075 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5076         if (KMP_AFFINITY_CAPABLE()) {
5077           /* Restore initial master thread's affinity mask */
5078           __kmp_set_system_affinity(old_mask, TRUE);
5079           KMP_CPU_FREE(old_mask);
5080         }
5081 #endif
5082 #if KMP_NESTED_HOT_TEAMS
5083       } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5084 #endif // KMP_NESTED_HOT_TEAMS
5085       /* make sure everyone is synchronized */
5086       int old_nproc = team->t.t_nproc; // save old value and use to update only
5087       // new threads below
5088       __kmp_initialize_team(team, new_nproc, new_icvs,
5089                             root->r.r_uber_thread->th.th_ident);
5090 
5091       /* reinitialize the threads */
5092       KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5093       for (f = 0; f < team->t.t_nproc; ++f)
5094         __kmp_initialize_info(team->t.t_threads[f], team, f,
5095                               __kmp_gtid_from_tid(f, team));
5096       if (level) { // set th_task_state for new threads in nested hot team
5097         // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5098         // only need to set the th_task_state for the new threads. th_task_state
5099         // for master thread will not be accurate until after this in
5100         // __kmp_fork_call(), so we look to the master's memo_stack to get the
5101         // correct value.
5102         for (f = old_nproc; f < team->t.t_nproc; ++f)
5103           team->t.t_threads[f]->th.th_task_state =
5104               team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5105       } else { // set th_task_state for new threads in non-nested hot team
5106         int old_state =
5107             team->t.t_threads[0]->th.th_task_state; // copy master's state
5108         for (f = old_nproc; f < team->t.t_nproc; ++f)
5109           team->t.t_threads[f]->th.th_task_state = old_state;
5110       }
5111 
5112 #ifdef KMP_DEBUG
5113       for (f = 0; f < team->t.t_nproc; ++f) {
5114         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5115                          team->t.t_threads[f]->th.th_team_nproc ==
5116                              team->t.t_nproc);
5117       }
5118 #endif
5119 
5120 #if OMP_40_ENABLED
5121       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5122 #if KMP_AFFINITY_SUPPORTED
5123       __kmp_partition_places(team);
5124 #endif
5125 #endif
5126     } // Check changes in number of threads
5127 
5128 #if OMP_40_ENABLED
5129     kmp_info_t *master = team->t.t_threads[0];
5130     if (master->th.th_teams_microtask) {
5131       for (f = 1; f < new_nproc; ++f) {
5132         // propagate teams construct specific info to workers
5133         kmp_info_t *thr = team->t.t_threads[f];
5134         thr->th.th_teams_microtask = master->th.th_teams_microtask;
5135         thr->th.th_teams_level = master->th.th_teams_level;
5136         thr->th.th_teams_size = master->th.th_teams_size;
5137       }
5138     }
5139 #endif /* OMP_40_ENABLED */
5140 #if KMP_NESTED_HOT_TEAMS
5141     if (level) {
5142       // Sync barrier state for nested hot teams, not needed for outermost hot
5143       // team.
5144       for (f = 1; f < new_nproc; ++f) {
5145         kmp_info_t *thr = team->t.t_threads[f];
5146         int b;
5147         kmp_balign_t *balign = thr->th.th_bar;
5148         for (b = 0; b < bs_last_barrier; ++b) {
5149           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5150           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5151 #if USE_DEBUGGER
5152           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5153 #endif
5154         }
5155       }
5156     }
5157 #endif // KMP_NESTED_HOT_TEAMS
5158 
5159     /* reallocate space for arguments if necessary */
5160     __kmp_alloc_argv_entries(argc, team, TRUE);
5161     KMP_CHECK_UPDATE(team->t.t_argc, argc);
5162     // The hot team re-uses the previous task team,
5163     // if untouched during the previous release->gather phase.
5164 
5165     KF_TRACE(10, (" hot_team = %p\n", team));
5166 
5167 #if KMP_DEBUG
5168     if (__kmp_tasking_mode != tskm_immediate_exec) {
5169       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5170                     "task_team[1] = %p after reinit\n",
5171                     team->t.t_task_team[0], team->t.t_task_team[1]));
5172     }
5173 #endif
5174 
5175 #if OMPT_SUPPORT
5176     __ompt_team_assign_id(team, ompt_parallel_data);
5177 #endif
5178 
5179     KMP_MB();
5180 
5181     return team;
5182   }
5183 
5184   /* next, let's try to take one from the team pool */
5185   KMP_MB();
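  /* The team pool is a singly linked list of freed teams chained through
     t_next_pool; take the first one whose t_max_nproc is big enough, reaping
     any that are too small along the way. */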
5186   for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5187     /* TODO: consider resizing undersized teams instead of reaping them, now
5188        that we have a resizing mechanism */
5189     if (team->t.t_max_nproc >= max_nproc) {
5190       /* take this team from the team pool */
5191       __kmp_team_pool = team->t.t_next_pool;
5192 
5193       /* setup the team for fresh use */
5194       __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5195 
5196       KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5197                     "task_team[1] %p to NULL\n",
5198                     &team->t.t_task_team[0], &team->t.t_task_team[1]));
5199       team->t.t_task_team[0] = NULL;
5200       team->t.t_task_team[1] = NULL;
5201 
5202       /* reallocate space for arguments if necessary */
5203       __kmp_alloc_argv_entries(argc, team, TRUE);
5204       KMP_CHECK_UPDATE(team->t.t_argc, argc);
5205 
5206       KA_TRACE(
5207           20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5208                team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5209       { // Initialize barrier data.
5210         int b;
5211         for (b = 0; b < bs_last_barrier; ++b) {
5212           team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5213 #if USE_DEBUGGER
5214           team->t.t_bar[b].b_master_arrived = 0;
5215           team->t.t_bar[b].b_team_arrived = 0;
5216 #endif
5217         }
5218       }
5219 
5220 #if OMP_40_ENABLED
5221       team->t.t_proc_bind = new_proc_bind;
5222 #endif
5223 
5224       KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5225                     team->t.t_id));
5226 
5227 #if OMPT_SUPPORT
5228       __ompt_team_assign_id(team, ompt_parallel_data);
5229 #endif
5230 
5231       KMP_MB();
5232 
5233       return team;
5234     }
5235 
5236     /* reap team if it is too small, then loop back and check the next one */
5237     // Not sure if this is wise, but it will be redone during the hot-teams
5238     // rewrite.
5239     /* TODO: Use technique to find the right size hot-team, don't reap them */
5240     team = __kmp_reap_team(team);
5241     __kmp_team_pool = team;
5242   }
5243 
5244   /* nothing available in the pool, no matter, make a new team! */
5245   KMP_MB();
5246   team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5247 
5248   /* and set it up */
5249   team->t.t_max_nproc = max_nproc;
5250   /* NOTE well: for some reason, allocating one big buffer and dividing it up
5251      seems to really hurt performance a lot on the P4, so let's not use this. */
5252   __kmp_allocate_team_arrays(team, max_nproc);
5253 
5254   KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5255   __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5256 
5257   KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5258                 "%p to NULL\n",
5259                 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5260   team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5261   // memory, no need to duplicate
5262   team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5263   // memory, no need to duplicate
5264 
5265   if (__kmp_storage_map) {
5266     __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5267   }
5268 
5269   /* allocate space for arguments */
5270   __kmp_alloc_argv_entries(argc, team, FALSE);
5271   team->t.t_argc = argc;
5272 
5273   KA_TRACE(20,
5274            ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5275             team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5276   { // Initialize barrier data.
5277     int b;
5278     for (b = 0; b < bs_last_barrier; ++b) {
5279       team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5280 #if USE_DEBUGGER
5281       team->t.t_bar[b].b_master_arrived = 0;
5282       team->t.t_bar[b].b_team_arrived = 0;
5283 #endif
5284     }
5285   }
5286 
5287 #if OMP_40_ENABLED
5288   team->t.t_proc_bind = new_proc_bind;
5289 #endif
5290 
5291 #if OMPT_SUPPORT
5292   __ompt_team_assign_id(team, ompt_parallel_data);
5293   team->t.ompt_serialized_team_info = NULL;
5294 #endif
5295 
5296   KMP_MB();
5297 
5298   KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5299                 team->t.t_id));
5300 
5301   return team;
5302 }
5303 
5304 /* TODO implement hot-teams at all levels */
5305 /* TODO implement lazy thread release on demand (disband request) */
5306 
5307 /* free the team.  return it to the team pool.  release all the threads
5308  * associated with it */
5309 void __kmp_free_team(kmp_root_t *root,
5310                      kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5311   int f;
5312   KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5313                 team->t.t_id));
5314 
5315   /* verify state */
5316   KMP_DEBUG_ASSERT(root);
5317   KMP_DEBUG_ASSERT(team);
5318   KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5319   KMP_DEBUG_ASSERT(team->t.t_threads);
5320 
5321   int use_hot_team = team == root->r.r_hot_team;
5322 #if KMP_NESTED_HOT_TEAMS
5323   int level;
5324   kmp_hot_team_ptr_t *hot_teams;
5325   if (master) {
5326     level = team->t.t_active_level - 1;
5327     if (master->th.th_teams_microtask) { // in teams construct?
5328       if (master->th.th_teams_size.nteams > 1) {
5329         ++level; // level was not increased in teams construct for
5330         // team_of_masters
5331       }
5332       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5333           master->th.th_teams_level == team->t.t_level) {
5334         ++level; // level was not increased in teams construct for
5335         // team_of_workers before the parallel
5336       } // team->t.t_level will be increased inside parallel
5337     }
5338     hot_teams = master->th.th_hot_teams;
5339     if (level < __kmp_hot_teams_max_level) {
5340       KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5341       use_hot_team = 1;
5342     }
5343   }
5344 #endif // KMP_NESTED_HOT_TEAMS
5345 
5346   /* team is done working */
5347   TCW_SYNC_PTR(team->t.t_pkfn,
5348                NULL); // Important for Debugging Support Library.
5349   team->t.t_copyin_counter = 0; // init counter for possible reuse
5350   // Do not reset pointer to parent team to NULL for hot teams.
5351 
5352   /* if we are non-hot team, release our threads */
5353   if (!use_hot_team) {
5354     if (__kmp_tasking_mode != tskm_immediate_exec) {
5355       // Wait for threads to reach reapable state
5356       for (f = 1; f < team->t.t_nproc; ++f) {
5357         KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5358         kmp_info_t *th = team->t.t_threads[f];
5359         volatile kmp_uint32 *state = &th->th.th_reap_state;
5360         while (*state != KMP_SAFE_TO_REAP) {
5361 #if KMP_OS_WINDOWS
5362           // On Windows a thread can be killed at any time, check this
5363           DWORD ecode;
5364           if (!__kmp_is_thread_alive(th, &ecode)) {
5365             *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5366             break;
5367           }
5368 #endif
5369           // first check if thread is sleeping
5370           kmp_flag_64 fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5371           if (fl.is_sleeping())
5372             fl.resume(__kmp_gtid_from_thread(th));
5373           KMP_CPU_PAUSE();
5374         }
5375       }
5376 
5377       // Delete task teams
5378       int tt_idx;
5379       for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5380         kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5381         if (task_team != NULL) {
5382           for (f = 0; f < team->t.t_nproc;
5383                ++f) { // Have all threads unref task teams
5384             team->t.t_threads[f]->th.th_task_team = NULL;
5385           }
5386           KA_TRACE(
5387               20,
5388               ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5389                __kmp_get_gtid(), task_team, team->t.t_id));
5390 #if KMP_NESTED_HOT_TEAMS
5391           __kmp_free_task_team(master, task_team);
5392 #endif
5393           team->t.t_task_team[tt_idx] = NULL;
5394         }
5395       }
5396     }
5397 
5398     // Reset pointer to parent team only for non-hot teams.
5399     team->t.t_parent = NULL;
5400     team->t.t_level = 0;
5401     team->t.t_active_level = 0;
5402 
5403     /* free the worker threads */
5404     for (f = 1; f < team->t.t_nproc; ++f) {
5405       KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5406       __kmp_free_thread(team->t.t_threads[f]);
5407       team->t.t_threads[f] = NULL;
5408     }
5409 
5410     /* put the team back in the team pool */
5411     /* TODO limit size of team pool, call reap_team if pool too large */
5412     team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5413     __kmp_team_pool = (volatile kmp_team_t *)team;
5414   }
5415 
5416   KMP_MB();
5417 }
5418 
5419 /* reap the team.  destroy it, reclaim all its resources and free its memory */
5420 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5421   kmp_team_t *next_pool = team->t.t_next_pool;
5422 
5423   KMP_DEBUG_ASSERT(team);
5424   KMP_DEBUG_ASSERT(team->t.t_dispatch);
5425   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5426   KMP_DEBUG_ASSERT(team->t.t_threads);
5427   KMP_DEBUG_ASSERT(team->t.t_argv);
5428 
5429   /* TODO clean the threads that are a part of this? */
5430 
5431   /* free stuff */
5432   __kmp_free_team_arrays(team);
5433   if (team->t.t_argv != &team->t.t_inline_argv[0])
5434     __kmp_free((void *)team->t.t_argv);
5435   __kmp_free(team);
5436 
5437   KMP_MB();
5438   return next_pool;
5439 }
5440 
5441 // Free the thread.  Don't reap it, just place it on the pool of available
5442 // threads.
5443 //
5444 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5445 // binding for the affinity mechanism to be useful.
5446 //
5447 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5448 // However, we want to avoid a potential performance problem by always
5449 // scanning through the list to find the correct point at which to insert
5450 // the thread (potential N**2 behavior).  To do this we keep track of the
5451 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5452 // With single-level parallelism, threads will always be added to the tail
5453 // of the list, kept track of by __kmp_thread_pool_insert_pt.  With nested
5454 // parallelism, all bets are off and we may need to scan through the entire
5455 // free list.
5456 //
5457 // This change also has a potentially large performance benefit, for some
5458 // applications.  Previously, as threads were freed from the hot team, they
5459 // would be placed back on the free list in inverse order.  If the hot team
5460 // grew back to its original size, then the freed threads would be placed
5461 // back on the hot team in reverse order.  This could cause bad cache
5462 // locality problems in programs where the size of the hot team regularly
5463 // grew and shrunk.
5464 //
5465 // Now, for single-level parallelism, the OMP tid is always == gtid.
5466 void __kmp_free_thread(kmp_info_t *this_th) {
5467   int gtid;
5468   kmp_info_t **scan;
5469   kmp_root_t *root = this_th->th.th_root;
5470 
5471   KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5472                 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5473 
5474   KMP_DEBUG_ASSERT(this_th);
5475 
5476   // When moving the thread to the pool, switch it to wait on its own b_go
5477   // flag and clear its team pointer (NULL team).
5478   int b;
5479   kmp_balign_t *balign = this_th->th.th_bar;
5480   for (b = 0; b < bs_last_barrier; ++b) {
5481     if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5482       balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5483     balign[b].bb.team = NULL;
5484     balign[b].bb.leaf_kids = 0;
5485   }
5486   this_th->th.th_task_state = 0;
5487   this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5488 
5489   /* put thread back on the free pool */
5490   TCW_PTR(this_th->th.th_team, NULL);
5491   TCW_PTR(this_th->th.th_root, NULL);
5492   TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5493 
5494   /* If the implicit task assigned to this thread can be used by other threads,
5495    * then multiple threads can share the data and try to free the task in
5496    * __kmp_reap_thread at exit. This duplicate use of the task data can happen
5497    * with higher probability when the hot team is disabled, but can occur even
5498    * when the hot team is enabled. */
5499   __kmp_free_implicit_task(this_th);
5500   this_th->th.th_current_task = NULL;
5501 
5502   // If the __kmp_thread_pool_insert_pt is already past the new insert
5503   // point, then we need to re-scan the entire list.
5504   gtid = this_th->th.th_info.ds.ds_gtid;
5505   if (__kmp_thread_pool_insert_pt != NULL) {
5506     KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5507     if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5508       __kmp_thread_pool_insert_pt = NULL;
5509     }
5510   }
5511 
5512   // Scan down the list to find the place to insert the thread.
5513   // scan is the address of a link in the list, possibly the address of
5514   // __kmp_thread_pool itself.
5515   //
5516   // In the absence of nested parallelism, the for loop will have 0 iterations.
5517   if (__kmp_thread_pool_insert_pt != NULL) {
5518     scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5519   } else {
5520     scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5521   }
5522   for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5523        scan = &((*scan)->th.th_next_pool))
5524     ;
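  // *scan now points either to the first pool entry with a larger gtid or to
  // the NULL list tail; inserting here keeps the pool sorted by gtid, and the
  // insertion point is cached for the next call.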
5525 
5526   // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5527   // to its address.
5528   TCW_PTR(this_th->th.th_next_pool, *scan);
5529   __kmp_thread_pool_insert_pt = *scan = this_th;
5530   KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5531                    (this_th->th.th_info.ds.ds_gtid <
5532                     this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5533   TCW_4(this_th->th.th_in_pool, TRUE);
5534   __kmp_thread_pool_nth++;
5535 
5536   TCW_4(__kmp_nth, __kmp_nth - 1);
5537   root->r.r_cg_nthreads--;
5538 
5539 #ifdef KMP_ADJUST_BLOCKTIME
5540   /* Adjust blocktime back to user setting or default if necessary */
5541   /* Middle initialization might never have occurred                */
5542   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5543     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5544     if (__kmp_nth <= __kmp_avail_proc) {
5545       __kmp_zero_bt = FALSE;
5546     }
5547   }
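  // (In other words: oversubscription -- __kmp_nth > __kmp_avail_proc --
  // forces the zero-blocktime override (__kmp_zero_bt) elsewhere; once the
  // thread count drops back to at most the number of available processors,
  // that override is lifted here.)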
5548 #endif /* KMP_ADJUST_BLOCKTIME */
5549 
5550   KMP_MB();
5551 }
5552 
5553 /* ------------------------------------------------------------------------ */
5554 
5555 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5556   int gtid = this_thr->th.th_info.ds.ds_gtid;
5557   /*    void                 *stack_data;*/
5558   kmp_team_t *(*volatile pteam);
5559 
5560   KMP_MB();
5561   KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5562 
5563   if (__kmp_env_consistency_check) {
5564     this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5565   }
5566 
5567 #if OMPT_SUPPORT
5568   ompt_data_t *thread_data;
5569   if (ompt_enabled.enabled) {
5570     thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5571     thread_data->ptr = NULL;
5572 
5573     this_thr->th.ompt_thread_info.state = omp_state_overhead;
5574     this_thr->th.ompt_thread_info.wait_id = 0;
5575     this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5576     if (ompt_enabled.ompt_callback_thread_begin) {
5577       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5578           ompt_thread_worker, thread_data);
5579     }
5580   }
5581 #endif
5582 
5583 #if OMPT_SUPPORT
5584   if (ompt_enabled.enabled) {
5585     this_thr->th.ompt_thread_info.state = omp_state_idle;
5586   }
5587 #endif
5588   /* This is the place where threads wait for work */
5589   while (!TCR_4(__kmp_global.g.g_done)) {
5590     KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5591     KMP_MB();
5592 
5593     /* wait for work to do */
5594     KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5595 
5596     /* No tid yet since not part of a team */
5597     __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5598 
5599 #if OMPT_SUPPORT
5600     if (ompt_enabled.enabled) {
5601       this_thr->th.ompt_thread_info.state = omp_state_overhead;
5602     }
5603 #endif
5604 
5605     pteam = (kmp_team_t * (*))(&this_thr->th.th_team);
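    // Note: pteam is the address of this thread's th_team pointer. The team
    // pointer itself is typically written by the master that is forking the
    // team, so it is re-read below through TCR_SYNC_PTR() rather than cached
    // locally.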
5606 
5607     /* have we been allocated? */
5608     if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5609       /* we were just woken up, so run our new task */
5610       if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5611         int rc;
5612         KA_TRACE(20,
5613                  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5614                   gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5615                   (*pteam)->t.t_pkfn));
5616 
5617         updateHWFPControl(*pteam);
5618 
5619 #if OMPT_SUPPORT
5620         if (ompt_enabled.enabled) {
5621           this_thr->th.ompt_thread_info.state = omp_state_work_parallel;
5622         }
5623 #endif
5624 
5625         {
5626           KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
5627           KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
5628           rc = (*pteam)->t.t_invoke(gtid);
5629         }
5630         KMP_ASSERT(rc);
5631 
5632         KMP_MB();
5633         KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5634                       gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5635                       (*pteam)->t.t_pkfn));
5636       }
5637 #if OMPT_SUPPORT
5638       if (ompt_enabled.enabled) {
5639         /* no frame set while outside task */
5640         __ompt_get_task_info_object(0)->frame.exit_frame = NULL;
5641 
5642         this_thr->th.ompt_thread_info.state = omp_state_overhead;
5643       }
5644 #endif
5645       /* join barrier after parallel region */
5646       __kmp_join_barrier(gtid);
5647     }
5648   }
5649   TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5650 
5651 #if OMPT_SUPPORT
5652   if (ompt_enabled.ompt_callback_thread_end) {
5653     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5654   }
5655 #endif
5656 
5657   this_thr->th.th_task_team = NULL;
5658   /* run the destructors for the threadprivate data for this thread */
5659   __kmp_common_destroy_gtid(gtid);
5660 
5661   KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5662   KMP_MB();
5663   return this_thr;
5664 }
5665 
5666 /* ------------------------------------------------------------------------ */
5667 
5668 void __kmp_internal_end_dest(void *specific_gtid) {
5669 #if KMP_COMPILER_ICC
5670 #pragma warning(push)
5671 #pragma warning(disable : 810) // conversion from "void *" to "int" may lose
5672 // significant bits
5673 #endif
5674   // Make sure no significant bits are lost
5675   int gtid = (kmp_intptr_t)specific_gtid - 1;
5676 #if KMP_COMPILER_ICC
5677 #pragma warning(pop)
5678 #endif
5679 
5680   KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
  /* NOTE: the gtid is stored as gtid+1 in the thread-local storage;
   * this is because 0 is reserved for the nothing-stored case */
5683 
5684   /* josh: One reason for setting the gtid specific data even when it is being
5685      destroyed by pthread is to allow gtid lookup through thread specific data
5686      (__kmp_gtid_get_specific).  Some of the code, especially stat code,
5687      that gets executed in the call to __kmp_internal_end_thread, actually
5688      gets the gtid through the thread specific data.  Setting it here seems
5689      rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread
5690      to run smoothly.
5691      todo: get rid of this after we remove the dependence on
5692      __kmp_gtid_get_specific  */
5693   if (gtid >= 0 && KMP_UBER_GTID(gtid))
5694     __kmp_gtid_set_specific(gtid);
5695 #ifdef KMP_TDATA_GTID
5696   __kmp_gtid = gtid;
5697 #endif
5698   __kmp_internal_end_thread(gtid);
5699 }
5700 
5701 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5702 
// 2009-09-08 (lev): It looks like the destructor does not work. In simple test
// cases destructors work perfectly, but in real libomp.so I have no evidence
// it is ever called. However, the -fini linker option in makefile.mk works
// fine.
5706 
5707 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5708   __kmp_internal_end_atexit();
5709 }
5710 
5711 void __kmp_internal_end_fini(void) { __kmp_internal_end_atexit(); }
5712 
5713 #endif
5714 
5715 /* [Windows] josh: when the atexit handler is called, there may still be more
5716    than one thread alive */
5717 void __kmp_internal_end_atexit(void) {
5718   KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5719   /* [Windows]
     josh: ideally, we want to completely shut down the library in this atexit
     handler, but stat code that depends on thread specific data for gtid fails
     because that data becomes unavailable at some point during the shutdown, so
     we call __kmp_internal_end_thread instead. We should eventually remove the
     dependency on __kmp_get_specific_gtid in the stat code and use
     __kmp_internal_end_library to cleanly shut down the library.
5726 
5727      // TODO: Can some of this comment about GVS be removed?
5728      I suspect that the offending stat code is executed when the calling thread
5729      tries to clean up a dead root thread's data structures, resulting in GVS
5730      code trying to close the GVS structures for that thread, but since the stat
5731      code uses __kmp_get_specific_gtid to get the gtid with the assumption that
     the calling thread is cleaning up itself instead of another thread, it gets
     confused. This happens because allowing a thread to unregister and clean up
5734      another thread is a recent modification for addressing an issue.
5735      Based on the current design (20050722), a thread may end up
5736      trying to unregister another thread only if thread death does not trigger
5737      the calling of __kmp_internal_end_thread.  For Linux* OS, there is the
5738      thread specific data destructor function to detect thread death. For
5739      Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
5740      is nothing.  Thus, the workaround is applicable only for Windows static
5741      stat library. */
5742   __kmp_internal_end_library(-1);
5743 #if KMP_OS_WINDOWS
5744   __kmp_close_console();
5745 #endif
5746 }
5747 
5748 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5749   // It is assumed __kmp_forkjoin_lock is acquired.
5750 
5751   int gtid;
5752 
5753   KMP_DEBUG_ASSERT(thread != NULL);
5754 
5755   gtid = thread->th.th_info.ds.ds_gtid;
5756 
5757   if (!is_root) {
5758 
5759     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5760       /* Assume the threads are at the fork barrier here */
5761       KA_TRACE(
5762           20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5763                gtid));
5764       /* Need release fence here to prevent seg faults for tree forkjoin barrier
5765        * (GEH) */
5766       ANNOTATE_HAPPENS_BEFORE(thread);
5767       kmp_flag_64 flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
5768       __kmp_release_64(&flag);
5769     }
5770 
5771     // Terminate OS thread.
5772     __kmp_reap_worker(thread);
5773 
5774     // The thread was killed asynchronously.  If it was actively
5775     // spinning in the thread pool, decrement the global count.
5776     //
5777     // There is a small timing hole here - if the worker thread was just waking
    // up after sleeping in the pool, had reset its th_active_in_pool flag but
5779     // not decremented the global counter __kmp_thread_pool_active_nth yet, then
5780     // the global counter might not get updated.
5781     //
5782     // Currently, this can only happen as the library is unloaded,
5783     // so there are no harmful side effects.
5784     if (thread->th.th_active_in_pool) {
5785       thread->th.th_active_in_pool = FALSE;
5786       KMP_TEST_THEN_DEC32(&__kmp_thread_pool_active_nth);
5787       KMP_DEBUG_ASSERT(TCR_4(__kmp_thread_pool_active_nth) >= 0);
5788     }
5789 
5790     // Decrement # of [worker] threads in the pool.
5791     KMP_DEBUG_ASSERT(__kmp_thread_pool_nth > 0);
5792     --__kmp_thread_pool_nth;
5793   }
5794 
5795   __kmp_free_implicit_task(thread);
5796 
5797 // Free the fast memory for tasking
5798 #if USE_FAST_MEMORY
5799   __kmp_free_fast_memory(thread);
5800 #endif /* USE_FAST_MEMORY */
5801 
5802   __kmp_suspend_uninitialize_thread(thread);
5803 
5804   KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
5805   TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5806 
5807   --__kmp_all_nth;
5808 // __kmp_nth was decremented when thread is added to the pool.
5809 
5810 #ifdef KMP_ADJUST_BLOCKTIME
5811   /* Adjust blocktime back to user setting or default if necessary */
5812   /* Middle initialization might never have occurred                */
5813   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5814     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5815     if (__kmp_nth <= __kmp_avail_proc) {
5816       __kmp_zero_bt = FALSE;
5817     }
5818   }
5819 #endif /* KMP_ADJUST_BLOCKTIME */
5820 
5821   /* free the memory being used */
5822   if (__kmp_env_consistency_check) {
5823     if (thread->th.th_cons) {
5824       __kmp_free_cons_stack(thread->th.th_cons);
5825       thread->th.th_cons = NULL;
5826     }
5827   }
5828 
5829   if (thread->th.th_pri_common != NULL) {
5830     __kmp_free(thread->th.th_pri_common);
5831     thread->th.th_pri_common = NULL;
5832   }
5833 
5834   if (thread->th.th_task_state_memo_stack != NULL) {
5835     __kmp_free(thread->th.th_task_state_memo_stack);
5836     thread->th.th_task_state_memo_stack = NULL;
5837   }
5838 
5839 #if KMP_USE_BGET
5840   if (thread->th.th_local.bget_data != NULL) {
5841     __kmp_finalize_bget(thread);
5842   }
5843 #endif
5844 
5845 #if KMP_AFFINITY_SUPPORTED
5846   if (thread->th.th_affin_mask != NULL) {
5847     KMP_CPU_FREE(thread->th.th_affin_mask);
5848     thread->th.th_affin_mask = NULL;
5849   }
5850 #endif /* KMP_AFFINITY_SUPPORTED */
5851 
5852   __kmp_reap_team(thread->th.th_serial_team);
5853   thread->th.th_serial_team = NULL;
5854   __kmp_free(thread);
5855 
5856   KMP_MB();
5857 
5858 } // __kmp_reap_thread
5859 
5860 static void __kmp_internal_end(void) {
5861   int i;
5862 
5863   /* First, unregister the library */
5864   __kmp_unregister_library();
5865 
5866 #if KMP_OS_WINDOWS
5867   /* In Win static library, we can't tell when a root actually dies, so we
5868      reclaim the data structures for any root threads that have died but not
5869      unregistered themselves, in order to shut down cleanly.
5870      In Win dynamic library we also can't tell when a thread dies.  */
5871   __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
5872 // dead roots
5873 #endif
5874 
5875   for (i = 0; i < __kmp_threads_capacity; i++)
5876     if (__kmp_root[i])
5877       if (__kmp_root[i]->r.r_active)
5878         break;
5879   KMP_MB(); /* Flush all pending memory write invalidates.  */
5880   TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
5881 
5882   if (i < __kmp_threads_capacity) {
5883 #if KMP_USE_MONITOR
5884     // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
5885     KMP_MB(); /* Flush all pending memory write invalidates.  */
5886 
5887     // Need to check that monitor was initialized before reaping it. If we are
    // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
5889     // __kmp_monitor will appear to contain valid data, but it is only valid in
5890     // the parent process, not the child.
5891     // New behavior (201008): instead of keying off of the flag
5892     // __kmp_init_parallel, the monitor thread creation is keyed off
5893     // of the new flag __kmp_init_monitor.
5894     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
5895     if (TCR_4(__kmp_init_monitor)) {
5896       __kmp_reap_monitor(&__kmp_monitor);
5897       TCW_4(__kmp_init_monitor, 0);
5898     }
5899     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
5900     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
5901 #endif // KMP_USE_MONITOR
5902   } else {
5903 /* TODO move this to cleanup code */
5904 #ifdef KMP_DEBUG
5905     /* make sure that everything has properly ended */
5906     for (i = 0; i < __kmp_threads_capacity; i++) {
5907       if (__kmp_root[i]) {
        // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC: there can be uber
        // threads alive here
5910         KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
5911       }
5912     }
5913 #endif
5914 
5915     KMP_MB();
5916 
5917     // Reap the worker threads.
5918     // This is valid for now, but be careful if threads are reaped sooner.
    while (__kmp_thread_pool != NULL) { // Loop through all threads in the pool.
5920       // Get the next thread from the pool.
5921       kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
5922       __kmp_thread_pool = thread->th.th_next_pool;
5923       // Reap it.
5924       KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
5925       thread->th.th_next_pool = NULL;
5926       thread->th.th_in_pool = FALSE;
5927       __kmp_reap_thread(thread, 0);
5928     }
5929     __kmp_thread_pool_insert_pt = NULL;
5930 
5931     // Reap teams.
    while (__kmp_team_pool != NULL) { // Loop through all teams in the pool.
5933       // Get the next team from the pool.
5934       kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
5935       __kmp_team_pool = team->t.t_next_pool;
5936       // Reap it.
5937       team->t.t_next_pool = NULL;
5938       __kmp_reap_team(team);
5939     }
5940 
5941     __kmp_reap_task_teams();
5942 
5943     for (i = 0; i < __kmp_threads_capacity; ++i) {
5944       // TBD: Add some checking...
5945       // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
5946     }
5947 
5948     /* Make sure all threadprivate destructors get run by joining with all
5949        worker threads before resetting this flag */
5950     TCW_SYNC_4(__kmp_init_common, FALSE);
5951 
5952     KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
5953     KMP_MB();
5954 
5955 #if KMP_USE_MONITOR
5956     // See note above: One of the possible fixes for CQ138434 / CQ140126
5957     //
5958     // FIXME: push both code fragments down and CSE them?
5959     // push them into __kmp_cleanup() ?
5960     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
5961     if (TCR_4(__kmp_init_monitor)) {
5962       __kmp_reap_monitor(&__kmp_monitor);
5963       TCW_4(__kmp_init_monitor, 0);
5964     }
5965     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
5966     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
5967 #endif
5968   } /* else !__kmp_global.t_active */
5969   TCW_4(__kmp_init_gtid, FALSE);
5970   KMP_MB(); /* Flush all pending memory write invalidates.  */
5971 
5972   __kmp_cleanup();
5973 #if OMPT_SUPPORT
5974   ompt_fini();
5975 #endif
5976 }
5977 
5978 void __kmp_internal_end_library(int gtid_req) {
5979   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
5980   /* this shouldn't be a race condition because __kmp_internal_end() is the
5981      only place to clear __kmp_serial_init */
5982   /* we'll check this later too, after we get the lock */
5983   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
  // redundant, because the next check will work in any case.
5985   if (__kmp_global.g.g_abort) {
5986     KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
5987     /* TODO abort? */
5988     return;
5989   }
5990   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
5991     KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
5992     return;
5993   }
5994 
5995   KMP_MB(); /* Flush all pending memory write invalidates.  */
5996 
5997   /* find out who we are and what we should do */
5998   {
5999     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6000     KA_TRACE(
6001         10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req));
6002     if (gtid == KMP_GTID_SHUTDOWN) {
6003       KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6004                     "already shutdown\n"));
6005       return;
6006     } else if (gtid == KMP_GTID_MONITOR) {
6007       KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6008                     "registered, or system shutdown\n"));
6009       return;
6010     } else if (gtid == KMP_GTID_DNE) {
6011       KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6012                     "shutdown\n"));
      /* we don't know who we are, but we may still shut down the library */
6014     } else if (KMP_UBER_GTID(gtid)) {
6015       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6016       if (__kmp_root[gtid]->r.r_active) {
6017         __kmp_global.g.g_abort = -1;
6018         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6019         KA_TRACE(10,
6020                  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6021                   gtid));
6022         return;
6023       } else {
6024         KA_TRACE(
6025             10,
6026             ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6027         __kmp_unregister_root_current_thread(gtid);
6028       }
6029     } else {
6030 /* worker threads may call this function through the atexit handler, if they
6031  * call exit() */
6032 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6033    TODO: do a thorough shutdown instead */
6034 #ifdef DUMP_DEBUG_ON_EXIT
6035       if (__kmp_debug_buf)
6036         __kmp_dump_debug_buffer();
6037 #endif
6038       return;
6039     }
6040   }
6041   /* synchronize the termination process */
6042   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6043 
6044   /* have we already finished */
6045   if (__kmp_global.g.g_abort) {
6046     KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6047     /* TODO abort? */
6048     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6049     return;
6050   }
6051   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6052     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6053     return;
6054   }
6055 
  /* We need this lock to enforce mutual exclusion between this reading of
     __kmp_threads_capacity and the writing by __kmp_register_root.
6058      Alternatively, we can use a counter of roots that is atomically updated by
6059      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6060      __kmp_internal_end_*.  */
6061   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6062 
6063   /* now we can safely conduct the actual termination */
6064   __kmp_internal_end();
6065 
6066   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6067   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6068 
6069   KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6070 
6071 #ifdef DUMP_DEBUG_ON_EXIT
6072   if (__kmp_debug_buf)
6073     __kmp_dump_debug_buffer();
6074 #endif
6075 
6076 #if KMP_OS_WINDOWS
6077   __kmp_close_console();
6078 #endif
6079 
6080   __kmp_fini_allocator();
6081 
6082 } // __kmp_internal_end_library
6083 
6084 void __kmp_internal_end_thread(int gtid_req) {
6085   int i;
6086 
6087   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6088   /* this shouldn't be a race condition because __kmp_internal_end() is the
6089    * only place to clear __kmp_serial_init */
6090   /* we'll check this later too, after we get the lock */
6091   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6092   // redundant, because the next check will work in any case.
6093   if (__kmp_global.g.g_abort) {
6094     KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6095     /* TODO abort? */
6096     return;
6097   }
6098   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6099     KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6100     return;
6101   }
6102 
6103   KMP_MB(); /* Flush all pending memory write invalidates.  */
6104 
6105   /* find out who we are and what we should do */
6106   {
6107     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6108     KA_TRACE(10,
6109              ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req));
6110     if (gtid == KMP_GTID_SHUTDOWN) {
6111       KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6112                     "already shutdown\n"));
6113       return;
6114     } else if (gtid == KMP_GTID_MONITOR) {
6115       KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6116                     "registered, or system shutdown\n"));
6117       return;
6118     } else if (gtid == KMP_GTID_DNE) {
6119       KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6120                     "shutdown\n"));
6121       return;
6122       /* we don't know who we are */
6123     } else if (KMP_UBER_GTID(gtid)) {
6124       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6125       if (__kmp_root[gtid]->r.r_active) {
6126         __kmp_global.g.g_abort = -1;
6127         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6128         KA_TRACE(10,
6129                  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6130                   gtid));
6131         return;
6132       } else {
6133         KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6134                       gtid));
6135         __kmp_unregister_root_current_thread(gtid);
6136       }
6137     } else {
6138       /* just a worker thread, let's leave */
6139       KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6140 
6141       if (gtid >= 0) {
6142         __kmp_threads[gtid]->th.th_task_team = NULL;
6143       }
6144 
6145       KA_TRACE(10,
6146                ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6147                 gtid));
6148       return;
6149     }
6150   }
6151 #if defined KMP_DYNAMIC_LIB
  // AC: let's not shut down the Linux* OS dynamic library at the exit of an
  // uber thread; it is better to shut down later, in the library destructor.
  // The reason for this change is a performance problem when a non-OpenMP
  // thread in a loop forks and joins many OpenMP threads. We can save a lot of
  // time by keeping worker threads alive until program shutdown.
  // OM: Removed Linux* OS restriction to fix the crash on OS X* (DPD200239966)
  // and Windows (DPD200287443) that occurs when using critical sections from
  // foreign threads.
6160   KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6161   return;
6162 #endif
6163   /* synchronize the termination process */
6164   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6165 
6166   /* have we already finished */
6167   if (__kmp_global.g.g_abort) {
6168     KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6169     /* TODO abort? */
6170     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6171     return;
6172   }
6173   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6174     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6175     return;
6176   }
6177 
  /* We need this lock to enforce mutual exclusion between this reading of
     __kmp_threads_capacity and the writing by __kmp_register_root.
6180      Alternatively, we can use a counter of roots that is atomically updated by
6181      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6182      __kmp_internal_end_*.  */
6183 
6184   /* should we finish the run-time?  are all siblings done? */
6185   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6186 
6187   for (i = 0; i < __kmp_threads_capacity; ++i) {
6188     if (KMP_UBER_GTID(i)) {
6189       KA_TRACE(
6190           10,
6191           ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6192       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6193       __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6194       return;
6195     }
6196   }
6197 
6198   /* now we can safely conduct the actual termination */
6199 
6200   __kmp_internal_end();
6201 
6202   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6203   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6204 
6205   KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6206 
6207 #ifdef DUMP_DEBUG_ON_EXIT
6208   if (__kmp_debug_buf)
6209     __kmp_dump_debug_buffer();
6210 #endif
6211 } // __kmp_internal_end_thread
6212 
6213 // -----------------------------------------------------------------------------
6214 // Library registration stuff.
6215 
6216 static long __kmp_registration_flag = 0;
6217 // Random value used to indicate library initialization.
6218 static char *__kmp_registration_str = NULL;
6219 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6220 
6221 static inline char *__kmp_reg_status_name() {
  /* On RHEL 3u5, if linked statically, getpid() returns different values in
     each thread. If registration and unregistration happen in different threads
     (omp_misc_other_root_exit.cpp test case), the registered_lib_env env var
     cannot be found, because its name will contain a different pid. */
  return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
} // __kmp_reg_status_name
6228 
6229 void __kmp_register_library_startup(void) {
6230 
6231   char *name = __kmp_reg_status_name(); // Name of the environment variable.
6232   int done = 0;
6233   union {
6234     double dtime;
6235     long ltime;
6236   } time;
6237 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6238   __kmp_initialize_system_tick();
6239 #endif
6240   __kmp_read_system_time(&time.dtime);
6241   __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6242   __kmp_registration_str =
6243       __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6244                        __kmp_registration_flag, KMP_LIBRARY_FILE);
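  // For illustration (values are made up): with a pid of 12345 the variable
  // set below might look like
  //   __KMP_REGISTERED_LIB_12345=0x7f12a4b3c010-cafe4242-libomp.so
  // i.e. "<address of __kmp_registration_flag>-<flag value in hex>-<library
  // file>", which is exactly what the parsing code below splits on '-'.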
6245 
6246   KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6247                 __kmp_registration_str));
6248 
6249   while (!done) {
6250 
6251     char *value = NULL; // Actual value of the environment variable.
6252 
    // Set the environment variable, but do not overwrite it if it already
    // exists.
    __kmp_env_set(name, __kmp_registration_str, 0);
    // Check that the variable was actually written.
6256     value = __kmp_env_get(name);
6257     if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6258 
6259       done = 1; // Ok, environment variable set successfully, exit the loop.
6260 
6261     } else {
6262 
      // Oops. Write failed. Another copy of the OpenMP RTL is in memory.
      // Check whether it is alive or dead.
6265       int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6266       char *tail = value;
6267       char *flag_addr_str = NULL;
6268       char *flag_val_str = NULL;
6269       char const *file_name = NULL;
6270       __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6271       __kmp_str_split(tail, '-', &flag_val_str, &tail);
6272       file_name = tail;
6273       if (tail != NULL) {
6274         long *flag_addr = 0;
6275         long flag_val = 0;
6276         KMP_SSCANF(flag_addr_str, "%p", &flag_addr);
6277         KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6278         if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6279           // First, check whether environment-encoded address is mapped into
6280           // addr space.
6281           // If so, dereference it to see if it still has the right value.
6282           if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6283             neighbor = 1;
6284           } else {
6285             // If not, then we know the other copy of the library is no longer
6286             // running.
6287             neighbor = 2;
6288           }
6289         }
6290       }
6291       switch (neighbor) {
      case 0: // Cannot parse environment variable -- neighbor status unknown.
        // Assume it is the incompatible format of a future version of the
        // library. Assume the other library is alive.
        // WARN( ... ); // TODO: Issue a warning.
        file_name = "unknown library";
      // Attention! Falling through to the next case. That's intentional.
6298       case 1: { // Neighbor is alive.
6299         // Check it is allowed.
6300         char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6301         if (!__kmp_str_match_true(duplicate_ok)) {
6302           // That's not allowed. Issue fatal error.
6303           __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6304                       KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6305         }
6306         KMP_INTERNAL_FREE(duplicate_ok);
6307         __kmp_duplicate_library_ok = 1;
6308         done = 1; // Exit the loop.
6309       } break;
6310       case 2: { // Neighbor is dead.
6311         // Clear the variable and try to register library again.
6312         __kmp_env_unset(name);
6313       } break;
6314       default: { KMP_DEBUG_ASSERT(0); } break;
6315       }
6316     }
6317     KMP_INTERNAL_FREE((void *)value);
6318   }
6319   KMP_INTERNAL_FREE((void *)name);
6320 
6321 } // func __kmp_register_library_startup
6322 
6323 void __kmp_unregister_library(void) {
6324 
6325   char *name = __kmp_reg_status_name();
6326   char *value = __kmp_env_get(name);
6327 
6328   KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6329   KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6330   if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6331     // Ok, this is our variable. Delete it.
6332     __kmp_env_unset(name);
6333   }
6334 
6335   KMP_INTERNAL_FREE(__kmp_registration_str);
6336   KMP_INTERNAL_FREE(value);
6337   KMP_INTERNAL_FREE(name);
6338 
6339   __kmp_registration_flag = 0;
6340   __kmp_registration_str = NULL;
6341 
6342 } // __kmp_unregister_library
6343 
6344 // End of Library registration stuff.
6345 // -----------------------------------------------------------------------------
6346 
6347 #if KMP_MIC_SUPPORTED
6348 
6349 static void __kmp_check_mic_type() {
6350   kmp_cpuid_t cpuid_state = {0};
6351   kmp_cpuid_t *cs_p = &cpuid_state;
6352   __kmp_x86_cpuid(1, 0, cs_p);
6353   // We don't support mic1 at the moment
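  // (Note, added for clarity: CPUID leaf 1 returns family/model/stepping in
  // EAX; the masks below compare the family/model fields. Family 0xB / model 1
  // identifies KNC (mic2), and family 6 / model 0x57 identifies KNL (mic3).)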
6354   if ((cs_p->eax & 0xff0) == 0xB10) {
6355     __kmp_mic_type = mic2;
6356   } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6357     __kmp_mic_type = mic3;
6358   } else {
6359     __kmp_mic_type = non_mic;
6360   }
6361 }
6362 
6363 #endif /* KMP_MIC_SUPPORTED */
6364 
6365 static void __kmp_do_serial_initialize(void) {
6366   int i, gtid;
6367   int size;
6368 
6369   KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6370 
6371   KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6372   KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6373   KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6374   KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6375   KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6376 
6377 #if OMPT_SUPPORT
6378   ompt_pre_init();
6379 #endif
6380 
6381   __kmp_validate_locks();
6382 
6383   /* Initialize internal memory allocator */
6384   __kmp_init_allocator();
6385 
6386   /* Register the library startup via an environment variable and check to see
6387      whether another copy of the library is already registered. */
6388 
6389   __kmp_register_library_startup();
6390 
6391   /* TODO reinitialization of library */
6392   if (TCR_4(__kmp_global.g.g_done)) {
6393     KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6394   }
6395 
6396   __kmp_global.g.g_abort = 0;
6397   TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6398 
6399 /* initialize the locks */
6400 #if KMP_USE_ADAPTIVE_LOCKS
6401 #if KMP_DEBUG_ADAPTIVE_LOCKS
6402   __kmp_init_speculative_stats();
6403 #endif
6404 #endif
6405 #if KMP_STATS_ENABLED
6406   __kmp_stats_init();
6407 #endif
6408   __kmp_init_lock(&__kmp_global_lock);
6409   __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6410   __kmp_init_lock(&__kmp_debug_lock);
6411   __kmp_init_atomic_lock(&__kmp_atomic_lock);
6412   __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6413   __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6414   __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6415   __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6416   __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6417   __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6418   __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6419   __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6420   __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6421   __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6422   __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6423   __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6424   __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6425   __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6426 #if KMP_USE_MONITOR
6427   __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6428 #endif
6429   __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6430 
6431   /* conduct initialization and initial setup of configuration */
6432 
6433   __kmp_runtime_initialize();
6434 
6435 #if KMP_MIC_SUPPORTED
6436   __kmp_check_mic_type();
6437 #endif
6438 
6439 // Some global variable initialization moved here from kmp_env_initialize()
6440 #ifdef KMP_DEBUG
6441   kmp_diag = 0;
6442 #endif
6443   __kmp_abort_delay = 0;
6444 
6445   // From __kmp_init_dflt_team_nth()
6446   /* assume the entire machine will be used */
6447   __kmp_dflt_team_nth_ub = __kmp_xproc;
6448   if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6449     __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6450   }
6451   if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6452     __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6453   }
6454   __kmp_max_nth = __kmp_sys_max_nth;
6455   __kmp_cg_max_nth = __kmp_sys_max_nth;
6456   __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6457   if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6458     __kmp_teams_max_nth = __kmp_sys_max_nth;
6459   }
6460 
6461   // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6462   // part
6463   __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6464 #if KMP_USE_MONITOR
6465   __kmp_monitor_wakeups =
6466       KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6467   __kmp_bt_intervals =
6468       KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6469 #endif
6470   // From "KMP_LIBRARY" part of __kmp_env_initialize()
6471   __kmp_library = library_throughput;
6472   // From KMP_SCHEDULE initialization
6473   __kmp_static = kmp_sch_static_balanced;
// AC: do not use analytical here, because it is non-monotonic
6475 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6476 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6477 // need to repeat assignment
6478 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6479 // bit control and barrier method control parts
6480 #if KMP_FAST_REDUCTION_BARRIER
6481 #define kmp_reduction_barrier_gather_bb ((int)1)
6482 #define kmp_reduction_barrier_release_bb ((int)1)
6483 #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6484 #define kmp_reduction_barrier_release_pat bp_hyper_bar
6485 #endif // KMP_FAST_REDUCTION_BARRIER
6486   for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6487     __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6488     __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6489     __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6490     __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6491 #if KMP_FAST_REDUCTION_BARRIER
6492     if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
6493       // lin_64 ): hyper,1
6494       __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6495       __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6496       __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6497       __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6498     }
6499 #endif // KMP_FAST_REDUCTION_BARRIER
6500   }
6501 #if KMP_FAST_REDUCTION_BARRIER
6502 #undef kmp_reduction_barrier_release_pat
6503 #undef kmp_reduction_barrier_gather_pat
6504 #undef kmp_reduction_barrier_release_bb
6505 #undef kmp_reduction_barrier_gather_bb
6506 #endif // KMP_FAST_REDUCTION_BARRIER
6507 #if KMP_MIC_SUPPORTED
6508   if (__kmp_mic_type == mic2) { // KNC
6509     // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6510     __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6511     __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6512         1; // forkjoin release
6513     __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6514     __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6515   }
6516 #if KMP_FAST_REDUCTION_BARRIER
6517   if (__kmp_mic_type == mic2) { // KNC
6518     __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6519     __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6520   }
6521 #endif // KMP_FAST_REDUCTION_BARRIER
6522 #endif // KMP_MIC_SUPPORTED
6523 
6524 // From KMP_CHECKS initialization
6525 #ifdef KMP_DEBUG
6526   __kmp_env_checks = TRUE; /* development versions have the extra checks */
6527 #else
6528   __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6529 #endif
6530 
6531   // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6532   __kmp_foreign_tp = TRUE;
6533 
6534   __kmp_global.g.g_dynamic = FALSE;
6535   __kmp_global.g.g_dynamic_mode = dynamic_default;
6536 
6537   __kmp_env_initialize(NULL);
6538 
6539 // Print all messages in message catalog for testing purposes.
6540 #ifdef KMP_DEBUG
6541   char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6542   if (__kmp_str_match_true(val)) {
6543     kmp_str_buf_t buffer;
6544     __kmp_str_buf_init(&buffer);
6545     __kmp_i18n_dump_catalog(&buffer);
6546     __kmp_printf("%s", buffer.str);
6547     __kmp_str_buf_free(&buffer);
6548   }
6549   __kmp_env_free(&val);
6550 #endif
6551 
6552   __kmp_threads_capacity =
6553       __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6554   // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6555   __kmp_tp_capacity = __kmp_default_tp_capacity(
6556       __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6557 
6558   // If the library is shut down properly, both pools must be NULL. Just in
6559   // case, set them to NULL -- some memory may leak, but subsequent code will
6560   // work even if pools are not freed.
6561   KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6562   KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6563   KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6564   __kmp_thread_pool = NULL;
6565   __kmp_thread_pool_insert_pt = NULL;
6566   __kmp_team_pool = NULL;
6567 
6568   /* Allocate all of the variable sized records */
6569   /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6570    * expandable */
6571   /* Since allocation is cache-aligned, just add extra padding at the end */
6572   size =
6573       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6574       CACHE_LINE;
6575   __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6576   __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6577                                sizeof(kmp_info_t *) * __kmp_threads_capacity);
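  // Layout of the single allocation above (illustrative):
  //   [ __kmp_threads : __kmp_threads_capacity * sizeof(kmp_info_t *) ]
  //   [ __kmp_root    : __kmp_threads_capacity * sizeof(kmp_root_t *) ]
  //   [ CACHE_LINE bytes of padding                                   ]
  // so __kmp_root simply points into the tail of the __kmp_threads block.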
6578 
6579   /* init thread counts */
6580   KMP_DEBUG_ASSERT(__kmp_all_nth ==
6581                    0); // Asserts fail if the library is reinitializing and
6582   KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
6583   __kmp_all_nth = 0;
6584   __kmp_nth = 0;
6585 
6586   /* setup the uber master thread and hierarchy */
6587   gtid = __kmp_register_root(TRUE);
6588   KA_TRACE(10, ("__kmp_do_serial_initialize  T#%d\n", gtid));
6589   KMP_ASSERT(KMP_UBER_GTID(gtid));
6590   KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6591 
6592   KMP_MB(); /* Flush all pending memory write invalidates.  */
6593 
6594   __kmp_common_initialize();
6595 
6596 #if KMP_OS_UNIX
6597   /* invoke the child fork handler */
6598   __kmp_register_atfork();
6599 #endif
6600 
6601 #if !defined KMP_DYNAMIC_LIB
6602   {
6603     /* Invoke the exit handler when the program finishes, only for static
6604        library. For dynamic library, we already have _fini and DllMain. */
6605     int rc = atexit(__kmp_internal_end_atexit);
6606     if (rc != 0) {
6607       __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6608                   __kmp_msg_null);
6609     }
6610   }
6611 #endif
6612 
6613 #if KMP_HANDLE_SIGNALS
6614 #if KMP_OS_UNIX
6615   /* NOTE: make sure that this is called before the user installs their own
     signal handlers so that the user handlers are called first. This way they
6617      can return false, not call our handler, avoid terminating the library, and
6618      continue execution where they left off. */
6619   __kmp_install_signals(FALSE);
6620 #endif /* KMP_OS_UNIX */
6621 #if KMP_OS_WINDOWS
6622   __kmp_install_signals(TRUE);
6623 #endif /* KMP_OS_WINDOWS */
6624 #endif
6625 
6626   /* we have finished the serial initialization */
6627   __kmp_init_counter++;
6628 
6629   __kmp_init_serial = TRUE;
6630 
6631   if (__kmp_settings) {
6632     __kmp_env_print();
6633   }
6634 
6635 #if OMP_40_ENABLED
6636   if (__kmp_display_env || __kmp_display_env_verbose) {
6637     __kmp_env_print_2();
6638   }
6639 #endif // OMP_40_ENABLED
6640 
6641 #if OMPT_SUPPORT
6642   ompt_post_init();
6643 #endif
6644 
6645   KMP_MB();
6646 
6647   KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
6648 }
6649 
6650 void __kmp_serial_initialize(void) {
6651   if (__kmp_init_serial) {
6652     return;
6653   }
6654   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6655   if (__kmp_init_serial) {
6656     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6657     return;
6658   }
6659   __kmp_do_serial_initialize();
6660   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6661 }
6662 
6663 static void __kmp_do_middle_initialize(void) {
6664   int i, j;
6665   int prev_dflt_team_nth;
6666 
6667   if (!__kmp_init_serial) {
6668     __kmp_do_serial_initialize();
6669   }
6670 
6671   KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
6672 
6673   // Save the previous value for the __kmp_dflt_team_nth so that
6674   // we can avoid some reinitialization if it hasn't changed.
6675   prev_dflt_team_nth = __kmp_dflt_team_nth;
6676 
6677 #if KMP_AFFINITY_SUPPORTED
6678   // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6679   // number of cores on the machine.
6680   __kmp_affinity_initialize();
6681 
6682   // Run through the __kmp_threads array and set the affinity mask
6683   // for each root thread that is currently registered with the RTL.
6684   for (i = 0; i < __kmp_threads_capacity; i++) {
6685     if (TCR_PTR(__kmp_threads[i]) != NULL) {
6686       __kmp_affinity_set_init_mask(i, TRUE);
6687     }
6688   }
6689 #endif /* KMP_AFFINITY_SUPPORTED */
6690 
6691   KMP_ASSERT(__kmp_xproc > 0);
6692   if (__kmp_avail_proc == 0) {
6693     __kmp_avail_proc = __kmp_xproc;
6694   }
6695 
6696   // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
6697   // correct them now
6698   j = 0;
6699   while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
6700     __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
6701         __kmp_avail_proc;
6702     j++;
6703   }
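  // For example (illustrative): with OMP_NUM_THREADS=,,2,3 the first two
  // (empty) levels are filled in with __kmp_avail_proc here, and the default
  // team size and its upper bound are raised to __kmp_avail_proc as well.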
6704 
6705   if (__kmp_dflt_team_nth == 0) {
6706 #ifdef KMP_DFLT_NTH_CORES
6707     // Default #threads = #cores
6708     __kmp_dflt_team_nth = __kmp_ncores;
6709     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6710                   "__kmp_ncores (%d)\n",
6711                   __kmp_dflt_team_nth));
6712 #else
6713     // Default #threads = #available OS procs
6714     __kmp_dflt_team_nth = __kmp_avail_proc;
6715     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6716                   "__kmp_avail_proc(%d)\n",
6717                   __kmp_dflt_team_nth));
6718 #endif /* KMP_DFLT_NTH_CORES */
6719   }
6720 
6721   if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
6722     __kmp_dflt_team_nth = KMP_MIN_NTH;
6723   }
6724   if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
6725     __kmp_dflt_team_nth = __kmp_sys_max_nth;
6726   }
6727 
6728   // There's no harm in continuing if the following check fails,
6729   // but it indicates an error in the previous logic.
6730   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
6731 
6732   if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
6733     // Run through the __kmp_threads array and set the num threads icv for each
6734     // root thread that is currently registered with the RTL (which has not
6735     // already explicitly set its nthreads-var with a call to
6736     // omp_set_num_threads()).
6737     for (i = 0; i < __kmp_threads_capacity; i++) {
6738       kmp_info_t *thread = __kmp_threads[i];
6739       if (thread == NULL)
6740         continue;
6741       if (thread->th.th_current_task->td_icvs.nproc != 0)
6742         continue;
6743 
6744       set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
6745     }
6746   }
6747   KA_TRACE(
6748       20,
6749       ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
6750        __kmp_dflt_team_nth));
6751 
6752 #ifdef KMP_ADJUST_BLOCKTIME
6753   /* Adjust blocktime to zero if necessary  now that __kmp_avail_proc is set */
6754   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6755     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6756     if (__kmp_nth > __kmp_avail_proc) {
6757       __kmp_zero_bt = TRUE;
6758     }
6759   }
6760 #endif /* KMP_ADJUST_BLOCKTIME */
6761 
6762   /* we have finished middle initialization */
6763   TCW_SYNC_4(__kmp_init_middle, TRUE);
6764 
6765   KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
6766 }
6767 
6768 void __kmp_middle_initialize(void) {
6769   if (__kmp_init_middle) {
6770     return;
6771   }
6772   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6773   if (__kmp_init_middle) {
6774     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6775     return;
6776   }
6777   __kmp_do_middle_initialize();
6778   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6779 }
6780 
6781 void __kmp_parallel_initialize(void) {
6782   int gtid = __kmp_entry_gtid(); // this might be a new root
6783 
6784   /* synchronize parallel initialization (for sibling) */
6785   if (TCR_4(__kmp_init_parallel))
6786     return;
6787   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6788   if (TCR_4(__kmp_init_parallel)) {
6789     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6790     return;
6791   }
6792 
6793   /* TODO reinitialization after we have already shut down */
6794   if (TCR_4(__kmp_global.g.g_done)) {
6795     KA_TRACE(
6796         10,
6797         ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
6798     __kmp_infinite_loop();
6799   }
6800 
6801   /* jc: The lock __kmp_initz_lock is already held, so calling
6802      __kmp_serial_initialize would cause a deadlock.  So we call
6803      __kmp_do_serial_initialize directly. */
6804   if (!__kmp_init_middle) {
6805     __kmp_do_middle_initialize();
6806   }
6807 
6808   /* begin initialization */
6809   KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
6810   KMP_ASSERT(KMP_UBER_GTID(gtid));
6811 
6812 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6813   // Save the FP control regs.
6814   // Worker threads will set theirs to these values at thread startup.
6815   __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
6816   __kmp_store_mxcsr(&__kmp_init_mxcsr);
6817   __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
6818 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
6819 
6820 #if KMP_OS_UNIX
6821 #if KMP_HANDLE_SIGNALS
6822   /*  must be after __kmp_serial_initialize  */
6823   __kmp_install_signals(TRUE);
6824 #endif
6825 #endif
6826 
6827   __kmp_suspend_initialize();
6828 
6829 #if defined(USE_LOAD_BALANCE)
6830   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6831     __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
6832   }
6833 #else
6834   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6835     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
6836   }
6837 #endif
6838 
6839   if (__kmp_version) {
6840     __kmp_print_version_2();
6841   }
6842 
6843   /* we have finished parallel initialization */
6844   TCW_SYNC_4(__kmp_init_parallel, TRUE);
6845 
6846   KMP_MB();
6847   KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
6848 
6849   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6850 }
6851 
6852 /* ------------------------------------------------------------------------ */
6853 
6854 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
6855                                    kmp_team_t *team) {
6856   kmp_disp_t *dispatch;
6857 
6858   KMP_MB();
6859 
6860   /* none of the threads have encountered any constructs, yet. */
6861   this_thr->th.th_local.this_construct = 0;
6862 #if KMP_CACHE_MANAGE
6863   KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
6864 #endif /* KMP_CACHE_MANAGE */
6865   dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
6866   KMP_DEBUG_ASSERT(dispatch);
6867   KMP_DEBUG_ASSERT(team->t.t_dispatch);
6868   // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
6869   // this_thr->th.th_info.ds.ds_tid ] );
6870 
6871   dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
6872 #if OMP_45_ENABLED
6873   dispatch->th_doacross_buf_idx =
6874       0; /* reset the doacross dispatch buffer counter */
6875 #endif
6876   if (__kmp_env_consistency_check)
6877     __kmp_push_parallel(gtid, team->t.t_ident);
6878 
6879   KMP_MB(); /* Flush all pending memory write invalidates.  */
6880 }
6881 
6882 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
6883                                   kmp_team_t *team) {
6884   if (__kmp_env_consistency_check)
6885     __kmp_pop_parallel(gtid, team->t.t_ident);
6886 
6887   __kmp_finish_implicit_task(this_thr);
6888 }
6889 
6890 int __kmp_invoke_task_func(int gtid) {
6891   int rc;
6892   int tid = __kmp_tid_from_gtid(gtid);
6893   kmp_info_t *this_thr = __kmp_threads[gtid];
6894   kmp_team_t *team = this_thr->th.th_team;
6895 
6896   __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
6897 #if USE_ITT_BUILD
6898   if (__itt_stack_caller_create_ptr) {
6899     __kmp_itt_stack_callee_enter(
6900         (__itt_caller)
6901             team->t.t_stack_id); // inform ittnotify about entering user's code
6902   }
6903 #endif /* USE_ITT_BUILD */
6904 #if INCLUDE_SSC_MARKS
6905   SSC_MARK_INVOKING();
6906 #endif
6907 
6908 #if OMPT_SUPPORT
6909   void *dummy;
6910   void **exit_runtime_p;
6911   ompt_data_t *my_task_data;
6912   ompt_data_t *my_parallel_data;
6913   int ompt_team_size;
6914 
6915   if (ompt_enabled.enabled) {
6916     exit_runtime_p = &(
6917         team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_frame);
6918   } else {
6919     exit_runtime_p = &dummy;
6920   }
6921 
6922   my_task_data =
6923       &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
6924   my_parallel_data = &(team->t.ompt_team_info.parallel_data);
6925   if (ompt_enabled.ompt_callback_implicit_task) {
6926     ompt_team_size = team->t.t_nproc;
6927     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
6928         ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
6929         __kmp_tid_from_gtid(gtid));
6930     OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
6931   }
6932 #endif
6933 
6934   {
6935     KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
6936     KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
6937     rc =
6938         __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
6939                                tid, (int)team->t.t_argc, (void **)team->t.t_argv
6940 #if OMPT_SUPPORT
6941                                ,
6942                                exit_runtime_p
6943 #endif
6944                                );
6945 #if OMPT_SUPPORT
6946     *exit_runtime_p = NULL;
6947 #endif
6948   }
6949 
6950 #if USE_ITT_BUILD
6951   if (__itt_stack_caller_create_ptr) {
6952     __kmp_itt_stack_callee_leave(
6953         (__itt_caller)
6954             team->t.t_stack_id); // inform ittnotify about leaving user's code
6955   }
6956 #endif /* USE_ITT_BUILD */
6957   __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
6958 
6959   return rc;
6960 }
6961 
6962 #if OMP_40_ENABLED
6963 void __kmp_teams_master(int gtid) {
6964   // This routine is called by all master threads in teams construct
6965   kmp_info_t *thr = __kmp_threads[gtid];
6966   kmp_team_t *team = thr->th.th_team;
6967   ident_t *loc = team->t.t_ident;
6968   thr->th.th_set_nproc = thr->th.th_teams_size.nth;
6969   KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
6970   KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
6971   KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
6972                 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
// Launch the league of teams now, but do not let workers execute
// (they hang on the fork barrier until the next parallel region)
6975 #if INCLUDE_SSC_MARKS
6976   SSC_MARK_FORKING();
6977 #endif
6978   __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
6979                   (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
6980                   VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
6981 #if INCLUDE_SSC_MARKS
6982   SSC_MARK_JOINING();
6983 #endif
6984 
6985   // AC: last parameter "1" eliminates join barrier which won't work because
6986   // worker threads are in a fork barrier waiting for more parallel regions
6987   __kmp_join_call(loc, gtid
6988 #if OMPT_SUPPORT
6989                   ,
6990                   fork_context_intel
6991 #endif
6992                   ,
6993                   1);
6994 }
6995 
6996 int __kmp_invoke_teams_master(int gtid) {
6997   kmp_info_t *this_thr = __kmp_threads[gtid];
6998   kmp_team_t *team = this_thr->th.th_team;
6999 #if KMP_DEBUG
7000   if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7001     KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7002                      (void *)__kmp_teams_master);
7003 #endif
7004   __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7005   __kmp_teams_master(gtid);
7006   __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7007   return 1;
7008 }
7009 #endif /* OMP_40_ENABLED */
7010 
/* This sets the requested number of threads for the next parallel region
   encountered by this team. Since this should be enclosed in the fork/join
   critical section, it should avoid race conditions with asymmetrical nested
   parallelism. */
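/* Typically reached via __kmpc_push_num_threads() when the compiler lowers a
   num_threads() clause. */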
7015 
7016 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7017   kmp_info_t *thr = __kmp_threads[gtid];
7018 
7019   if (num_threads > 0)
7020     thr->th.th_set_nproc = num_threads;
7021 }
7022 
7023 #if OMP_40_ENABLED
7024 
/* This sets the requested number of teams for the teams region and/or
   the number of threads for the next parallel region encountered. */
7027 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7028                           int num_threads) {
7029   kmp_info_t *thr = __kmp_threads[gtid];
7030   KMP_DEBUG_ASSERT(num_teams >= 0);
7031   KMP_DEBUG_ASSERT(num_threads >= 0);
7032 
7033   if (num_teams == 0)
7034     num_teams = 1; // default number of teams is 1.
  if (num_teams > __kmp_teams_max_nth) { // too many teams requested?
7036     if (!__kmp_reserve_warn) {
7037       __kmp_reserve_warn = 1;
7038       __kmp_msg(kmp_ms_warning,
7039                 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7040                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7041     }
7042     num_teams = __kmp_teams_max_nth;
7043   }
7044   // Set number of teams (number of threads in the outer "parallel" of the
7045   // teams)
7046   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7047 
7048   // Remember the number of threads for inner parallel regions
7049   if (num_threads == 0) {
7050     if (!TCR_4(__kmp_init_middle))
7051       __kmp_middle_initialize(); // get __kmp_avail_proc calculated
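    // Default: divide the available processors evenly among the teams,
    // e.g. 16 available procs and 4 teams give 4 threads per team.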
7052     num_threads = __kmp_avail_proc / num_teams;
7053     if (num_teams * num_threads > __kmp_teams_max_nth) {
      // adjust num_threads without a warning since it is not a user setting
7055       num_threads = __kmp_teams_max_nth / num_teams;
7056     }
7057   } else {
7058     if (num_teams * num_threads > __kmp_teams_max_nth) {
7059       int new_threads = __kmp_teams_max_nth / num_teams;
7060       if (!__kmp_reserve_warn) { // user asked for too many threads
7061         __kmp_reserve_warn = 1; // that conflicts with KMP_TEAMS_THREAD_LIMIT
7062         __kmp_msg(kmp_ms_warning,
7063                   KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7064                   KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7065       }
7066       num_threads = new_threads;
7067     }
7068   }
7069   thr->th.th_teams_size.nth = num_threads;
7070 }
7071 
7072 // Set the proc_bind var to use in the following parallel region.
7073 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7074   kmp_info_t *thr = __kmp_threads[gtid];
7075   thr->th.th_set_proc_bind = proc_bind;
7076 }
7077 
7078 #endif /* OMP_40_ENABLED */
7079 
7080 /* Launch the worker threads into the microtask. */
7081 
7082 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7083   kmp_info_t *this_thr = __kmp_threads[gtid];
7084 
7085 #ifdef KMP_DEBUG
7086   int f;
7087 #endif /* KMP_DEBUG */
7088 
7089   KMP_DEBUG_ASSERT(team);
7090   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7091   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7092   KMP_MB(); /* Flush all pending memory write invalidates.  */
7093 
7094   team->t.t_construct = 0; /* no single directives seen yet */
7095   team->t.t_ordered.dt.t_value =
7096       0; /* thread 0 enters the ordered section first */
7097 
7098   /* Reset the identifiers on the dispatch buffer */
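  /* Each of the __kmp_dispatch_num_buffers buffers is tagged with its own
     index; consecutive dynamically scheduled constructs cycle through them. */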
7099   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7100   if (team->t.t_max_nproc > 1) {
7101     int i;
7102     for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7103       team->t.t_disp_buffer[i].buffer_index = i;
7104 #if OMP_45_ENABLED
7105       team->t.t_disp_buffer[i].doacross_buf_idx = i;
7106 #endif
7107     }
7108   } else {
7109     team->t.t_disp_buffer[0].buffer_index = 0;
7110 #if OMP_45_ENABLED
7111     team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7112 #endif
7113   }
7114 
7115   KMP_MB(); /* Flush all pending memory write invalidates.  */
7116   KMP_ASSERT(this_thr->th.th_team == team);
7117 
7118 #ifdef KMP_DEBUG
7119   for (f = 0; f < team->t.t_nproc; f++) {
7120     KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7121                      team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7122   }
7123 #endif /* KMP_DEBUG */
7124 
7125   /* release the worker threads so they may begin working */
7126   __kmp_fork_barrier(gtid, 0);
7127 }
7128 
7129 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7130   kmp_info_t *this_thr = __kmp_threads[gtid];
7131 
7132   KMP_DEBUG_ASSERT(team);
7133   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7134   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7135   KMP_MB(); /* Flush all pending memory write invalidates.  */
7136 
7137 /* Join barrier after fork */
7138 
7139 #ifdef KMP_DEBUG
7140   if (__kmp_threads[gtid] &&
7141       __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7142     __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7143                  __kmp_threads[gtid]);
7144     __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7145                  "team->t.t_nproc=%d\n",
7146                  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7147                  team->t.t_nproc);
7148     __kmp_print_structure();
7149   }
7150   KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7151                    __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7152 #endif /* KMP_DEBUG */
7153 
7154   __kmp_join_barrier(gtid); /* wait for everyone */
7155 #if OMPT_SUPPORT
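  // Once the join barrier completes, report barrier end (and, when the thread
  // is not the team master, implicit task end) to an OMPT tool if one is
  // active.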
7156   if (ompt_enabled.enabled &&
7157       this_thr->th.ompt_thread_info.state == omp_state_wait_barrier_implicit) {
7158     int ds_tid = this_thr->th.th_info.ds.ds_tid;
7159     ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7160     this_thr->th.ompt_thread_info.state = omp_state_overhead;
7161 #if OMPT_OPTIONAL
7162     void *codeptr = NULL;
7163     if (KMP_MASTER_TID(ds_tid) &&
7164         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7165          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7166       codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7167 
7168     if (ompt_enabled.ompt_callback_sync_region_wait) {
7169       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7170           ompt_sync_region_barrier, ompt_scope_end, NULL, task_data, codeptr);
7171     }
7172     if (ompt_enabled.ompt_callback_sync_region) {
7173       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7174           ompt_sync_region_barrier, ompt_scope_end, NULL, task_data, codeptr);
7175     }
7176 #endif
7177     if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7178       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7179           ompt_scope_end, NULL, task_data, 0, ds_tid);
7180     }
7181   }
7182 #endif
7183 
7184   KMP_MB(); /* Flush all pending memory write invalidates.  */
7185   KMP_ASSERT(this_thr->th.th_team == team);
7186 }
7187 
7188 /* ------------------------------------------------------------------------ */
7189 
7190 #ifdef USE_LOAD_BALANCE
7191 
// Return the number of worker threads actively spinning in the hot team, if
// we are at the outermost level of parallelism.  Otherwise, return 0.
7194 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7195   int i;
7196   int retval;
7197   kmp_team_t *hot_team;
7198 
7199   if (root->r.r_active) {
7200     return 0;
7201   }
7202   hot_team = root->r.r_hot_team;
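  // With an infinite blocktime, workers never sleep, so every worker in the
  // hot team counts as active.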
7203   if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7204     return hot_team->t.t_nproc - 1; // Don't count master thread
7205   }
7206 
7207   // Skip the master thread - it is accounted for elsewhere.
7208   retval = 0;
7209   for (i = 1; i < hot_team->t.t_nproc; i++) {
7210     if (hot_team->t.t_threads[i]->th.th_active) {
7211       retval++;
7212     }
7213   }
7214   return retval;
7215 }
7216 
7217 // Perform an automatic adjustment to the number of
7218 // threads used by the next parallel region.
7219 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7220   int retval;
7221   int pool_active;
7222   int hot_team_active;
7223   int team_curr_active;
7224   int system_active;
7225 
7226   KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7227                 set_nproc));
7228   KMP_DEBUG_ASSERT(root);
7229   KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7230                        ->th.th_current_task->td_icvs.dynamic == TRUE);
7231   KMP_DEBUG_ASSERT(set_nproc > 1);
7232 
7233   if (set_nproc == 1) {
7234     KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7235     return 1;
7236   }
7237 
7238   // Threads that are active in the thread pool, active in the hot team for this
7239   // particular root (if we are at the outer par level), and the currently
7240   // executing thread (to become the master) are available to add to the new
7241   // team, but are currently contributing to the system load, and must be
7242   // accounted for.
7243   pool_active = TCR_4(__kmp_thread_pool_active_nth);
7244   hot_team_active = __kmp_active_hot_team_nproc(root);
7245   team_curr_active = pool_active + hot_team_active + 1;
7246 
7247   // Check the system load.
7248   system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7249   KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7250                 "hot team active = %d\n",
7251                 system_active, pool_active, hot_team_active));
7252 
7253   if (system_active < 0) {
7254     // There was an error reading the necessary info from /proc, so use the
7255     // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7256     // = dynamic_thread_limit, we shouldn't wind up getting back here.
7257     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7258     KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7259 
7260     // Make this call behave like the thread limit algorithm.
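    // Available procs minus all registered threads, plus the threads reusable
    // from this root (just the master if the root is active, otherwise the
    // whole hot team).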
7261     retval = __kmp_avail_proc - __kmp_nth +
7262              (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7263     if (retval > set_nproc) {
7264       retval = set_nproc;
7265     }
7266     if (retval < KMP_MIN_NTH) {
7267       retval = KMP_MIN_NTH;
7268     }
7269 
7270     KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7271                   retval));
7272     return retval;
7273   }
7274 
  // There is a slight delay in the load balance algorithm in detecting new
  // running procs. The real system load at this instant should be at least as
  // large as the number of active OMP threads available to add to the team.
7278   if (system_active < team_curr_active) {
7279     system_active = team_curr_active;
7280   }
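  // Processors not consumed by other system load, plus the threads this team
  // already contributes (they were counted in system_active above).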
7281   retval = __kmp_avail_proc - system_active + team_curr_active;
7282   if (retval > set_nproc) {
7283     retval = set_nproc;
7284   }
7285   if (retval < KMP_MIN_NTH) {
7286     retval = KMP_MIN_NTH;
7287   }
7288 
7289   KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7290   return retval;
7291 } // __kmp_load_balance_nproc()
7292 
7293 #endif /* USE_LOAD_BALANCE */
7294 
7295 /* ------------------------------------------------------------------------ */
7296 
7297 /* NOTE: this is called with the __kmp_init_lock held */
7298 void __kmp_cleanup(void) {
7299   int f;
7300 
7301   KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7302 
7303   if (TCR_4(__kmp_init_parallel)) {
7304 #if KMP_HANDLE_SIGNALS
7305     __kmp_remove_signals();
7306 #endif
7307     TCW_4(__kmp_init_parallel, FALSE);
7308   }
7309 
7310   if (TCR_4(__kmp_init_middle)) {
7311 #if KMP_AFFINITY_SUPPORTED
7312     __kmp_affinity_uninitialize();
7313 #endif /* KMP_AFFINITY_SUPPORTED */
7314     __kmp_cleanup_hierarchy();
7315     TCW_4(__kmp_init_middle, FALSE);
7316   }
7317 
7318   KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7319 
7320   if (__kmp_init_serial) {
7321     __kmp_runtime_destroy();
7322     __kmp_init_serial = FALSE;
7323   }
7324 
7325   __kmp_cleanup_threadprivate_caches();
7326 
7327   for (f = 0; f < __kmp_threads_capacity; f++) {
7328     if (__kmp_root[f] != NULL) {
7329       __kmp_free(__kmp_root[f]);
7330       __kmp_root[f] = NULL;
7331     }
7332   }
7333   __kmp_free(__kmp_threads);
  // __kmp_threads and __kmp_root were allocated as a single block, so there
  // is no need to free __kmp_root separately.
7336   __kmp_threads = NULL;
7337   __kmp_root = NULL;
7338   __kmp_threads_capacity = 0;
7339 
7340 #if KMP_USE_DYNAMIC_LOCK
7341   __kmp_cleanup_indirect_user_locks();
7342 #else
7343   __kmp_cleanup_user_locks();
7344 #endif
7345 
7346 #if KMP_AFFINITY_SUPPORTED
7347   KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7348   __kmp_cpuinfo_file = NULL;
7349 #endif /* KMP_AFFINITY_SUPPORTED */
7350 
7351 #if KMP_USE_ADAPTIVE_LOCKS
7352 #if KMP_DEBUG_ADAPTIVE_LOCKS
7353   __kmp_print_speculative_stats();
7354 #endif
7355 #endif
7356   KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7357   __kmp_nested_nth.nth = NULL;
7358   __kmp_nested_nth.size = 0;
7359   __kmp_nested_nth.used = 0;
7360   KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7361   __kmp_nested_proc_bind.bind_types = NULL;
7362   __kmp_nested_proc_bind.size = 0;
7363   __kmp_nested_proc_bind.used = 0;
7364 
7365   __kmp_i18n_catclose();
7366 
7367 #if KMP_STATS_ENABLED
7368   __kmp_stats_fini();
7369 #endif
7370 
7371   KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7372 }
7373 
7374 /* ------------------------------------------------------------------------ */
7375 
7376 int __kmp_ignore_mppbeg(void) {
7377   char *env;
7378 
7379   if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7380     if (__kmp_str_match_false(env))
7381       return FALSE;
7382   }
  // By default __kmpc_begin() is a no-op.
7384   return TRUE;
7385 }
7386 
7387 int __kmp_ignore_mppend(void) {
7388   char *env;
7389 
7390   if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7391     if (__kmp_str_match_false(env))
7392       return FALSE;
7393   }
  // By default __kmpc_end() is a no-op.
7395   return TRUE;
7396 }
7397 
7398 void __kmp_internal_begin(void) {
7399   int gtid;
7400   kmp_root_t *root;
7401 
7402   /* this is a very important step as it will register new sibling threads
7403      and assign these new uber threads a new gtid */
7404   gtid = __kmp_entry_gtid();
7405   root = __kmp_threads[gtid]->th.th_root;
7406   KMP_ASSERT(KMP_UBER_GTID(gtid));
7407 
7408   if (root->r.r_begin)
7409     return;
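  // Re-check r_begin under the lock (double-checked locking) so only one
  // thread performs the begin transition.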
7410   __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7411   if (root->r.r_begin) {
7412     __kmp_release_lock(&root->r.r_begin_lock, gtid);
7413     return;
7414   }
7415 
7416   root->r.r_begin = TRUE;
7417 
7418   __kmp_release_lock(&root->r.r_begin_lock, gtid);
7419 }
7420 
7421 /* ------------------------------------------------------------------------ */
7422 
7423 void __kmp_user_set_library(enum library_type arg) {
7424   int gtid;
7425   kmp_root_t *root;
7426   kmp_info_t *thread;
7427 
7428   /* first, make sure we are initialized so we can get our gtid */
7429 
7430   gtid = __kmp_entry_gtid();
7431   thread = __kmp_threads[gtid];
7432 
7433   root = thread->th.th_root;
7434 
7435   KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7436                 library_serial));
7437   if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7438                                   thread */
7439     KMP_WARNING(SetLibraryIncorrectCall);
7440     return;
7441   }
7442 
7443   switch (arg) {
7444   case library_serial:
7445     thread->th.th_set_nproc = 0;
7446     set__nproc(thread, 1);
7447     break;
7448   case library_turnaround:
7449     thread->th.th_set_nproc = 0;
7450     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7451                                            : __kmp_dflt_team_nth_ub);
7452     break;
7453   case library_throughput:
7454     thread->th.th_set_nproc = 0;
7455     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7456                                            : __kmp_dflt_team_nth_ub);
7457     break;
7458   default:
7459     KMP_FATAL(UnknownLibraryType, arg);
7460   }
7461 
7462   __kmp_aux_set_library(arg);
7463 }
7464 
7465 void __kmp_aux_set_stacksize(size_t arg) {
7466   if (!__kmp_init_serial)
7467     __kmp_serial_initialize();
7468 
7469 #if KMP_OS_DARWIN
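  // Round the requested size up to a 4 KB (0x1000) page boundary, skipping the
  // round-up if it would overflow (wrap to zero).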
7470   if (arg & (0x1000 - 1)) {
7471     arg &= ~(0x1000 - 1);
7472     if (arg + 0x1000) /* check for overflow if we round up */
7473       arg += 0x1000;
7474   }
7475 #endif
7476   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7477 
7478   /* only change the default stacksize before the first parallel region */
7479   if (!TCR_4(__kmp_init_parallel)) {
7480     size_t value = arg; /* argument is in bytes */
7481 
7482     if (value < __kmp_sys_min_stksize)
7483       value = __kmp_sys_min_stksize;
7484     else if (value > KMP_MAX_STKSIZE)
7485       value = KMP_MAX_STKSIZE;
7486 
7487     __kmp_stksize = value;
7488 
7489     __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
7490   }
7491 
7492   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7493 }
7494 
7495 /* set the behaviour of the runtime library */
7496 /* TODO this can cause some odd behaviour with sibling parallelism... */
7497 void __kmp_aux_set_library(enum library_type arg) {
7498   __kmp_library = arg;
7499 
7500   switch (__kmp_library) {
7501   case library_serial: {
7502     KMP_INFORM(LibraryIsSerial);
7503     (void)__kmp_change_library(TRUE);
7504   } break;
7505   case library_turnaround:
7506     (void)__kmp_change_library(TRUE);
7507     break;
7508   case library_throughput:
7509     (void)__kmp_change_library(FALSE);
7510     break;
7511   default:
7512     KMP_FATAL(UnknownLibraryType, arg);
7513   }
7514 }
7515 
7516 /* ------------------------------------------------------------------------ */
7517 
7518 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
7519   int blocktime = arg; /* argument is in milliseconds */
7520 #if KMP_USE_MONITOR
7521   int bt_intervals;
7522 #endif
7523   int bt_set;
7524 
7525   __kmp_save_internal_controls(thread);
7526 
7527   /* Normalize and set blocktime for the teams */
7528   if (blocktime < KMP_MIN_BLOCKTIME)
7529     blocktime = KMP_MIN_BLOCKTIME;
7530   else if (blocktime > KMP_MAX_BLOCKTIME)
7531     blocktime = KMP_MAX_BLOCKTIME;
7532 
7533   set__blocktime_team(thread->th.th_team, tid, blocktime);
7534   set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
7535 
7536 #if KMP_USE_MONITOR
7537   /* Calculate and set blocktime intervals for the teams */
7538   bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
7539 
7540   set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
7541   set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
7542 #endif
7543 
  /* Record that the blocktime has been explicitly set */
7545   bt_set = TRUE;
7546 
7547   set__bt_set_team(thread->th.th_team, tid, bt_set);
7548   set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
7549 #if KMP_USE_MONITOR
7550   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
7551                 "bt_intervals=%d, monitor_updates=%d\n",
7552                 __kmp_gtid_from_tid(tid, thread->th.th_team),
7553                 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
7554                 __kmp_monitor_wakeups));
7555 #else
7556   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
7557                 __kmp_gtid_from_tid(tid, thread->th.th_team),
7558                 thread->th.th_team->t.t_id, tid, blocktime));
7559 #endif
7560 }
7561 
7562 void __kmp_aux_set_defaults(char const *str, int len) {
7563   if (!__kmp_init_serial) {
7564     __kmp_serial_initialize();
7565   }
7566   __kmp_env_initialize(str);
7567 
7568   if (__kmp_settings
7569 #if OMP_40_ENABLED
7570       || __kmp_display_env || __kmp_display_env_verbose
7571 #endif // OMP_40_ENABLED
7572       ) {
7573     __kmp_env_print();
7574   }
7575 } // __kmp_aux_set_defaults
7576 
7577 /* ------------------------------------------------------------------------ */
7578 /* internal fast reduction routines */
7579 
7580 PACKED_REDUCTION_METHOD_T
7581 __kmp_determine_reduction_method(
7582     ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
7583     void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
7584     kmp_critical_name *lck) {
7585 
  // Default reduction method: critical construct (lck != NULL, like in current
  // PAROPT).
  // If (reduce_data != NULL && reduce_func != NULL): the tree-reduction method
  // can be selected by the RTL.
  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
  // can be selected by the RTL.
  // Finally, it is up to the OpenMP RTL to decide which method to select among
  // those generated by PAROPT.
7594 
7595   PACKED_REDUCTION_METHOD_T retval;
7596 
7597   int team_size;
7598 
7599   KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
7600   KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
7601 
7602 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
7603   ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
7604 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
7605 
7606   retval = critical_reduce_block;
7607 
  // An alternative way of getting the team size (with one extra dynamic
  // dereference) is slower.
7609   team_size = __kmp_get_team_num_threads(global_tid);
7610   if (team_size == 1) {
7611 
7612     retval = empty_reduce_block;
7613 
7614   } else {
7615 
7616     int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
7617     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
7618 
7619 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64
7620 
7621 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_WINDOWS ||       \
7622     KMP_OS_DARWIN
7623 
7624     int teamsize_cutoff = 4;
7625 
7626 #if KMP_MIC_SUPPORTED
7627     if (__kmp_mic_type != non_mic) {
7628       teamsize_cutoff = 8;
7629     }
7630 #endif
7631     if (tree_available) {
7632       if (team_size <= teamsize_cutoff) {
7633         if (atomic_available) {
7634           retval = atomic_reduce_block;
7635         }
7636       } else {
7637         retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
7638       }
7639     } else if (atomic_available) {
7640       retval = atomic_reduce_block;
7641     }
7642 #else
7643 #error "Unknown or unsupported OS"
7644 #endif // KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_WINDOWS ||
7645 // KMP_OS_DARWIN
7646 
7647 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
7648 
7649 #if KMP_OS_LINUX || KMP_OS_WINDOWS
7650 
7651     // basic tuning
7652 
7653     if (atomic_available) {
7654       if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
7655         retval = atomic_reduce_block;
7656       }
7657     } // otherwise: use critical section
7658 
7659 #elif KMP_OS_DARWIN
7660 
7661     if (atomic_available && (num_vars <= 3)) {
7662       retval = atomic_reduce_block;
7663     } else if (tree_available) {
7664       if ((reduce_size > (9 * sizeof(kmp_real64))) &&
7665           (reduce_size < (2000 * sizeof(kmp_real64)))) {
7666         retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
7667       }
7668     } // otherwise: use critical section
7669 
7670 #else
7671 #error "Unknown or unsupported OS"
7672 #endif
7673 
7674 #else
7675 #error "Unknown or unsupported architecture"
7676 #endif
7677   }
7678 
7679   // KMP_FORCE_REDUCTION
7680 
7681   // If the team is serialized (team_size == 1), ignore the forced reduction
7682   // method and stay with the unsynchronized method (empty_reduce_block)
7683   if (__kmp_force_reduction_method != reduction_method_not_defined &&
7684       team_size != 1) {
7685 
7686     PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
7687 
7688     int atomic_available, tree_available;
7689 
7690     switch ((forced_retval = __kmp_force_reduction_method)) {
7691     case critical_reduce_block:
7692       KMP_ASSERT(lck); // lck should be != 0
7693       break;
7694 
7695     case atomic_reduce_block:
7696       atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
7697       if (!atomic_available) {
7698         KMP_WARNING(RedMethodNotSupported, "atomic");
7699         forced_retval = critical_reduce_block;
7700       }
7701       break;
7702 
7703     case tree_reduce_block:
7704       tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
7705       if (!tree_available) {
7706         KMP_WARNING(RedMethodNotSupported, "tree");
7707         forced_retval = critical_reduce_block;
7708       } else {
7709 #if KMP_FAST_REDUCTION_BARRIER
7710         forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
7711 #endif
7712       }
7713       break;
7714 
7715     default:
7716       KMP_ASSERT(0); // "unsupported method specified"
7717     }
7718 
7719     retval = forced_retval;
7720   }
7721 
7722   KA_TRACE(10, ("reduction method selected=%08x\n", retval));
7723 
7724 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
7725 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
7726 
7727   return (retval);
7728 }
7729 
// This function is for testing the set/get/determine reduce method.
7731 kmp_int32 __kmp_get_reduce_method(void) {
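  // The low bits of the packed value encode the barrier type; shifting them
  // out leaves just the reduction method identifier.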
7732   return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
7733 }
7734