1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 //                     The LLVM Compiler Infrastructure
8 //
9 // This file is dual licensed under the MIT and the University of Illinois Open
10 // Source Licenses. See LICENSE.txt for details.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "kmp.h"
15 #include "kmp_affinity.h"
16 #include "kmp_atomic.h"
17 #include "kmp_environment.h"
18 #include "kmp_error.h"
19 #include "kmp_i18n.h"
20 #include "kmp_io.h"
21 #include "kmp_itt.h"
22 #include "kmp_settings.h"
23 #include "kmp_stats.h"
24 #include "kmp_str.h"
25 #include "kmp_wait_release.h"
26 #include "kmp_wrapper_getpid.h"
27 
28 #if OMPT_SUPPORT
29 #include "ompt-specific.h"
30 #endif
31 
32 /* these are temporary issues to be dealt with */
33 #define KMP_USE_PRCTL 0
34 
35 #if KMP_OS_WINDOWS
36 #include <process.h>
37 #endif
38 
39 #include "tsan_annotations.h"
40 
41 #if defined(KMP_GOMP_COMPAT)
42 char const __kmp_version_alt_comp[] =
43     KMP_VERSION_PREFIX "alternative compiler support: yes";
44 #endif /* defined(KMP_GOMP_COMPAT) */
45 
46 char const __kmp_version_omp_api[] = KMP_VERSION_PREFIX "API version: "
47 #if OMP_50_ENABLED
48                                                         "5.0 (201611)";
49 #elif OMP_45_ENABLED
50                                                         "4.5 (201511)";
51 #elif OMP_40_ENABLED
52                                                         "4.0 (201307)";
53 #else
54                                                         "3.1 (201107)";
55 #endif
56 
57 #ifdef KMP_DEBUG
58 char const __kmp_version_lock[] =
59     KMP_VERSION_PREFIX "lock type: run time selectable";
60 #endif /* KMP_DEBUG */
61 
62 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
63 
64 /* ------------------------------------------------------------------------ */
65 
66 kmp_info_t __kmp_monitor;
67 
68 /* Forward declarations */
69 
70 void __kmp_cleanup(void);
71 
72 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
73                                   int gtid);
74 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
75                                   kmp_internal_control_t *new_icvs,
76                                   ident_t *loc);
77 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
78 static void __kmp_partition_places(kmp_team_t *team,
79                                    int update_master_only = 0);
80 #endif
81 static void __kmp_do_serial_initialize(void);
82 void __kmp_fork_barrier(int gtid, int tid);
83 void __kmp_join_barrier(int gtid);
84 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
85                           kmp_internal_control_t *new_icvs, ident_t *loc);
86 
87 #ifdef USE_LOAD_BALANCE
88 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
89 #endif
90 
91 static int __kmp_expand_threads(int nNeed);
92 #if KMP_OS_WINDOWS
93 static int __kmp_unregister_root_other_thread(int gtid);
94 #endif
95 static void __kmp_unregister_library(void); // called by __kmp_internal_end()
96 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
97 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
98 
99 /* Calculate the identifier of the current thread */
100 /* fast (and somewhat portable) way to get unique identifier of executing
101    thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
102 int __kmp_get_global_thread_id() {
103   int i;
104   kmp_info_t **other_threads;
105   size_t stack_data;
106   char *stack_addr;
107   size_t stack_size;
108   char *stack_base;
109 
110   KA_TRACE(
111       1000,
112       ("*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
113        __kmp_nth, __kmp_all_nth));
114 
115   /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
116      a parallel region, made it return KMP_GTID_DNE to force serial_initialize
117      by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
118      __kmp_init_gtid for this to work. */
119 
120   if (!TCR_4(__kmp_init_gtid))
121     return KMP_GTID_DNE;
122 
123 #ifdef KMP_TDATA_GTID
124   if (TCR_4(__kmp_gtid_mode) >= 3) {
125     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
126     return __kmp_gtid;
127   }
128 #endif
129   if (TCR_4(__kmp_gtid_mode) >= 2) {
130     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
131     return __kmp_gtid_get_specific();
132   }
133   KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
134 
135   stack_addr = (char *)&stack_data;
136   other_threads = __kmp_threads;
137 
138   /* ATT: The code below is a source of potential bugs due to unsynchronized
139      access to __kmp_threads array. For example:
140      1. Current thread loads other_threads[i] to thr and checks it, it is
141         non-NULL.
142      2. Current thread is suspended by OS.
143      3. Another thread unregisters and finishes (debug versions of free()
144         may fill memory with something like 0xEF).
145      4. Current thread is resumed.
146      5. Current thread reads junk from *thr.
147      TODO: Fix it.  --ln  */
148 
149   for (i = 0; i < __kmp_threads_capacity; i++) {
150 
151     kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
152     if (!thr)
153       continue;
154 
155     stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
156     stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
157 
158     /* stack grows down -- search through all of the active threads */
159 
160     if (stack_addr <= stack_base) {
161       size_t stack_diff = stack_base - stack_addr;
162 
163       if (stack_diff <= stack_size) {
164         /* The only way we can be closer than the allocated */
165         /* stack size is if we are running on this thread. */
166         KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
167         return i;
168       }
169     }
170   }
171 
172   /* get specific to try and determine our gtid */
173   KA_TRACE(1000,
174            ("*** __kmp_get_global_thread_id: internal alg. failed to find "
175             "thread, using TLS\n"));
176   i = __kmp_gtid_get_specific();
177 
178   /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */
179 
180   /* if we havn't been assigned a gtid, then return code */
181   if (i < 0)
182     return i;
183 
184   /* dynamically updated stack window for uber threads to avoid get_specific
185      call */
186   if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
187     KMP_FATAL(StackOverflow, i);
188   }
189 
190   stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
191   if (stack_addr > stack_base) {
192     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
193     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
194             other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
195                 stack_base);
196   } else {
197     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
198             stack_base - stack_addr);
199   }
200 
201   /* Reprint stack bounds for ubermaster since they have been refined */
202   if (__kmp_storage_map) {
203     char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
204     char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
205     __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
206                                  other_threads[i]->th.th_info.ds.ds_stacksize,
207                                  "th_%d stack (refinement)", i);
208   }
209   return i;
210 }
211 
212 int __kmp_get_global_thread_id_reg() {
213   int gtid;
214 
215   if (!__kmp_init_serial) {
216     gtid = KMP_GTID_DNE;
217   } else
218 #ifdef KMP_TDATA_GTID
219       if (TCR_4(__kmp_gtid_mode) >= 3) {
220     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
221     gtid = __kmp_gtid;
222   } else
223 #endif
224       if (TCR_4(__kmp_gtid_mode) >= 2) {
225     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
226     gtid = __kmp_gtid_get_specific();
227   } else {
228     KA_TRACE(1000,
229              ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
230     gtid = __kmp_get_global_thread_id();
231   }
232 
233   /* we must be a new uber master sibling thread */
234   if (gtid == KMP_GTID_DNE) {
235     KA_TRACE(10,
236              ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
237               "Registering a new gtid.\n"));
238     __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
239     if (!__kmp_init_serial) {
240       __kmp_do_serial_initialize();
241       gtid = __kmp_gtid_get_specific();
242     } else {
243       gtid = __kmp_register_root(FALSE);
244     }
245     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
246     /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
247   }
248 
249   KMP_DEBUG_ASSERT(gtid >= 0);
250 
251   return gtid;
252 }
253 
254 /* caller must hold forkjoin_lock */
255 void __kmp_check_stack_overlap(kmp_info_t *th) {
256   int f;
257   char *stack_beg = NULL;
258   char *stack_end = NULL;
259   int gtid;
260 
261   KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
262   if (__kmp_storage_map) {
263     stack_end = (char *)th->th.th_info.ds.ds_stackbase;
264     stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
265 
266     gtid = __kmp_gtid_from_thread(th);
267 
268     if (gtid == KMP_GTID_MONITOR) {
269       __kmp_print_storage_map_gtid(
270           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
271           "th_%s stack (%s)", "mon",
272           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
273     } else {
274       __kmp_print_storage_map_gtid(
275           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
276           "th_%d stack (%s)", gtid,
277           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
278     }
279   }
280 
281   /* No point in checking ubermaster threads since they use refinement and
282    * cannot overlap */
283   gtid = __kmp_gtid_from_thread(th);
284   if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
285     KA_TRACE(10,
286              ("__kmp_check_stack_overlap: performing extensive checking\n"));
287     if (stack_beg == NULL) {
288       stack_end = (char *)th->th.th_info.ds.ds_stackbase;
289       stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
290     }
291 
292     for (f = 0; f < __kmp_threads_capacity; f++) {
293       kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
294 
295       if (f_th && f_th != th) {
296         char *other_stack_end =
297             (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
298         char *other_stack_beg =
299             other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
300         if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
301             (stack_end > other_stack_beg && stack_end < other_stack_end)) {
302 
303           /* Print the other stack values before the abort */
304           if (__kmp_storage_map)
305             __kmp_print_storage_map_gtid(
306                 -1, other_stack_beg, other_stack_end,
307                 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
308                 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
309 
310           __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
311                       __kmp_msg_null);
312         }
313       }
314     }
315   }
316   KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
317 }
318 
319 /* ------------------------------------------------------------------------ */
320 
321 void __kmp_infinite_loop(void) {
322   static int done = FALSE;
323 
324   while (!done) {
325     KMP_YIELD(1);
326   }
327 }
328 
329 #define MAX_MESSAGE 512
330 
331 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
332                                   char const *format, ...) {
333   char buffer[MAX_MESSAGE];
334   va_list ap;
335 
336   va_start(ap, format);
337   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
338                p2, (unsigned long)size, format);
339   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
340   __kmp_vprintf(kmp_err, buffer, ap);
341 #if KMP_PRINT_DATA_PLACEMENT
342   int node;
343   if (gtid >= 0) {
344     if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
345       if (__kmp_storage_map_verbose) {
346         node = __kmp_get_host_node(p1);
347         if (node < 0) /* doesn't work, so don't try this next time */
348           __kmp_storage_map_verbose = FALSE;
349         else {
350           char *last;
351           int lastNode;
352           int localProc = __kmp_get_cpu_from_gtid(gtid);
353 
354           const int page_size = KMP_GET_PAGE_SIZE();
355 
356           p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
357           p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
358           if (localProc >= 0)
359             __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
360                                  localProc >> 1);
361           else
362             __kmp_printf_no_lock("  GTID %d\n", gtid);
363 #if KMP_USE_PRCTL
364           /* The more elaborate format is disabled for now because of the prctl
365            * hanging bug. */
366           do {
367             last = p1;
368             lastNode = node;
369             /* This loop collates adjacent pages with the same host node. */
370             do {
371               (char *)p1 += page_size;
372             } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
373             __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
374                                  lastNode);
375           } while (p1 <= p2);
376 #else
377           __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
378                                (char *)p1 + (page_size - 1),
379                                __kmp_get_host_node(p1));
380           if (p1 < p2) {
381             __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
382                                  (char *)p2 + (page_size - 1),
383                                  __kmp_get_host_node(p2));
384           }
385 #endif
386         }
387       }
388     } else
389       __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
390   }
391 #endif /* KMP_PRINT_DATA_PLACEMENT */
392   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
393 }
394 
395 void __kmp_warn(char const *format, ...) {
396   char buffer[MAX_MESSAGE];
397   va_list ap;
398 
399   if (__kmp_generate_warnings == kmp_warnings_off) {
400     return;
401   }
402 
403   va_start(ap, format);
404 
405   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
406   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
407   __kmp_vprintf(kmp_err, buffer, ap);
408   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
409 
410   va_end(ap);
411 }
412 
413 void __kmp_abort_process() {
414   // Later threads may stall here, but that's ok because abort() will kill them.
415   __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
416 
417   if (__kmp_debug_buf) {
418     __kmp_dump_debug_buffer();
419   }
420 
421   if (KMP_OS_WINDOWS) {
422     // Let other threads know of abnormal termination and prevent deadlock
423     // if abort happened during library initialization or shutdown
424     __kmp_global.g.g_abort = SIGABRT;
425 
426     /* On Windows* OS by default abort() causes pop-up error box, which stalls
427        nightly testing. Unfortunately, we cannot reliably suppress pop-up error
428        boxes. _set_abort_behavior() works well, but this function is not
429        available in VS7 (this is not problem for DLL, but it is a problem for
430        static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not
431        help, at least in some versions of MS C RTL.
432 
433        It seems following sequence is the only way to simulate abort() and
434        avoid pop-up error box. */
435     raise(SIGABRT);
436     _exit(3); // Just in case, if signal ignored, exit anyway.
437   } else {
438     abort();
439   }
440 
441   __kmp_infinite_loop();
442   __kmp_release_bootstrap_lock(&__kmp_exit_lock);
443 
444 } // __kmp_abort_process
445 
446 void __kmp_abort_thread(void) {
447   // TODO: Eliminate g_abort global variable and this function.
448   // In case of abort just call abort(), it will kill all the threads.
449   __kmp_infinite_loop();
450 } // __kmp_abort_thread
451 
452 /* Print out the storage map for the major kmp_info_t thread data structures
453    that are allocated together. */
454 
455 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
456   __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
457                                gtid);
458 
459   __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
460                                sizeof(kmp_desc_t), "th_%d.th_info", gtid);
461 
462   __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
463                                sizeof(kmp_local_t), "th_%d.th_local", gtid);
464 
465   __kmp_print_storage_map_gtid(
466       gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
467       sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
468 
469   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
470                                &thr->th.th_bar[bs_plain_barrier + 1],
471                                sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
472                                gtid);
473 
474   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
475                                &thr->th.th_bar[bs_forkjoin_barrier + 1],
476                                sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
477                                gtid);
478 
479 #if KMP_FAST_REDUCTION_BARRIER
480   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
481                                &thr->th.th_bar[bs_reduction_barrier + 1],
482                                sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
483                                gtid);
484 #endif // KMP_FAST_REDUCTION_BARRIER
485 }
486 
487 /* Print out the storage map for the major kmp_team_t team data structures
488    that are allocated together. */
489 
490 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
491                                          int team_id, int num_thr) {
492   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
493   __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
494                                header, team_id);
495 
496   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
497                                &team->t.t_bar[bs_last_barrier],
498                                sizeof(kmp_balign_team_t) * bs_last_barrier,
499                                "%s_%d.t_bar", header, team_id);
500 
501   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
502                                &team->t.t_bar[bs_plain_barrier + 1],
503                                sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
504                                header, team_id);
505 
506   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
507                                &team->t.t_bar[bs_forkjoin_barrier + 1],
508                                sizeof(kmp_balign_team_t),
509                                "%s_%d.t_bar[forkjoin]", header, team_id);
510 
511 #if KMP_FAST_REDUCTION_BARRIER
512   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
513                                &team->t.t_bar[bs_reduction_barrier + 1],
514                                sizeof(kmp_balign_team_t),
515                                "%s_%d.t_bar[reduction]", header, team_id);
516 #endif // KMP_FAST_REDUCTION_BARRIER
517 
518   __kmp_print_storage_map_gtid(
519       -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
520       sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
521 
522   __kmp_print_storage_map_gtid(
523       -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
524       sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
525 
526   __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
527                                &team->t.t_disp_buffer[num_disp_buff],
528                                sizeof(dispatch_shared_info_t) * num_disp_buff,
529                                "%s_%d.t_disp_buffer", header, team_id);
530 
531   __kmp_print_storage_map_gtid(-1, &team->t.t_taskq, &team->t.t_copypriv_data,
532                                sizeof(kmp_taskq_t), "%s_%d.t_taskq", header,
533                                team_id);
534 }
535 
536 static void __kmp_init_allocator() {}
537 static void __kmp_fini_allocator() {}
538 
539 /* ------------------------------------------------------------------------ */
540 
541 #ifdef KMP_DYNAMIC_LIB
542 #if KMP_OS_WINDOWS
543 
544 static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
545   // TODO: Change to __kmp_break_bootstrap_lock().
546   __kmp_init_bootstrap_lock(lck); // make the lock released
547 }
548 
549 static void __kmp_reset_locks_on_process_detach(int gtid_req) {
550   int i;
551   int thread_count;
552 
553   // PROCESS_DETACH is expected to be called by a thread that executes
554   // ProcessExit() or FreeLibrary(). OS terminates other threads (except the one
555   // calling ProcessExit or FreeLibrary). So, it might be safe to access the
556   // __kmp_threads[] without taking the forkjoin_lock. However, in fact, some
557   // threads can be still alive here, although being about to be terminated. The
558   // threads in the array with ds_thread==0 are most suspicious. Actually, it
559   // can be not safe to access the __kmp_threads[].
560 
561   // TODO: does it make sense to check __kmp_roots[] ?
562 
563   // Let's check that there are no other alive threads registered with the OMP
564   // lib.
565   while (1) {
566     thread_count = 0;
567     for (i = 0; i < __kmp_threads_capacity; ++i) {
568       if (!__kmp_threads)
569         continue;
570       kmp_info_t *th = __kmp_threads[i];
571       if (th == NULL)
572         continue;
573       int gtid = th->th.th_info.ds.ds_gtid;
574       if (gtid == gtid_req)
575         continue;
576       if (gtid < 0)
577         continue;
578       DWORD exit_val;
579       int alive = __kmp_is_thread_alive(th, &exit_val);
580       if (alive) {
581         ++thread_count;
582       }
583     }
584     if (thread_count == 0)
585       break; // success
586   }
587 
588   // Assume that I'm alone. Now it might be safe to check and reset locks.
589   // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
590   __kmp_reset_lock(&__kmp_forkjoin_lock);
591 #ifdef KMP_DEBUG
592   __kmp_reset_lock(&__kmp_stdio_lock);
593 #endif // KMP_DEBUG
594 }
595 
596 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
597   //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
598 
599   switch (fdwReason) {
600 
601   case DLL_PROCESS_ATTACH:
602     KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
603 
604     return TRUE;
605 
606   case DLL_PROCESS_DETACH:
607     KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
608 
609     if (lpReserved != NULL) {
610       // lpReserved is used for telling the difference:
611       //   lpReserved == NULL when FreeLibrary() was called,
612       //   lpReserved != NULL when the process terminates.
613       // When FreeLibrary() is called, worker threads remain alive. So they will
614       // release the forkjoin lock by themselves. When the process terminates,
615       // worker threads disappear triggering the problem of unreleased forkjoin
616       // lock as described below.
617 
618       // A worker thread can take the forkjoin lock. The problem comes up if
619       // that worker thread becomes dead before it releases the forkjoin lock.
620       // The forkjoin lock remains taken, while the thread executing
621       // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try
622       // to take the forkjoin lock and will always fail, so that the application
623       // will never finish [normally]. This scenario is possible if
624       // __kmpc_end() has not been executed. It looks like it's not a corner
625       // case, but common cases:
626       // - the main function was compiled by an alternative compiler;
627       // - the main function was compiled by icl but without /Qopenmp
628       //   (application with plugins);
629       // - application terminates by calling C exit(), Fortran CALL EXIT() or
630       //   Fortran STOP.
631       // - alive foreign thread prevented __kmpc_end from doing cleanup.
632       //
633       // This is a hack to work around the problem.
634       // TODO: !!! figure out something better.
635       __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
636     }
637 
638     __kmp_internal_end_library(__kmp_gtid_get_specific());
639 
640     return TRUE;
641 
642   case DLL_THREAD_ATTACH:
643     KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
644 
645     /* if we want to register new siblings all the time here call
646      * __kmp_get_gtid(); */
647     return TRUE;
648 
649   case DLL_THREAD_DETACH:
650     KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
651 
652     __kmp_internal_end_thread(__kmp_gtid_get_specific());
653     return TRUE;
654   }
655 
656   return TRUE;
657 }
658 
659 #endif /* KMP_OS_WINDOWS */
660 #endif /* KMP_DYNAMIC_LIB */
661 
662 /* Change the library type to "status" and return the old type */
663 /* called from within initialization routines where __kmp_initz_lock is held */
664 int __kmp_change_library(int status) {
665   int old_status;
666 
667   old_status = __kmp_yield_init &
668                1; // check whether KMP_LIBRARY=throughput (even init count)
669 
670   if (status) {
671     __kmp_yield_init |= 1; // throughput => turnaround (odd init count)
672   } else {
673     __kmp_yield_init &= ~1; // turnaround => throughput (even init count)
674   }
675 
676   return old_status; // return previous setting of whether
677   // KMP_LIBRARY=throughput
678 }
679 
680 /* __kmp_parallel_deo -- Wait until it's our turn. */
681 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
682   int gtid = *gtid_ref;
683 #ifdef BUILD_PARALLEL_ORDERED
684   kmp_team_t *team = __kmp_team_from_gtid(gtid);
685 #endif /* BUILD_PARALLEL_ORDERED */
686 
687   if (__kmp_env_consistency_check) {
688     if (__kmp_threads[gtid]->th.th_root->r.r_active)
689 #if KMP_USE_DYNAMIC_LOCK
690       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
691 #else
692       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
693 #endif
694   }
695 #ifdef BUILD_PARALLEL_ORDERED
696   if (!team->t.t_serialized) {
697     KMP_MB();
698     KMP_WAIT_YIELD(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid),
699                    KMP_EQ, NULL);
700     KMP_MB();
701   }
702 #endif /* BUILD_PARALLEL_ORDERED */
703 }
704 
705 /* __kmp_parallel_dxo -- Signal the next task. */
706 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
707   int gtid = *gtid_ref;
708 #ifdef BUILD_PARALLEL_ORDERED
709   int tid = __kmp_tid_from_gtid(gtid);
710   kmp_team_t *team = __kmp_team_from_gtid(gtid);
711 #endif /* BUILD_PARALLEL_ORDERED */
712 
713   if (__kmp_env_consistency_check) {
714     if (__kmp_threads[gtid]->th.th_root->r.r_active)
715       __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
716   }
717 #ifdef BUILD_PARALLEL_ORDERED
718   if (!team->t.t_serialized) {
719     KMP_MB(); /* Flush all pending memory write invalidates.  */
720 
721     /* use the tid of the next thread in this team */
722     /* TODO replace with general release procedure */
723     team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
724 
725     KMP_MB(); /* Flush all pending memory write invalidates.  */
726   }
727 #endif /* BUILD_PARALLEL_ORDERED */
728 }
729 
730 /* ------------------------------------------------------------------------ */
731 /* The BARRIER for a SINGLE process section is always explicit   */
732 
733 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
734   int status;
735   kmp_info_t *th;
736   kmp_team_t *team;
737 
738   if (!TCR_4(__kmp_init_parallel))
739     __kmp_parallel_initialize();
740 
741   th = __kmp_threads[gtid];
742   team = th->th.th_team;
743   status = 0;
744 
745   th->th.th_ident = id_ref;
746 
747   if (team->t.t_serialized) {
748     status = 1;
749   } else {
750     kmp_int32 old_this = th->th.th_local.this_construct;
751 
752     ++th->th.th_local.this_construct;
753     /* try to set team count to thread count--success means thread got the
754        single block */
755     /* TODO: Should this be acquire or release? */
756     if (team->t.t_construct == old_this) {
757       status = KMP_COMPARE_AND_STORE_ACQ32(&team->t.t_construct, old_this,
758                                            th->th.th_local.this_construct);
759     }
760 #if USE_ITT_BUILD
761     if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
762         KMP_MASTER_GTID(gtid) &&
763 #if OMP_40_ENABLED
764         th->th.th_teams_microtask == NULL &&
765 #endif
766         team->t.t_active_level ==
767             1) { // Only report metadata by master of active team at level 1
768       __kmp_itt_metadata_single(id_ref);
769     }
770 #endif /* USE_ITT_BUILD */
771   }
772 
773   if (__kmp_env_consistency_check) {
774     if (status && push_ws) {
775       __kmp_push_workshare(gtid, ct_psingle, id_ref);
776     } else {
777       __kmp_check_workshare(gtid, ct_psingle, id_ref);
778     }
779   }
780 #if USE_ITT_BUILD
781   if (status) {
782     __kmp_itt_single_start(gtid);
783   }
784 #endif /* USE_ITT_BUILD */
785   return status;
786 }
787 
788 void __kmp_exit_single(int gtid) {
789 #if USE_ITT_BUILD
790   __kmp_itt_single_end(gtid);
791 #endif /* USE_ITT_BUILD */
792   if (__kmp_env_consistency_check)
793     __kmp_pop_workshare(gtid, ct_psingle, NULL);
794 }
795 
796 /* determine if we can go parallel or must use a serialized parallel region and
797  * how many threads we can use
798  * set_nproc is the number of threads requested for the team
799  * returns 0 if we should serialize or only use one thread,
800  * otherwise the number of threads to use
801  * The forkjoin lock is held by the caller. */
802 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
803                                  int master_tid, int set_nthreads
804 #if OMP_40_ENABLED
805                                  ,
806                                  int enter_teams
807 #endif /* OMP_40_ENABLED */
808                                  ) {
809   int capacity;
810   int new_nthreads;
811   KMP_DEBUG_ASSERT(__kmp_init_serial);
812   KMP_DEBUG_ASSERT(root && parent_team);
813 
814   // If dyn-var is set, dynamically adjust the number of desired threads,
815   // according to the method specified by dynamic_mode.
816   new_nthreads = set_nthreads;
817   if (!get__dynamic_2(parent_team, master_tid)) {
818     ;
819   }
820 #ifdef USE_LOAD_BALANCE
821   else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
822     new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
823     if (new_nthreads == 1) {
824       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
825                     "reservation to 1 thread\n",
826                     master_tid));
827       return 1;
828     }
829     if (new_nthreads < set_nthreads) {
830       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
831                     "reservation to %d threads\n",
832                     master_tid, new_nthreads));
833     }
834   }
835 #endif /* USE_LOAD_BALANCE */
836   else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
837     new_nthreads = __kmp_avail_proc - __kmp_nth +
838                    (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
839     if (new_nthreads <= 1) {
840       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
841                     "reservation to 1 thread\n",
842                     master_tid));
843       return 1;
844     }
845     if (new_nthreads < set_nthreads) {
846       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
847                     "reservation to %d threads\n",
848                     master_tid, new_nthreads));
849     } else {
850       new_nthreads = set_nthreads;
851     }
852   } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
853     if (set_nthreads > 2) {
854       new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
855       new_nthreads = (new_nthreads % set_nthreads) + 1;
856       if (new_nthreads == 1) {
857         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
858                       "reservation to 1 thread\n",
859                       master_tid));
860         return 1;
861       }
862       if (new_nthreads < set_nthreads) {
863         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
864                       "reservation to %d threads\n",
865                       master_tid, new_nthreads));
866       }
867     }
868   } else {
869     KMP_ASSERT(0);
870   }
871 
872   // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
873   if (__kmp_nth + new_nthreads -
874           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
875       __kmp_max_nth) {
876     int tl_nthreads = __kmp_max_nth - __kmp_nth +
877                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
878     if (tl_nthreads <= 0) {
879       tl_nthreads = 1;
880     }
881 
882     // If dyn-var is false, emit a 1-time warning.
883     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
884       __kmp_reserve_warn = 1;
885       __kmp_msg(kmp_ms_warning,
886                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
887                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
888     }
889     if (tl_nthreads == 1) {
890       KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
891                     "reduced reservation to 1 thread\n",
892                     master_tid));
893       return 1;
894     }
895     KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
896                   "reservation to %d threads\n",
897                   master_tid, tl_nthreads));
898     new_nthreads = tl_nthreads;
899   }
900 
901   // Respect OMP_THREAD_LIMIT
902   if (root->r.r_cg_nthreads + new_nthreads -
903           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
904       __kmp_cg_max_nth) {
905     int tl_nthreads = __kmp_cg_max_nth - root->r.r_cg_nthreads +
906                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
907     if (tl_nthreads <= 0) {
908       tl_nthreads = 1;
909     }
910 
911     // If dyn-var is false, emit a 1-time warning.
912     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
913       __kmp_reserve_warn = 1;
914       __kmp_msg(kmp_ms_warning,
915                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
916                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
917     }
918     if (tl_nthreads == 1) {
919       KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
920                     "reduced reservation to 1 thread\n",
921                     master_tid));
922       return 1;
923     }
924     KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
925                   "reservation to %d threads\n",
926                   master_tid, tl_nthreads));
927     new_nthreads = tl_nthreads;
928   }
929 
930   // Check if the threads array is large enough, or needs expanding.
931   // See comment in __kmp_register_root() about the adjustment if
932   // __kmp_threads[0] == NULL.
933   capacity = __kmp_threads_capacity;
934   if (TCR_PTR(__kmp_threads[0]) == NULL) {
935     --capacity;
936   }
937   if (__kmp_nth + new_nthreads -
938           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
939       capacity) {
940     // Expand the threads array.
941     int slotsRequired = __kmp_nth + new_nthreads -
942                         (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
943                         capacity;
944     int slotsAdded = __kmp_expand_threads(slotsRequired);
945     if (slotsAdded < slotsRequired) {
946       // The threads array was not expanded enough.
947       new_nthreads -= (slotsRequired - slotsAdded);
948       KMP_ASSERT(new_nthreads >= 1);
949 
950       // If dyn-var is false, emit a 1-time warning.
951       if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
952         __kmp_reserve_warn = 1;
953         if (__kmp_tp_cached) {
954           __kmp_msg(kmp_ms_warning,
955                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
956                     KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
957                     KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
958         } else {
959           __kmp_msg(kmp_ms_warning,
960                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
961                     KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
962         }
963       }
964     }
965   }
966 
967 #ifdef KMP_DEBUG
968   if (new_nthreads == 1) {
969     KC_TRACE(10,
970              ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
971               "dead roots and rechecking; requested %d threads\n",
972               __kmp_get_gtid(), set_nthreads));
973   } else {
974     KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
975                   " %d threads\n",
976                   __kmp_get_gtid(), new_nthreads, set_nthreads));
977   }
978 #endif // KMP_DEBUG
979   return new_nthreads;
980 }
981 
982 /* Allocate threads from the thread pool and assign them to the new team. We are
983    assured that there are enough threads available, because we checked on that
984    earlier within critical section forkjoin */
985 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
986                                     kmp_info_t *master_th, int master_gtid) {
987   int i;
988   int use_hot_team;
989 
990   KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
991   KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
992   KMP_MB();
993 
994   /* first, let's setup the master thread */
995   master_th->th.th_info.ds.ds_tid = 0;
996   master_th->th.th_team = team;
997   master_th->th.th_team_nproc = team->t.t_nproc;
998   master_th->th.th_team_master = master_th;
999   master_th->th.th_team_serialized = FALSE;
1000   master_th->th.th_dispatch = &team->t.t_dispatch[0];
1001 
1002 /* make sure we are not the optimized hot team */
1003 #if KMP_NESTED_HOT_TEAMS
1004   use_hot_team = 0;
1005   kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
1006   if (hot_teams) { // hot teams array is not allocated if
1007     // KMP_HOT_TEAMS_MAX_LEVEL=0
1008     int level = team->t.t_active_level - 1; // index in array of hot teams
1009     if (master_th->th.th_teams_microtask) { // are we inside the teams?
1010       if (master_th->th.th_teams_size.nteams > 1) {
1011         ++level; // level was not increased in teams construct for
1012         // team_of_masters
1013       }
1014       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1015           master_th->th.th_teams_level == team->t.t_level) {
1016         ++level; // level was not increased in teams construct for
1017         // team_of_workers before the parallel
1018       } // team->t.t_level will be increased inside parallel
1019     }
1020     if (level < __kmp_hot_teams_max_level) {
1021       if (hot_teams[level].hot_team) {
1022         // hot team has already been allocated for given level
1023         KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
1024         use_hot_team = 1; // the team is ready to use
1025       } else {
1026         use_hot_team = 0; // AC: threads are not allocated yet
1027         hot_teams[level].hot_team = team; // remember new hot team
1028         hot_teams[level].hot_team_nth = team->t.t_nproc;
1029       }
1030     } else {
1031       use_hot_team = 0;
1032     }
1033   }
1034 #else
1035   use_hot_team = team == root->r.r_hot_team;
1036 #endif
1037   if (!use_hot_team) {
1038 
1039     /* install the master thread */
1040     team->t.t_threads[0] = master_th;
1041     __kmp_initialize_info(master_th, team, 0, master_gtid);
1042 
1043     /* now, install the worker threads */
1044     for (i = 1; i < team->t.t_nproc; i++) {
1045 
1046       /* fork or reallocate a new thread and install it in team */
1047       kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1048       team->t.t_threads[i] = thr;
1049       KMP_DEBUG_ASSERT(thr);
1050       KMP_DEBUG_ASSERT(thr->th.th_team == team);
1051       /* align team and thread arrived states */
1052       KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1053                     "T#%d(%d:%d) join =%llu, plain=%llu\n",
1054                     __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1055                     __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1056                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1057                     team->t.t_bar[bs_plain_barrier].b_arrived));
1058 #if OMP_40_ENABLED
1059       thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1060       thr->th.th_teams_level = master_th->th.th_teams_level;
1061       thr->th.th_teams_size = master_th->th.th_teams_size;
1062 #endif
1063       { // Initialize threads' barrier data.
1064         int b;
1065         kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1066         for (b = 0; b < bs_last_barrier; ++b) {
1067           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1068           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1069 #if USE_DEBUGGER
1070           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1071 #endif
1072         }
1073       }
1074     }
1075 
1076 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1077     __kmp_partition_places(team);
1078 #endif
1079   }
1080 
1081   KMP_MB();
1082 }
1083 
1084 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1085 // Propagate any changes to the floating point control registers out to the team
1086 // We try to avoid unnecessary writes to the relevant cache line in the team
1087 // structure, so we don't make changes unless they are needed.
1088 inline static void propagateFPControl(kmp_team_t *team) {
1089   if (__kmp_inherit_fp_control) {
1090     kmp_int16 x87_fpu_control_word;
1091     kmp_uint32 mxcsr;
1092 
1093     // Get master values of FPU control flags (both X87 and vector)
1094     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1095     __kmp_store_mxcsr(&mxcsr);
1096     mxcsr &= KMP_X86_MXCSR_MASK;
1097 
1098     // There is no point looking at t_fp_control_saved here.
1099     // If it is TRUE, we still have to update the values if they are different
1100     // from those we now have. If it is FALSE we didn't save anything yet, but
1101     // our objective is the same. We have to ensure that the values in the team
1102     // are the same as those we have.
1103     // So, this code achieves what we need whether or not t_fp_control_saved is
1104     // true. By checking whether the value needs updating we avoid unnecessary
1105     // writes that would put the cache-line into a written state, causing all
1106     // threads in the team to have to read it again.
1107     KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1108     KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1109     // Although we don't use this value, other code in the runtime wants to know
1110     // whether it should restore them. So we must ensure it is correct.
1111     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1112   } else {
1113     // Similarly here. Don't write to this cache-line in the team structure
1114     // unless we have to.
1115     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1116   }
1117 }
1118 
1119 // Do the opposite, setting the hardware registers to the updated values from
1120 // the team.
1121 inline static void updateHWFPControl(kmp_team_t *team) {
1122   if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1123     // Only reset the fp control regs if they have been changed in the team.
1124     // the parallel region that we are exiting.
1125     kmp_int16 x87_fpu_control_word;
1126     kmp_uint32 mxcsr;
1127     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1128     __kmp_store_mxcsr(&mxcsr);
1129     mxcsr &= KMP_X86_MXCSR_MASK;
1130 
1131     if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1132       __kmp_clear_x87_fpu_status_word();
1133       __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1134     }
1135 
1136     if (team->t.t_mxcsr != mxcsr) {
1137       __kmp_load_mxcsr(&team->t.t_mxcsr);
1138     }
1139   }
1140 }
1141 #else
1142 #define propagateFPControl(x) ((void)0)
1143 #define updateHWFPControl(x) ((void)0)
1144 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1145 
1146 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1147                                      int realloc); // forward declaration
1148 
1149 /* Run a parallel region that has been serialized, so runs only in a team of the
1150    single master thread. */
1151 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1152   kmp_info_t *this_thr;
1153   kmp_team_t *serial_team;
1154 
1155   KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1156 
1157   /* Skip all this code for autopar serialized loops since it results in
1158      unacceptable overhead */
1159   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1160     return;
1161 
1162   if (!TCR_4(__kmp_init_parallel))
1163     __kmp_parallel_initialize();
1164 
1165   this_thr = __kmp_threads[global_tid];
1166   serial_team = this_thr->th.th_serial_team;
1167 
1168   /* utilize the serialized team held by this thread */
1169   KMP_DEBUG_ASSERT(serial_team);
1170   KMP_MB();
1171 
1172   if (__kmp_tasking_mode != tskm_immediate_exec) {
1173     KMP_DEBUG_ASSERT(
1174         this_thr->th.th_task_team ==
1175         this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1176     KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1177                      NULL);
1178     KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1179                   "team %p, new task_team = NULL\n",
1180                   global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1181     this_thr->th.th_task_team = NULL;
1182   }
1183 
1184 #if OMP_40_ENABLED
1185   kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1186   if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1187     proc_bind = proc_bind_false;
1188   } else if (proc_bind == proc_bind_default) {
1189     // No proc_bind clause was specified, so use the current value
1190     // of proc-bind-var for this parallel region.
1191     proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1192   }
1193   // Reset for next parallel region
1194   this_thr->th.th_set_proc_bind = proc_bind_default;
1195 #endif /* OMP_40_ENABLED */
1196 
1197 #if OMPT_SUPPORT
1198   ompt_data_t ompt_parallel_data;
1199   ompt_parallel_data.ptr = NULL;
1200   ompt_data_t *implicit_task_data;
1201   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1202   if (ompt_enabled.enabled &&
1203       this_thr->th.ompt_thread_info.state != omp_state_overhead) {
1204 
1205     ompt_task_info_t *parent_task_info;
1206     parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1207 
1208     parent_task_info->frame.enter_frame = OMPT_GET_FRAME_ADDRESS(1);
1209     if (ompt_enabled.ompt_callback_parallel_begin) {
1210       int team_size = 1;
1211 
1212       ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1213           &(parent_task_info->task_data), &(parent_task_info->frame),
1214           &ompt_parallel_data, team_size, ompt_invoker_program, codeptr);
1215     }
1216   }
1217 #endif // OMPT_SUPPORT
1218 
1219   if (this_thr->th.th_team != serial_team) {
1220     // Nested level will be an index in the nested nthreads array
1221     int level = this_thr->th.th_team->t.t_level;
1222 
1223     if (serial_team->t.t_serialized) {
1224       /* this serial team was already used
1225          TODO increase performance by making this locks more specific */
1226       kmp_team_t *new_team;
1227 
1228       __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1229 
1230       new_team = __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1231 #if OMPT_SUPPORT
1232                                      ompt_parallel_data,
1233 #endif
1234 #if OMP_40_ENABLED
1235                                      proc_bind,
1236 #endif
1237                                      &this_thr->th.th_current_task->td_icvs,
1238                                      0 USE_NESTED_HOT_ARG(NULL));
1239       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1240       KMP_ASSERT(new_team);
1241 
1242       /* setup new serialized team and install it */
1243       new_team->t.t_threads[0] = this_thr;
1244       new_team->t.t_parent = this_thr->th.th_team;
1245       serial_team = new_team;
1246       this_thr->th.th_serial_team = serial_team;
1247 
1248       KF_TRACE(
1249           10,
1250           ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1251            global_tid, serial_team));
1252 
1253       /* TODO the above breaks the requirement that if we run out of resources,
1254          then we can still guarantee that serialized teams are ok, since we may
1255          need to allocate a new one */
1256     } else {
1257       KF_TRACE(
1258           10,
1259           ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1260            global_tid, serial_team));
1261     }
1262 
1263     /* we have to initialize this serial team */
1264     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1265     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1266     KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1267     serial_team->t.t_ident = loc;
1268     serial_team->t.t_serialized = 1;
1269     serial_team->t.t_nproc = 1;
1270     serial_team->t.t_parent = this_thr->th.th_team;
1271     serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1272     this_thr->th.th_team = serial_team;
1273     serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1274 
1275     KF_TRACE(10, ("__kmpc_serialized_parallel: T#d curtask=%p\n", global_tid,
1276                   this_thr->th.th_current_task));
1277     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1278     this_thr->th.th_current_task->td_flags.executing = 0;
1279 
1280     __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1281 
1282     /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1283        implicit task for each serialized task represented by
1284        team->t.t_serialized? */
1285     copy_icvs(&this_thr->th.th_current_task->td_icvs,
1286               &this_thr->th.th_current_task->td_parent->td_icvs);
1287 
1288     // Thread value exists in the nested nthreads array for the next nested
1289     // level
1290     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1291       this_thr->th.th_current_task->td_icvs.nproc =
1292           __kmp_nested_nth.nth[level + 1];
1293     }
1294 
1295 #if OMP_40_ENABLED
1296     if (__kmp_nested_proc_bind.used &&
1297         (level + 1 < __kmp_nested_proc_bind.used)) {
1298       this_thr->th.th_current_task->td_icvs.proc_bind =
1299           __kmp_nested_proc_bind.bind_types[level + 1];
1300     }
1301 #endif /* OMP_40_ENABLED */
1302 
1303 #if USE_DEBUGGER
1304     serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1305 #endif
1306     this_thr->th.th_info.ds.ds_tid = 0;
1307 
1308     /* set thread cache values */
1309     this_thr->th.th_team_nproc = 1;
1310     this_thr->th.th_team_master = this_thr;
1311     this_thr->th.th_team_serialized = 1;
1312 
1313     serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1314     serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1315 
1316     propagateFPControl(serial_team);
1317 
1318     /* check if we need to allocate dispatch buffers stack */
1319     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1320     if (!serial_team->t.t_dispatch->th_disp_buffer) {
1321       serial_team->t.t_dispatch->th_disp_buffer =
1322           (dispatch_private_info_t *)__kmp_allocate(
1323               sizeof(dispatch_private_info_t));
1324     }
1325     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1326 
1327     KMP_MB();
1328 
1329   } else {
1330     /* this serialized team is already being used,
1331      * that's fine, just add another nested level */
1332     KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1333     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1334     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1335     ++serial_team->t.t_serialized;
1336     this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1337 
1338     // Nested level will be an index in the nested nthreads array
1339     int level = this_thr->th.th_team->t.t_level;
1340     // Thread value exists in the nested nthreads array for the next nested
1341     // level
1342     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1343       this_thr->th.th_current_task->td_icvs.nproc =
1344           __kmp_nested_nth.nth[level + 1];
1345     }
1346     serial_team->t.t_level++;
1347     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1348                   "of serial team %p to %d\n",
1349                   global_tid, serial_team, serial_team->t.t_level));
1350 
1351     /* allocate/push dispatch buffers stack */
1352     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1353     {
1354       dispatch_private_info_t *disp_buffer =
1355           (dispatch_private_info_t *)__kmp_allocate(
1356               sizeof(dispatch_private_info_t));
1357       disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1358       serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1359     }
1360     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1361 
1362     KMP_MB();
1363   }
1364 #if OMP_40_ENABLED
1365   KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1366 #endif
1367 
1368   if (__kmp_env_consistency_check)
1369     __kmp_push_parallel(global_tid, NULL);
1370 #if OMPT_SUPPORT
1371   serial_team->t.ompt_team_info.master_return_address = codeptr;
1372   if (ompt_enabled.enabled &&
1373       this_thr->th.ompt_thread_info.state != omp_state_overhead) {
1374     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = OMPT_GET_FRAME_ADDRESS(1);
1375 
1376     ompt_lw_taskteam_t lw_taskteam;
1377     __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1378                             &ompt_parallel_data, codeptr);
1379 
1380     __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1381     // don't use lw_taskteam after linking. content was swaped
1382 
1383     /* OMPT implicit task begin */
1384     implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
1385     if (ompt_enabled.ompt_callback_implicit_task) {
1386       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1387           ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1388           OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid));
1389     }
1390 
1391     /* OMPT state */
1392     this_thr->th.ompt_thread_info.state = omp_state_work_parallel;
1393     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = OMPT_GET_FRAME_ADDRESS(1);
1394   }
1395 #endif
1396 }
1397 
1398 /* most of the work for a fork */
1399 /* return true if we really went parallel, false if serialized */
1400 int __kmp_fork_call(ident_t *loc, int gtid,
1401                     enum fork_context_e call_context, // Intel, GNU, ...
1402                     kmp_int32 argc, microtask_t microtask, launch_t invoker,
1403 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1404 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1405                     va_list *ap
1406 #else
1407                     va_list ap
1408 #endif
1409                     ) {
1410   void **argv;
1411   int i;
1412   int master_tid;
1413   int master_this_cons;
1414   kmp_team_t *team;
1415   kmp_team_t *parent_team;
1416   kmp_info_t *master_th;
1417   kmp_root_t *root;
1418   int nthreads;
1419   int master_active;
1420   int master_set_numthreads;
1421   int level;
1422 #if OMP_40_ENABLED
1423   int active_level;
1424   int teams_level;
1425 #endif
1426 #if KMP_NESTED_HOT_TEAMS
1427   kmp_hot_team_ptr_t **p_hot_teams;
1428 #endif
1429   { // KMP_TIME_BLOCK
1430     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1431     KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1432 
1433     KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1434     if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1435       /* Some systems prefer the stack for the root thread(s) to start with */
1436       /* some gap from the parent stack to prevent false sharing. */
1437       void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1438       /* These 2 lines below are so this does not get optimized out */
1439       if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1440         __kmp_stkpadding += (short)((kmp_int64)dummy);
1441     }
1442 
1443     /* initialize if needed */
1444     KMP_DEBUG_ASSERT(
1445         __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1446     if (!TCR_4(__kmp_init_parallel))
1447       __kmp_parallel_initialize();
1448 
1449     /* setup current data */
1450     master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1451     // shutdown
1452     parent_team = master_th->th.th_team;
1453     master_tid = master_th->th.th_info.ds.ds_tid;
1454     master_this_cons = master_th->th.th_local.this_construct;
1455     root = master_th->th.th_root;
1456     master_active = root->r.r_active;
1457     master_set_numthreads = master_th->th.th_set_nproc;
1458 
1459 #if OMPT_SUPPORT
1460     ompt_data_t ompt_parallel_data;
1461     ompt_parallel_data.ptr = NULL;
1462     ompt_data_t *parent_task_data;
1463     ompt_frame_t *ompt_frame;
1464     ompt_data_t *implicit_task_data;
1465     void *return_address = NULL;
1466 
1467     if (ompt_enabled.enabled) {
1468       __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1469                                     NULL, NULL);
1470       return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1471     }
1472 #endif
1473 
1474     // Nested level will be an index in the nested nthreads array
1475     level = parent_team->t.t_level;
    // used to launch non-serial teams even if nesting is not allowed
1477     active_level = parent_team->t.t_active_level;
1478 #if OMP_40_ENABLED
1479     // needed to check nesting inside the teams
1480     teams_level = master_th->th.th_teams_level;
1481 #endif
1482 #if KMP_NESTED_HOT_TEAMS
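    // Nested hot teams keep the worker threads of inner (nested) teams alive
    // between parallel regions so that nested forks can reuse them instead of
    // re-creating threads. th_hot_teams is an array indexed by nesting level,
    // allocated lazily with up to __kmp_hot_teams_max_level entries.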
1483     p_hot_teams = &master_th->th.th_hot_teams;
1484     if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1485       *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1486           sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1487       (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1488       // it is either actual or not needed (when active_level > 0)
1489       (*p_hot_teams)[0].hot_team_nth = 1;
1490     }
1491 #endif
1492 
1493 #if OMPT_SUPPORT
1494     if (ompt_enabled.enabled) {
1495       if (ompt_enabled.ompt_callback_parallel_begin) {
1496         int team_size = master_set_numthreads
1497                             ? master_set_numthreads
1498                             : get__nproc_2(parent_team, master_tid);
1499         ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1500             parent_task_data, ompt_frame, &ompt_parallel_data, team_size,
1501             OMPT_INVOKER(call_context), return_address);
1502       }
1503       master_th->th.ompt_thread_info.state = omp_state_overhead;
1504     }
1505 #endif
1506 
1507     master_th->th.th_ident = loc;
1508 
1509 #if OMP_40_ENABLED
1510     if (master_th->th.th_teams_microtask && ap &&
1511         microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1512       // AC: This is start of parallel that is nested inside teams construct.
1513       // The team is actual (hot), all workers are ready at the fork barrier.
1514       // No lock needed to initialize the team a bit, then free workers.
1515       parent_team->t.t_ident = loc;
1516       __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1517       parent_team->t.t_argc = argc;
1518       argv = (void **)parent_team->t.t_argv;
1519       for (i = argc - 1; i >= 0; --i)
1520 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1521 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1522         *argv++ = va_arg(*ap, void *);
1523 #else
1524         *argv++ = va_arg(ap, void *);
1525 #endif
      // Increment our nested depth level, but do not increase the
      // serialization count
1527       if (parent_team == master_th->th.th_serial_team) {
1528         // AC: we are in serialized parallel
1529         __kmpc_serialized_parallel(loc, gtid);
1530         KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
        // AC: need this so that enquiry functions work correctly;
        // will restore at join time
1533         parent_team->t.t_serialized--;
1534 #if OMPT_SUPPORT
1535         void *dummy;
1536         void **exit_runtime_p;
1537 
1538         ompt_lw_taskteam_t lw_taskteam;
1539 
1540         if (ompt_enabled.enabled) {
1541           __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1542                                   &ompt_parallel_data, return_address);
1543           exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_frame);
1544 
1545           __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
          // don't use lw_taskteam after linking; its content was swapped
1547 
1548           /* OMPT implicit task begin */
1549           implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1550           if (ompt_enabled.ompt_callback_implicit_task) {
1551             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1552                 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1553                 implicit_task_data, 1, __kmp_tid_from_gtid(gtid));
1554           }
1555 
1556           /* OMPT state */
1557           master_th->th.ompt_thread_info.state = omp_state_work_parallel;
1558         } else {
1559           exit_runtime_p = &dummy;
1560         }
1561 #endif
1562 
1563         {
1564           KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1565           KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1566           __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1567 #if OMPT_SUPPORT
1568                                  ,
1569                                  exit_runtime_p
1570 #endif
1571                                  );
1572         }
1573 
1574 #if OMPT_SUPPORT
1575         *exit_runtime_p = NULL;
1576         if (ompt_enabled.enabled) {
1577           OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = NULL;
1578           if (ompt_enabled.ompt_callback_implicit_task) {
1579             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1580                 ompt_scope_end, NULL, implicit_task_data, 1,
1581                 __kmp_tid_from_gtid(gtid));
1582           }
1583           __ompt_lw_taskteam_unlink(master_th);
1584 
1585           if (ompt_enabled.ompt_callback_parallel_end) {
1586             ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1587                 OMPT_CUR_TEAM_DATA(master_th), OMPT_CUR_TASK_DATA(master_th),
1588                 OMPT_INVOKER(call_context), return_address);
1589           }
1590           master_th->th.ompt_thread_info.state = omp_state_overhead;
1591         }
1592 #endif
1593         return TRUE;
1594       }
1595 
1596       parent_team->t.t_pkfn = microtask;
1597       parent_team->t.t_invoke = invoker;
1598       KMP_TEST_THEN_INC32((kmp_int32 *)&root->r.r_in_parallel);
1599       parent_team->t.t_active_level++;
1600       parent_team->t.t_level++;
1601 
1602       /* Change number of threads in the team if requested */
      // The parallel region has a num_threads clause
      if (master_set_numthreads) {
1604         if (master_set_numthreads < master_th->th.th_teams_size.nth) {
          // AC: we can only reduce the number of threads dynamically;
          // we cannot increase it
1606           kmp_info_t **other_threads = parent_team->t.t_threads;
1607           parent_team->t.t_nproc = master_set_numthreads;
1608           for (i = 0; i < master_set_numthreads; ++i) {
1609             other_threads[i]->th.th_team_nproc = master_set_numthreads;
1610           }
1611           // Keep extra threads hot in the team for possible next parallels
1612         }
1613         master_th->th.th_set_nproc = 0;
1614       }
1615 
1616 #if USE_DEBUGGER
1617       if (__kmp_debugging) { // Let debugger override number of threads.
1618         int nth = __kmp_omp_num_threads(loc);
1619         if (nth > 0) { // 0 means debugger doesn't want to change num threads
1620           master_set_numthreads = nth;
1621         }
1622       }
1623 #endif
1624 
1625       KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1626                     "master_th=%p, gtid=%d\n",
1627                     root, parent_team, master_th, gtid));
1628       __kmp_internal_fork(loc, gtid, parent_team);
1629       KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1630                     "master_th=%p, gtid=%d\n",
1631                     root, parent_team, master_th, gtid));
1632 
1633       /* Invoke microtask for MASTER thread */
1634       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1635                     parent_team->t.t_id, parent_team->t.t_pkfn));
1636 
1637       {
1638         KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1639         KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1640         if (!parent_team->t.t_invoke(gtid)) {
1641           KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
1642         }
1643       }
1644       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1645                     parent_team->t.t_id, parent_team->t.t_pkfn));
1646       KMP_MB(); /* Flush all pending memory write invalidates.  */
1647 
1648       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1649 
1650       return TRUE;
1651     } // Parallel closely nested in teams construct
1652 #endif /* OMP_40_ENABLED */
1653 
1654 #if KMP_DEBUG
1655     if (__kmp_tasking_mode != tskm_immediate_exec) {
1656       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1657                        parent_team->t.t_task_team[master_th->th.th_task_state]);
1658     }
1659 #endif
1660 
1661     if (parent_team->t.t_active_level >=
1662         master_th->th.th_current_task->td_icvs.max_active_levels) {
1663       nthreads = 1;
1664     } else {
1665 #if OMP_40_ENABLED
1666       int enter_teams = ((ap == NULL && active_level == 0) ||
1667                          (ap && teams_level > 0 && teams_level == level));
1668 #endif
1669       nthreads =
1670           master_set_numthreads
1671               ? master_set_numthreads
1672               : get__nproc_2(
1673                     parent_team,
1674                     master_tid); // TODO: get nproc directly from current task
1675 
      // Check whether we need to take the forkjoin lock (no need for a
      // serialized parallel outside of a teams construct). This code was moved
      // here from __kmp_reserve_threads() to speed up nested serialized
      // parallels.
1679       if (nthreads > 1) {
1680         if ((!get__nested(master_th) && (root->r.r_in_parallel
1681 #if OMP_40_ENABLED
1682                                          && !enter_teams
1683 #endif /* OMP_40_ENABLED */
1684                                          )) ||
1685             (__kmp_library == library_serial)) {
1686           KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1687                         " threads\n",
1688                         gtid, nthreads));
1689           nthreads = 1;
1690         }
1691       }
1692       if (nthreads > 1) {
1693         /* determine how many new threads we can use */
1694         __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1695         nthreads = __kmp_reserve_threads(
1696             root, parent_team, master_tid, nthreads
1697 #if OMP_40_ENABLED
            /* AC: If we execute teams from a parallel region (on the host),
               then the teams should be created, but each can have only one
               thread if nesting is disabled. If teams is called from a serial
               region, then the teams and their threads should be created
               regardless of the nesting setting. */
1703             ,
1704             enter_teams
1705 #endif /* OMP_40_ENABLED */
1706             );
1707         if (nthreads == 1) {
          // Release the lock for single-threaded execution here; for
          // multi-threaded execution it will be released later, after the team
          // of threads has been created and initialized.
1711           __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1712         }
1713       }
1714     }
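    // Summary of the decision above: nthreads collapses to 1 when the active
    // nesting level has reached max-active-levels, when nesting is disabled
    // and we are already inside an active parallel region (unless entering a
    // teams construct), when KMP_LIBRARY=serial, or when
    // __kmp_reserve_threads() could not reserve more than one thread.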
1715     KMP_DEBUG_ASSERT(nthreads > 0);
1716 
1717     // If we temporarily changed the set number of threads then restore it now
1718     master_th->th.th_set_nproc = 0;
1719 
1720     /* create a serialized parallel region? */
1721     if (nthreads == 1) {
1722 /* josh todo: hypothetical question: what do we do for OS X*? */
1723 #if KMP_OS_LINUX &&                                                            \
1724     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1725       void *args[argc];
1726 #else
1727       void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1728 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1729           KMP_ARCH_AARCH64) */
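      // On the Linux targets above, a variable-length array on the current
      // stack frame holds the microtask arguments; elsewhere the buffer is
      // obtained with KMP_ALLOCA. Either way the storage lives only for the
      // duration of this serialized invocation.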
1730 
1731       KA_TRACE(20,
1732                ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1733 
1734       __kmpc_serialized_parallel(loc, gtid);
1735 
1736       if (call_context == fork_context_intel) {
1737         /* TODO this sucks, use the compiler itself to pass args! :) */
1738         master_th->th.th_serial_team->t.t_ident = loc;
1739 #if OMP_40_ENABLED
1740         if (!ap) {
1741           // revert change made in __kmpc_serialized_parallel()
1742           master_th->th.th_serial_team->t.t_level--;
1743 // Get args from parent team for teams construct
1744 
1745 #if OMPT_SUPPORT
1746           void *dummy;
1747           void **exit_runtime_p;
1748           ompt_task_info_t *task_info;
1749 
1750           ompt_lw_taskteam_t lw_taskteam;
1751 
1752           if (ompt_enabled.enabled) {
1753             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1754                                     &ompt_parallel_data, return_address);
1755 
1756             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
            // don't use lw_taskteam after linking; its content was swapped
1758 
1759             task_info = OMPT_CUR_TASK_INFO(master_th);
1760             exit_runtime_p = &(task_info->frame.exit_frame);
1761             if (ompt_enabled.ompt_callback_implicit_task) {
1762               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1763                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1764                   &(task_info->task_data), 1, __kmp_tid_from_gtid(gtid));
1765             }
1766 
1767             /* OMPT state */
1768             master_th->th.ompt_thread_info.state = omp_state_work_parallel;
1769           } else {
1770             exit_runtime_p = &dummy;
1771           }
1772 #endif
1773 
1774           {
1775             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1776             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1777             __kmp_invoke_microtask(microtask, gtid, 0, argc,
1778                                    parent_team->t.t_argv
1779 #if OMPT_SUPPORT
1780                                    ,
1781                                    exit_runtime_p
1782 #endif
1783                                    );
1784           }
1785 
1786 #if OMPT_SUPPORT
1787           if (ompt_enabled.enabled) {
            *exit_runtime_p = NULL; // clear exit frame (match other paths)
1789             if (ompt_enabled.ompt_callback_implicit_task) {
1790               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1791                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1792                   __kmp_tid_from_gtid(gtid));
1793             }
1794 
1795             __ompt_lw_taskteam_unlink(master_th);
1796             if (ompt_enabled.ompt_callback_parallel_end) {
1797               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1798                   OMPT_CUR_TEAM_DATA(master_th), parent_task_data,
1799                   OMPT_INVOKER(call_context), return_address);
1800             }
1801             master_th->th.ompt_thread_info.state = omp_state_overhead;
1802           }
1803 #endif
1804         } else if (microtask == (microtask_t)__kmp_teams_master) {
1805           KMP_DEBUG_ASSERT(master_th->th.th_team ==
1806                            master_th->th.th_serial_team);
1807           team = master_th->th.th_team;
1808           // team->t.t_pkfn = microtask;
1809           team->t.t_invoke = invoker;
1810           __kmp_alloc_argv_entries(argc, team, TRUE);
1811           team->t.t_argc = argc;
1812           argv = (void **)team->t.t_argv;
1813           if (ap) {
1814             for (i = argc - 1; i >= 0; --i)
1815 // TODO: revert workaround for Intel(R) 64 tracker #96
1816 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1817               *argv++ = va_arg(*ap, void *);
1818 #else
1819               *argv++ = va_arg(ap, void *);
1820 #endif
1821           } else {
1822             for (i = 0; i < argc; ++i)
1823               // Get args from parent team for teams construct
1824               argv[i] = parent_team->t.t_argv[i];
1825           }
1826           // AC: revert change made in __kmpc_serialized_parallel()
1827           //     because initial code in teams should have level=0
1828           team->t.t_level--;
1829           // AC: call special invoker for outer "parallel" of teams construct
1830           {
1831             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1832             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1833             invoker(gtid);
1834           }
1835         } else {
1836 #endif /* OMP_40_ENABLED */
1837           argv = args;
1838           for (i = argc - 1; i >= 0; --i)
1839 // TODO: revert workaround for Intel(R) 64 tracker #96
1840 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1841             *argv++ = va_arg(*ap, void *);
1842 #else
1843           *argv++ = va_arg(ap, void *);
1844 #endif
1845           KMP_MB();
1846 
1847 #if OMPT_SUPPORT
1848           void *dummy;
1849           void **exit_runtime_p;
1850           ompt_task_info_t *task_info;
1851 
1852           ompt_lw_taskteam_t lw_taskteam;
1853 
1854           if (ompt_enabled.enabled) {
1855             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1856                                     &ompt_parallel_data, return_address);
1857             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
            // don't use lw_taskteam after linking; its content was swapped
1859             task_info = OMPT_CUR_TASK_INFO(master_th);
1860             exit_runtime_p = &(task_info->frame.exit_frame);
1861 
1862             /* OMPT implicit task begin */
1863             implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1864             if (ompt_enabled.ompt_callback_implicit_task) {
1865               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1866                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1867                   implicit_task_data, 1, __kmp_tid_from_gtid(gtid));
1868             }
1869 
1870             /* OMPT state */
1871             master_th->th.ompt_thread_info.state = omp_state_work_parallel;
1872           } else {
1873             exit_runtime_p = &dummy;
1874           }
1875 #endif
1876 
1877           {
1878             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1879             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1880             __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1881 #if OMPT_SUPPORT
1882                                    ,
1883                                    exit_runtime_p
1884 #endif
1885                                    );
1886           }
1887 
1888 #if OMPT_SUPPORT
1889           if (ompt_enabled.enabled) {
1890             *exit_runtime_p = NULL;
1891             if (ompt_enabled.ompt_callback_implicit_task) {
1892               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1893                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1894                   __kmp_tid_from_gtid(gtid));
1895             }
1896 
1897             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1898             __ompt_lw_taskteam_unlink(master_th);
1899             if (ompt_enabled.ompt_callback_parallel_end) {
1900               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1901                   &ompt_parallel_data, parent_task_data,
1902                   OMPT_INVOKER(call_context), return_address);
1903             }
1904             master_th->th.ompt_thread_info.state = omp_state_overhead;
1905           }
1906 #endif
1907 #if OMP_40_ENABLED
1908         }
1909 #endif /* OMP_40_ENABLED */
1910       } else if (call_context == fork_context_gnu) {
1911 #if OMPT_SUPPORT
1912         ompt_lw_taskteam_t lwt;
1913         __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1914                                 return_address);
1915 
1916         lwt.ompt_task_info.frame.exit_frame = NULL;
1917         __ompt_lw_taskteam_link(&lwt, master_th, 1);
// don't use lw_taskteam after linking; its content was swapped
1919 #endif
1920 
1921         // we were called from GNU native code
1922         KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1923         return FALSE;
1924       } else {
1925         KMP_ASSERT2(call_context < fork_context_last,
1926                     "__kmp_fork_call: unknown fork_context parameter");
1927       }
1928 
1929       KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1930       KMP_MB();
1931       return FALSE;
1932     }
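    // Past this point the region really goes parallel: pick or allocate a
    // team, propagate ICVs and arguments to it, fork the worker threads, and
    // have the master invoke the microtask (unless we were called from GNU
    // code, which performs the invocation itself).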
1933 
    // GEH: only modify the executing flag in the non-serialized case;
    //      the serialized case is handled in __kmpc_serialized_parallel
1936     KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1937                   "curtask=%p, curtask_max_aclevel=%d\n",
1938                   parent_team->t.t_active_level, master_th,
1939                   master_th->th.th_current_task,
1940                   master_th->th.th_current_task->td_icvs.max_active_levels));
1941     // TODO: GEH - cannot do this assertion because root thread not set up as
1942     // executing
1943     // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1944     master_th->th.th_current_task->td_flags.executing = 0;
1945 
1946 #if OMP_40_ENABLED
1947     if (!master_th->th.th_teams_microtask || level > teams_level)
1948 #endif /* OMP_40_ENABLED */
1949     {
1950       /* Increment our nested depth level */
1951       KMP_TEST_THEN_INC32((kmp_int32 *)&root->r.r_in_parallel);
1952     }
1953 
1954     // See if we need to make a copy of the ICVs.
1955     int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1956     if ((level + 1 < __kmp_nested_nth.used) &&
1957         (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1958       nthreads_icv = __kmp_nested_nth.nth[level + 1];
1959     } else {
1960       nthreads_icv = 0; // don't update
1961     }
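    // __kmp_nested_nth holds the list form of OMP_NUM_THREADS (e.g.
    // "OMP_NUM_THREADS=4,2"), one value per nesting level. A nonzero
    // nthreads_icv here means the child team's nproc ICV must be overridden
    // with the value for the next nesting level.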
1962 
1963 #if OMP_40_ENABLED
1964     // Figure out the proc_bind_policy for the new team.
1965     kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1966     kmp_proc_bind_t proc_bind_icv =
1967         proc_bind_default; // proc_bind_default means don't update
1968     if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1969       proc_bind = proc_bind_false;
1970     } else {
1971       if (proc_bind == proc_bind_default) {
1972         // No proc_bind clause specified; use current proc-bind-var for this
1973         // parallel region
1974         proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1975       }
1976       /* else: The proc_bind policy was specified explicitly on parallel clause.
1977          This overrides proc-bind-var for this parallel region, but does not
1978          change proc-bind-var. */
1979       // Figure the value of proc-bind-var for the child threads.
1980       if ((level + 1 < __kmp_nested_proc_bind.used) &&
1981           (__kmp_nested_proc_bind.bind_types[level + 1] !=
1982            master_th->th.th_current_task->td_icvs.proc_bind)) {
1983         proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1984       }
1985     }
1986 
1987     // Reset for next parallel region
1988     master_th->th.th_set_proc_bind = proc_bind_default;
1989 #endif /* OMP_40_ENABLED */
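    // To summarize the proc_bind resolution above: a proc_bind clause on the
    // parallel directive overrides proc-bind-var for this region only, while
    // the nested OMP_PROC_BIND list (if any) supplies the proc-bind-var value
    // that the child threads will inherit.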
1990 
1991     if ((nthreads_icv > 0)
1992 #if OMP_40_ENABLED
1993         || (proc_bind_icv != proc_bind_default)
1994 #endif /* OMP_40_ENABLED */
1995             ) {
1996       kmp_internal_control_t new_icvs;
1997       copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
1998       new_icvs.next = NULL;
1999       if (nthreads_icv > 0) {
2000         new_icvs.nproc = nthreads_icv;
2001       }
2002 
2003 #if OMP_40_ENABLED
2004       if (proc_bind_icv != proc_bind_default) {
2005         new_icvs.proc_bind = proc_bind_icv;
2006       }
2007 #endif /* OMP_40_ENABLED */
2008 
2009       /* allocate a new parallel team */
2010       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2011       team = __kmp_allocate_team(root, nthreads, nthreads,
2012 #if OMPT_SUPPORT
2013                                  ompt_parallel_data,
2014 #endif
2015 #if OMP_40_ENABLED
2016                                  proc_bind,
2017 #endif
2018                                  &new_icvs, argc USE_NESTED_HOT_ARG(master_th));
2019     } else {
2020       /* allocate a new parallel team */
2021       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2022       team = __kmp_allocate_team(root, nthreads, nthreads,
2023 #if OMPT_SUPPORT
2024                                  ompt_parallel_data,
2025 #endif
2026 #if OMP_40_ENABLED
2027                                  proc_bind,
2028 #endif
2029                                  &master_th->th.th_current_task->td_icvs,
2030                                  argc USE_NESTED_HOT_ARG(master_th));
2031     }
2032     KF_TRACE(
2033         10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2034 
2035     /* setup the new team */
2036     KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2037     KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2038     KMP_CHECK_UPDATE(team->t.t_ident, loc);
2039     KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2040     KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2041 #if OMPT_SUPPORT
2042     KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2043                           return_address);
2044 #endif
2045     KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2046 // TODO: parent_team->t.t_level == INT_MAX ???
2047 #if OMP_40_ENABLED
2048     if (!master_th->th.th_teams_microtask || level > teams_level) {
2049 #endif /* OMP_40_ENABLED */
2050       int new_level = parent_team->t.t_level + 1;
2051       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2052       new_level = parent_team->t.t_active_level + 1;
2053       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2054 #if OMP_40_ENABLED
2055     } else {
2056       // AC: Do not increase parallel level at start of the teams construct
2057       int new_level = parent_team->t.t_level;
2058       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2059       new_level = parent_team->t.t_active_level;
2060       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2061     }
2062 #endif /* OMP_40_ENABLED */
2063     kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2064     // set master's schedule as new run-time schedule
2065     KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2066 
2067 #if OMP_40_ENABLED
2068     KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2069 #endif
2070 
2071     // Update the floating point rounding in the team if required.
2072     propagateFPControl(team);
2073 
2074     if (__kmp_tasking_mode != tskm_immediate_exec) {
2075       // Set master's task team to team's task team. Unless this is hot team, it
2076       // should be NULL.
2077       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2078                        parent_team->t.t_task_team[master_th->th.th_task_state]);
2079       KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
2080                     "%p, new task_team %p / team %p\n",
2081                     __kmp_gtid_from_thread(master_th),
2082                     master_th->th.th_task_team, parent_team,
2083                     team->t.t_task_team[master_th->th.th_task_state], team));
2084 
2085       if (active_level || master_th->th.th_task_team) {
2086         // Take a memo of master's task_state
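        // (The memo stack lets the nested region start with a clean
        // task_state; the saved value is popped and restored at join time in
        // __kmp_join_call().)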
2087         KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2088         if (master_th->th.th_task_state_top >=
2089             master_th->th.th_task_state_stack_sz) { // increase size
2090           kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2091           kmp_uint8 *old_stack, *new_stack;
2092           kmp_uint32 i;
2093           new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2094           for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2095             new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2096           }
2097           for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2098                ++i) { // zero-init rest of stack
2099             new_stack[i] = 0;
2100           }
2101           old_stack = master_th->th.th_task_state_memo_stack;
2102           master_th->th.th_task_state_memo_stack = new_stack;
2103           master_th->th.th_task_state_stack_sz = new_size;
2104           __kmp_free(old_stack);
2105         }
2106         // Store master's task_state on stack
2107         master_th->th
2108             .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2109             master_th->th.th_task_state;
2110         master_th->th.th_task_state_top++;
2111 #if KMP_NESTED_HOT_TEAMS
2112         if (team == master_th->th.th_hot_teams[active_level].hot_team) {
2113           // Restore master's nested state if nested hot team
2114           master_th->th.th_task_state =
2115               master_th->th
2116                   .th_task_state_memo_stack[master_th->th.th_task_state_top];
2117         } else {
2118 #endif
2119           master_th->th.th_task_state = 0;
2120 #if KMP_NESTED_HOT_TEAMS
2121         }
2122 #endif
2123       }
2124 #if !KMP_NESTED_HOT_TEAMS
2125       KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2126                        (team == root->r.r_hot_team));
2127 #endif
2128     }
2129 
2130     KA_TRACE(
2131         20,
2132         ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2133          gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2134          team->t.t_nproc));
2135     KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2136                      (team->t.t_master_tid == 0 &&
2137                       (team->t.t_parent == root->r.r_root_team ||
2138                        team->t.t_parent->t.t_serialized)));
2139     KMP_MB();
2140 
2141     /* now, setup the arguments */
2142     argv = (void **)team->t.t_argv;
2143 #if OMP_40_ENABLED
2144     if (ap) {
2145 #endif /* OMP_40_ENABLED */
2146       for (i = argc - 1; i >= 0; --i) {
2147 // TODO: revert workaround for Intel(R) 64 tracker #96
2148 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
2149         void *new_argv = va_arg(*ap, void *);
2150 #else
2151       void *new_argv = va_arg(ap, void *);
2152 #endif
2153         KMP_CHECK_UPDATE(*argv, new_argv);
2154         argv++;
2155       }
2156 #if OMP_40_ENABLED
2157     } else {
2158       for (i = 0; i < argc; ++i) {
2159         // Get args from parent team for teams construct
2160         KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2161       }
2162     }
2163 #endif /* OMP_40_ENABLED */
2164 
2165     /* now actually fork the threads */
2166     KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2167     if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2168       root->r.r_active = TRUE;
2169 
2170     __kmp_fork_team_threads(root, team, master_th, gtid);
2171     __kmp_setup_icv_copy(team, nthreads,
2172                          &master_th->th.th_current_task->td_icvs, loc);
2173 
2174 #if OMPT_SUPPORT
2175     master_th->th.ompt_thread_info.state = omp_state_work_parallel;
2176 #endif
2177 
2178     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2179 
2180 #if USE_ITT_BUILD
2181     if (team->t.t_active_level == 1 // only report frames at level 1
2182 #if OMP_40_ENABLED
2183         && !master_th->th.th_teams_microtask // not in teams construct
2184 #endif /* OMP_40_ENABLED */
2185         ) {
2186 #if USE_ITT_NOTIFY
2187       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2188           (__kmp_forkjoin_frames_mode == 3 ||
2189            __kmp_forkjoin_frames_mode == 1)) {
2190         kmp_uint64 tmp_time = 0;
2191         if (__itt_get_timestamp_ptr)
2192           tmp_time = __itt_get_timestamp();
2193         // Internal fork - report frame begin
2194         master_th->th.th_frame_time = tmp_time;
2195         if (__kmp_forkjoin_frames_mode == 3)
2196           team->t.t_region_time = tmp_time;
2197       } else
2198 // only one notification scheme (either "submit" or "forking/joined", not both)
2199 #endif /* USE_ITT_NOTIFY */
2200           if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2201               __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2202         // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2203         __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2204       }
2205     }
2206 #endif /* USE_ITT_BUILD */
2207 
2208     /* now go on and do the work */
2209     KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2210     KMP_MB();
2211     KF_TRACE(10,
2212              ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2213               root, team, master_th, gtid));
2214 
2215 #if USE_ITT_BUILD
2216     if (__itt_stack_caller_create_ptr) {
2217       team->t.t_stack_id =
2218           __kmp_itt_stack_caller_create(); // create new stack stitching id
2219       // before entering fork barrier
2220     }
2221 #endif /* USE_ITT_BUILD */
2222 
2223 #if OMP_40_ENABLED
2224     // AC: skip __kmp_internal_fork at teams construct, let only master
2225     // threads execute
2226     if (ap)
2227 #endif /* OMP_40_ENABLED */
2228     {
2229       __kmp_internal_fork(loc, gtid, team);
2230       KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2231                     "master_th=%p, gtid=%d\n",
2232                     root, team, master_th, gtid));
2233     }
2234 
2235     if (call_context == fork_context_gnu) {
2236       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2237       return TRUE;
2238     }
2239 
2240     /* Invoke microtask for MASTER thread */
2241     KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2242                   team->t.t_id, team->t.t_pkfn));
2243   } // END of timer KMP_fork_call block
2244 
2245   {
2246     KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
2247     KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
2248     if (!team->t.t_invoke(gtid)) {
2249       KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
2250     }
2251   }
2252   KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2253                 team->t.t_id, team->t.t_pkfn));
2254   KMP_MB(); /* Flush all pending memory write invalidates.  */
2255 
2256   KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2257 
2258 #if OMPT_SUPPORT
2259   if (ompt_enabled.enabled) {
2260     master_th->th.ompt_thread_info.state = omp_state_overhead;
2261   }
2262 #endif
2263 
2264   return TRUE;
2265 }
2266 
2267 #if OMPT_SUPPORT
2268 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2269                                             kmp_team_t *team) {
2270   // restore state outside the region
2271   thread->th.ompt_thread_info.state =
2272       ((team->t.t_serialized) ? omp_state_work_serial
2273                               : omp_state_work_parallel);
2274 }
2275 
2276 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2277                                    kmp_team_t *team, ompt_data_t *parallel_data,
2278                                    fork_context_e fork_context, void *codeptr) {
2279   ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2280   if (ompt_enabled.ompt_callback_parallel_end) {
2281     ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2282         parallel_data, &(task_info->task_data), OMPT_INVOKER(fork_context),
2283         codeptr);
2284   }
2285 
2286   task_info->frame.enter_frame = NULL;
2287   __kmp_join_restore_state(thread, team);
2288 }
2289 #endif
2290 
2291 void __kmp_join_call(ident_t *loc, int gtid
2292 #if OMPT_SUPPORT
2293                      ,
2294                      enum fork_context_e fork_context
2295 #endif
2296 #if OMP_40_ENABLED
2297                      ,
2298                      int exit_teams
2299 #endif /* OMP_40_ENABLED */
2300                      ) {
2301   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2302   kmp_team_t *team;
2303   kmp_team_t *parent_team;
2304   kmp_info_t *master_th;
2305   kmp_root_t *root;
2306   int master_active;
2307   int i;
2308 
2309   KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2310 
2311   /* setup current data */
2312   master_th = __kmp_threads[gtid];
2313   root = master_th->th.th_root;
2314   team = master_th->th.th_team;
2315   parent_team = team->t.t_parent;
2316 
2317   master_th->th.th_ident = loc;
2318 
2319 #if OMPT_SUPPORT
2320   if (ompt_enabled.enabled) {
2321     master_th->th.ompt_thread_info.state = omp_state_overhead;
2322   }
2323 #endif
2324 
2325 #if KMP_DEBUG
2326   if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2327     KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2328                   "th_task_team = %p\n",
2329                   __kmp_gtid_from_thread(master_th), team,
2330                   team->t.t_task_team[master_th->th.th_task_state],
2331                   master_th->th.th_task_team));
2332     KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2333                      team->t.t_task_team[master_th->th.th_task_state]);
2334   }
2335 #endif
2336 
2337   if (team->t.t_serialized) {
2338 #if OMP_40_ENABLED
2339     if (master_th->th.th_teams_microtask) {
2340       // We are in teams construct
2341       int level = team->t.t_level;
2342       int tlevel = master_th->th.th_teams_level;
2343       if (level == tlevel) {
2344         // AC: we haven't incremented it earlier at start of teams construct,
2345         //     so do it here - at the end of teams construct
2346         team->t.t_level++;
2347       } else if (level == tlevel + 1) {
2348         // AC: we are exiting parallel inside teams, need to increment
2349         // serialization in order to restore it in the next call to
2350         // __kmpc_end_serialized_parallel
2351         team->t.t_serialized++;
2352       }
2353     }
2354 #endif /* OMP_40_ENABLED */
2355     __kmpc_end_serialized_parallel(loc, gtid);
2356 
2357 #if OMPT_SUPPORT
2358     if (ompt_enabled.enabled) {
2359       __kmp_join_restore_state(master_th, parent_team);
2360     }
2361 #endif
2362 
2363     return;
2364   }
2365 
2366   master_active = team->t.t_master_active;
2367 
2368 #if OMP_40_ENABLED
2369   if (!exit_teams)
2370 #endif /* OMP_40_ENABLED */
2371   {
    // AC: No barrier for internal teams at exit from teams construct.
    //     But there is a barrier for the external team (league).
2374     __kmp_internal_join(loc, gtid, team);
2375   }
2376 #if OMP_40_ENABLED
2377   else {
2378     master_th->th.th_task_state =
2379         0; // AC: no tasking in teams (out of any parallel)
2380   }
2381 #endif /* OMP_40_ENABLED */
2382 
2383   KMP_MB();
2384 
2385 #if OMPT_SUPPORT
2386   ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2387   void *codeptr = team->t.ompt_team_info.master_return_address;
2388 #endif
2389 
2390 #if USE_ITT_BUILD
2391   if (__itt_stack_caller_create_ptr) {
2392     __kmp_itt_stack_caller_destroy(
2393         (__itt_caller)team->t
2394             .t_stack_id); // destroy the stack stitching id after join barrier
2395   }
2396 
2397   // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2398   if (team->t.t_active_level == 1
2399 #if OMP_40_ENABLED
2400       && !master_th->th.th_teams_microtask /* not in teams construct */
2401 #endif /* OMP_40_ENABLED */
2402       ) {
2403     master_th->th.th_ident = loc;
2404     // only one notification scheme (either "submit" or "forking/joined", not
2405     // both)
2406     if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2407         __kmp_forkjoin_frames_mode == 3)
2408       __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2409                              master_th->th.th_frame_time, 0, loc,
2410                              master_th->th.th_team_nproc, 1);
2411     else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2412              !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2413       __kmp_itt_region_joined(gtid);
2414   } // active_level == 1
2415 #endif /* USE_ITT_BUILD */
2416 
2417 #if OMP_40_ENABLED
2418   if (master_th->th.th_teams_microtask && !exit_teams &&
2419       team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2420       team->t.t_level == master_th->th.th_teams_level + 1) {
2421     // AC: We need to leave the team structure intact at the end of parallel
2422     // inside the teams construct, so that at the next parallel same (hot) team
2423     // works, only adjust nesting levels
2424 
2425     /* Decrement our nested depth level */
2426     team->t.t_level--;
2427     team->t.t_active_level--;
2428     KMP_TEST_THEN_DEC32((kmp_int32 *)&root->r.r_in_parallel);
2429 
2430     /* Restore number of threads in the team if needed */
2431     if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2432       int old_num = master_th->th.th_team_nproc;
2433       int new_num = master_th->th.th_teams_size.nth;
2434       kmp_info_t **other_threads = team->t.t_threads;
2435       team->t.t_nproc = new_num;
2436       for (i = 0; i < old_num; ++i) {
2437         other_threads[i]->th.th_team_nproc = new_num;
2438       }
      // Adjust the state of threads not used by the inner parallel
2440       for (i = old_num; i < new_num; ++i) {
2441         // Re-initialize thread's barrier data.
2442         int b;
2443         kmp_balign_t *balign = other_threads[i]->th.th_bar;
2444         for (b = 0; b < bs_last_barrier; ++b) {
2445           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2446           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2447 #if USE_DEBUGGER
2448           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2449 #endif
2450         }
2451         if (__kmp_tasking_mode != tskm_immediate_exec) {
2452           // Synchronize thread's task state
2453           other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2454         }
2455       }
2456     }
2457 
2458 #if OMPT_SUPPORT
2459     if (ompt_enabled.enabled) {
2460       __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
2461                       codeptr);
2462     }
2463 #endif
2464 
2465     return;
2466   }
2467 #endif /* OMP_40_ENABLED */
2468 
2469   /* do cleanup and restore the parent team */
2470   master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2471   master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2472 
2473   master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2474 
2475   /* jc: The following lock has instructions with REL and ACQ semantics,
2476      separating the parallel user code called in this parallel region
2477      from the serial user code called after this function returns. */
2478   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2479 
2480 #if OMP_40_ENABLED
2481   if (!master_th->th.th_teams_microtask ||
2482       team->t.t_level > master_th->th.th_teams_level)
2483 #endif /* OMP_40_ENABLED */
2484   {
2485     /* Decrement our nested depth level */
2486     KMP_TEST_THEN_DEC32((kmp_int32 *)&root->r.r_in_parallel);
2487   }
2488   KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2489 
2490 #if OMPT_SUPPORT
2491   if (ompt_enabled.enabled) {
2492     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2493     if (ompt_enabled.ompt_callback_implicit_task) {
2494       int ompt_team_size = team->t.t_nproc;
2495       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2496           ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2497           __kmp_tid_from_gtid(gtid));
2498     }
2499 
2500     task_info->frame.exit_frame = NULL;
2501     task_info->task_data = ompt_data_none;
2502   }
2503 #endif
2504 
2505   KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2506                 master_th, team));
2507   __kmp_pop_current_task_from_thread(master_th);
2508 
2509 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
2510   // Restore master thread's partition.
2511   master_th->th.th_first_place = team->t.t_first_place;
2512   master_th->th.th_last_place = team->t.t_last_place;
2513 #endif /* OMP_40_ENABLED */
2514 
2515   updateHWFPControl(team);
2516 
2517   if (root->r.r_active != master_active)
2518     root->r.r_active = master_active;
2519 
2520   __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2521                             master_th)); // this will free worker threads
2522 
2523   /* this race was fun to find. make sure the following is in the critical
2524      region otherwise assertions may fail occasionally since the old team may be
2525      reallocated and the hierarchy appears inconsistent. it is actually safe to
2526      run and won't cause any bugs, but will cause those assertion failures. it's
2527      only one deref&assign so might as well put this in the critical region */
2528   master_th->th.th_team = parent_team;
2529   master_th->th.th_team_nproc = parent_team->t.t_nproc;
2530   master_th->th.th_team_master = parent_team->t.t_threads[0];
2531   master_th->th.th_team_serialized = parent_team->t.t_serialized;
2532 
2533   /* restore serialized team, if need be */
2534   if (parent_team->t.t_serialized &&
2535       parent_team != master_th->th.th_serial_team &&
2536       parent_team != root->r.r_root_team) {
2537     __kmp_free_team(root,
2538                     master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2539     master_th->th.th_serial_team = parent_team;
2540   }
2541 
2542   if (__kmp_tasking_mode != tskm_immediate_exec) {
2543     if (master_th->th.th_task_state_top >
2544         0) { // Restore task state from memo stack
2545       KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2546       // Remember master's state if we re-use this nested hot team
2547       master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2548           master_th->th.th_task_state;
2549       --master_th->th.th_task_state_top; // pop
2550       // Now restore state at this level
2551       master_th->th.th_task_state =
2552           master_th->th
2553               .th_task_state_memo_stack[master_th->th.th_task_state_top];
2554     }
2555     // Copy the task team from the parent team to the master thread
2556     master_th->th.th_task_team =
2557         parent_team->t.t_task_team[master_th->th.th_task_state];
2558     KA_TRACE(20,
2559              ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2560               __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2561               parent_team));
2562   }
2563 
2564   // TODO: GEH - cannot do this assertion because root thread not set up as
2565   // executing
2566   // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2567   master_th->th.th_current_task->td_flags.executing = 1;
2568 
2569   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2570 
2571 #if OMPT_SUPPORT
2572   if (ompt_enabled.enabled) {
2573     __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
2574                     codeptr);
2575   }
2576 #endif
2577 
2578   KMP_MB();
2579   KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2580 }
2581 
2582 /* Check whether we should push an internal control record onto the
2583    serial team stack.  If so, do it.  */
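/* A record is pushed only for nested serialized parallel regions
   (t_serialized > 1), so that ICV changes made inside such a region -- for
   example via omp_set_num_threads() -- can be undone when the serialized
   region ends (see __kmpc_end_serialized_parallel). Illustrative scenario
   only:

     #pragma omp parallel num_threads(1)       // serialized
     #pragma omp parallel num_threads(1)       // nested, also serialized
     { omp_set_num_threads(8); }               // current ICVs saved here and
                                               // restored at the inner join
*/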
2584 void __kmp_save_internal_controls(kmp_info_t *thread) {
2585 
2586   if (thread->th.th_team != thread->th.th_serial_team) {
2587     return;
2588   }
2589   if (thread->th.th_team->t.t_serialized > 1) {
2590     int push = 0;
2591 
2592     if (thread->th.th_team->t.t_control_stack_top == NULL) {
2593       push = 1;
2594     } else {
2595       if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2596           thread->th.th_team->t.t_serialized) {
2597         push = 1;
2598       }
2599     }
2600     if (push) { /* push a record on the serial team's stack */
2601       kmp_internal_control_t *control =
2602           (kmp_internal_control_t *)__kmp_allocate(
2603               sizeof(kmp_internal_control_t));
2604 
2605       copy_icvs(control, &thread->th.th_current_task->td_icvs);
2606 
2607       control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2608 
2609       control->next = thread->th.th_team->t.t_control_stack_top;
2610       thread->th.th_team->t.t_control_stack_top = control;
2611     }
2612   }
2613 }
2614 
2615 /* Changes set_nproc */
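/* Reached from the user-level omp_set_num_threads() entry point. The requested
   value is clamped to [1, __kmp_max_nth] and stored in the calling thread's
   nproc ICV. Illustrative user code:

     omp_set_num_threads(8);   // arrives here with new_nth == 8
*/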
2616 void __kmp_set_num_threads(int new_nth, int gtid) {
2617   kmp_info_t *thread;
2618   kmp_root_t *root;
2619 
2620   KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2621   KMP_DEBUG_ASSERT(__kmp_init_serial);
2622 
2623   if (new_nth < 1)
2624     new_nth = 1;
2625   else if (new_nth > __kmp_max_nth)
2626     new_nth = __kmp_max_nth;
2627 
2628   KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2629   thread = __kmp_threads[gtid];
2630 
2631   __kmp_save_internal_controls(thread);
2632 
2633   set__nproc(thread, new_nth);
2634 
2635   // If this omp_set_num_threads() call will cause the hot team size to be
2636   // reduced (in the absence of a num_threads clause), then reduce it now,
2637   // rather than waiting for the next parallel region.
2638   root = thread->th.th_root;
2639   if (__kmp_init_parallel && (!root->r.r_active) &&
2640       (root->r.r_hot_team->t.t_nproc > new_nth)
2641 #if KMP_NESTED_HOT_TEAMS
2642       && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2643 #endif
2644       ) {
2645     kmp_team_t *hot_team = root->r.r_hot_team;
2646     int f;
2647 
2648     __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2649 
2650     // Release the extra threads we don't need any more.
2651     for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2652       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2653       if (__kmp_tasking_mode != tskm_immediate_exec) {
        // When decreasing the team size, threads that are no longer in the
        // team should unreference the task team.
2656         hot_team->t.t_threads[f]->th.th_task_team = NULL;
2657       }
2658       __kmp_free_thread(hot_team->t.t_threads[f]);
2659       hot_team->t.t_threads[f] = NULL;
2660     }
2661     hot_team->t.t_nproc = new_nth;
2662 #if KMP_NESTED_HOT_TEAMS
2663     if (thread->th.th_hot_teams) {
2664       KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2665       thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2666     }
2667 #endif
2668 
2669     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2670 
2671     // Update the t_nproc field in the threads that are still active.
2672     for (f = 0; f < new_nth; f++) {
2673       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2674       hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2675     }
    // Special flag to mark that omp_set_num_threads() changed the team size
2677     hot_team->t.t_size_changed = -1;
2678   }
2679 }
2680 
2681 /* Changes max_active_levels */
2682 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2683   kmp_info_t *thread;
2684 
2685   KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2686                 "%d = (%d)\n",
2687                 gtid, max_active_levels));
2688   KMP_DEBUG_ASSERT(__kmp_init_serial);
2689 
2690   // validate max_active_levels
2691   if (max_active_levels < 0) {
2692     KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2693     // We ignore this call if the user has specified a negative value.
2694     // The current setting won't be changed. The last valid setting will be
2695     // used. A warning will be issued (if warnings are allowed as controlled by
2696     // the KMP_WARNINGS env var).
2697     KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2698                   "max_active_levels for thread %d = (%d)\n",
2699                   gtid, max_active_levels));
2700     return;
2701   }
2702   if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2703     // it's OK, the max_active_levels is within the valid range: [ 0;
2704     // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2705     // We allow a zero value. (implementation defined behavior)
2706   } else {
2707     KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2708                 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2709     max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2710     // Current upper limit is MAX_INT. (implementation defined behavior)
2711     // If the input exceeds the upper limit, we correct the input to be the
2712     // upper limit. (implementation defined behavior)
    // The flow should never get here as long as the limit is MAX_INT.
2714   }
2715   KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2716                 "max_active_levels for thread %d = (%d)\n",
2717                 gtid, max_active_levels));
2718 
2719   thread = __kmp_threads[gtid];
2720 
2721   __kmp_save_internal_controls(thread);
2722 
2723   set__max_active_levels(thread, max_active_levels);
2724 }
2725 
2726 /* Gets max_active_levels */
2727 int __kmp_get_max_active_levels(int gtid) {
2728   kmp_info_t *thread;
2729 
2730   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2731   KMP_DEBUG_ASSERT(__kmp_init_serial);
2732 
2733   thread = __kmp_threads[gtid];
2734   KMP_DEBUG_ASSERT(thread->th.th_current_task);
2735   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2736                 "curtask_maxaclevel=%d\n",
2737                 gtid, thread->th.th_current_task,
2738                 thread->th.th_current_task->td_icvs.max_active_levels));
2739   return thread->th.th_current_task->td_icvs.max_active_levels;
2740 }
2741 
2742 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
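/* Reached from omp_set_schedule(); the incoming omp_sched_t value is expected
   to map directly onto kmp_sched_t. Illustrative user code, assuming that
   direct mapping:

     omp_set_schedule(omp_sched_dynamic, 4); // kind == kmp_sched_dynamic,
                                             // chunk == 4
*/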
2743 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2744   kmp_info_t *thread;
2745   //    kmp_team_t *team;
2746 
2747   KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2748                 gtid, (int)kind, chunk));
2749   KMP_DEBUG_ASSERT(__kmp_init_serial);
2750 
2751   // Check if the kind parameter is valid, correct if needed.
2752   // Valid parameters should fit in one of two intervals - standard or extended:
2753   //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2754   // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
2755   if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2756       (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2757     // TODO: Hint needs attention in case we change the default schedule.
2758     __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2759               KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2760               __kmp_msg_null);
2761     kind = kmp_sched_default;
2762     chunk = 0; // ignore chunk value in case of bad kind
2763   }
2764 
2765   thread = __kmp_threads[gtid];
2766 
2767   __kmp_save_internal_controls(thread);
2768 
2769   if (kind < kmp_sched_upper_std) {
2770     if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
      // distinguish static chunked vs. unchunked: the chunk should be invalid
      // to indicate an unchunked schedule (which is the default)
2773       thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2774     } else {
2775       thread->th.th_current_task->td_icvs.sched.r_sched_type =
2776           __kmp_sch_map[kind - kmp_sched_lower - 1];
2777     }
2778   } else {
2779     //    __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2780     //    kmp_sched_lower - 2 ];
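    // The extended kinds continue the __kmp_sch_map indexing right after the
    // standard kinds. For example, with the 2008-01-25 values listed above
    // (lower=0, upper_std=5, lower_ext=100), an extended kind of 101 maps to
    // __kmp_sch_map[101 - 100 + 5 - 0 - 2] = __kmp_sch_map[4], i.e. the entry
    // following the four standard kinds (indices 0..3).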
2781     thread->th.th_current_task->td_icvs.sched.r_sched_type =
2782         __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2783                       kmp_sched_lower - 2];
2784   }
2785   if (kind == kmp_sched_auto || chunk < 1) {
2786     // ignore parameter chunk for schedule auto
2787     thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2788   } else {
2789     thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2790   }
2791 }
2792 
2793 /* Gets def_sched_var ICV values */
2794 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2795   kmp_info_t *thread;
2796   enum sched_type th_type;
2797 
2798   KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2799   KMP_DEBUG_ASSERT(__kmp_init_serial);
2800 
2801   thread = __kmp_threads[gtid];
2802 
2803   th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2804 
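  // Map the internal sched_type back to the user-visible kmp_sched_t kind.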
2805   switch (th_type) {
2806   case kmp_sch_static:
2807   case kmp_sch_static_greedy:
2808   case kmp_sch_static_balanced:
2809     *kind = kmp_sched_static;
    *chunk = 0; // chunk was not set; indicate this with a zero value
2811     return;
2812   case kmp_sch_static_chunked:
2813     *kind = kmp_sched_static;
2814     break;
2815   case kmp_sch_dynamic_chunked:
2816     *kind = kmp_sched_dynamic;
2817     break;
2818   case kmp_sch_guided_chunked:
2819   case kmp_sch_guided_iterative_chunked:
2820   case kmp_sch_guided_analytical_chunked:
2821     *kind = kmp_sched_guided;
2822     break;
2823   case kmp_sch_auto:
2824     *kind = kmp_sched_auto;
2825     break;
2826   case kmp_sch_trapezoidal:
2827     *kind = kmp_sched_trapezoidal;
2828     break;
2829 #if KMP_STATIC_STEAL_ENABLED
2830   case kmp_sch_static_steal:
2831     *kind = kmp_sched_static_steal;
2832     break;
2833 #endif
2834   default:
2835     KMP_FATAL(UnknownSchedulingType, th_type);
2836   }
2837 
2838   *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2839 }
2840 
2841 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2842 
2843   int ii, dd;
2844   kmp_team_t *team;
2845   kmp_info_t *thr;
2846 
2847   KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2848   KMP_DEBUG_ASSERT(__kmp_init_serial);
2849 
2850   // validate level
2851   if (level == 0)
2852     return 0;
2853   if (level < 0)
2854     return -1;
2855   thr = __kmp_threads[gtid];
2856   team = thr->th.th_team;
2857   ii = team->t.t_level;
2858   if (level > ii)
2859     return -1;
2860 
2861 #if OMP_40_ENABLED
2862   if (thr->th.th_teams_microtask) {
2863     // AC: we are in teams region where multiple nested teams have same level
2864     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2865     if (level <=
2866         tlevel) { // otherwise usual algorithm works (will not touch the teams)
2867       KMP_DEBUG_ASSERT(ii >= tlevel);
2868       // AC: As we need to pass by the teams league, we need to artificially
2869       // increase ii
2870       if (ii == tlevel) {
2871         ii += 2; // three teams have same level
2872       } else {
2873         ii++; // two teams have same level
2874       }
2875     }
2876   }
2877 #endif
2878 
2879   if (ii == level)
2880     return __kmp_tid_from_gtid(gtid);
2881 
2882   dd = team->t.t_serialized;
2883   level++;
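  // Walk up the team hierarchy toward the requested level: first consume any
  // serialized nesting levels recorded in the current team (dd), then move to
  // the parent team, until ii reaches the requested level.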
2884   while (ii > level) {
2885     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2886     }
2887     if ((team->t.t_serialized) && (!dd)) {
2888       team = team->t.t_parent;
2889       continue;
2890     }
2891     if (ii > level) {
2892       team = team->t.t_parent;
2893       dd = team->t.t_serialized;
2894       ii--;
2895     }
2896   }
2897 
2898   return (dd > 1) ? (0) : (team->t.t_master_tid);
2899 }
2900 
2901 int __kmp_get_team_size(int gtid, int level) {
2902 
2903   int ii, dd;
2904   kmp_team_t *team;
2905   kmp_info_t *thr;
2906 
2907   KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2908   KMP_DEBUG_ASSERT(__kmp_init_serial);
2909 
2910   // validate level
2911   if (level == 0)
2912     return 1;
2913   if (level < 0)
2914     return -1;
2915   thr = __kmp_threads[gtid];
2916   team = thr->th.th_team;
2917   ii = team->t.t_level;
2918   if (level > ii)
2919     return -1;
2920 
2921 #if OMP_40_ENABLED
2922   if (thr->th.th_teams_microtask) {
2923     // AC: we are in teams region where multiple nested teams have same level
2924     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2925     if (level <=
2926         tlevel) { // otherwise usual algorithm works (will not touch the teams)
2927       KMP_DEBUG_ASSERT(ii >= tlevel);
2928       // AC: As we need to pass by the teams league, we need to artificially
2929       // increase ii
2930       if (ii == tlevel) {
2931         ii += 2; // three teams have same level
2932       } else {
2933         ii++; // two teams have same level
2934       }
2935     }
2936   }
2937 #endif
2938 
2939   while (ii > level) {
2940     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2941     }
2942     if (team->t.t_serialized && (!dd)) {
2943       team = team->t.t_parent;
2944       continue;
2945     }
2946     if (ii > level) {
2947       team = team->t.t_parent;
2948       ii--;
2949     }
2950   }
2951 
2952   return team->t.t_nproc;
2953 }
2954 
2955 kmp_r_sched_t __kmp_get_schedule_global() {
  // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
  // independently, so the updated schedule can be obtained here.
2959 
2960   kmp_r_sched_t r_sched;
2961 
2962   // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
2963   // __kmp_guided. __kmp_sched should keep original value, so that user can set
2964   // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
2965   // different roots (even in OMP 2.5)
2966   if (__kmp_sched == kmp_sch_static) {
2967     // replace STATIC with more detailed schedule (balanced or greedy)
2968     r_sched.r_sched_type = __kmp_static;
2969   } else if (__kmp_sched == kmp_sch_guided_chunked) {
2970     // replace GUIDED with more detailed schedule (iterative or analytical)
2971     r_sched.r_sched_type = __kmp_guided;
2972   } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2973     r_sched.r_sched_type = __kmp_sched;
2974   }
2975 
2976   if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
    // __kmp_chunk may be wrong here (if it was never set)
2978     r_sched.chunk = KMP_DEFAULT_CHUNK;
2979   } else {
2980     r_sched.chunk = __kmp_chunk;
2981   }
2982 
2983   return r_sched;
2984 }
2985 
/* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
   at least argc entries in the t_argv array for the requested team. */
2988 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
2989 
2990   KMP_DEBUG_ASSERT(team);
2991   if (!realloc || argc > team->t.t_max_argc) {
2992 
2993     KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
2994                    "current entries=%d\n",
2995                    team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
2996     /* if previously allocated heap space for args, free them */
2997     if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
2998       __kmp_free((void *)team->t.t_argv);
2999 
3000     if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3001       /* use unused space in the cache line for arguments */
3002       team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3003       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3004                      "argv entries\n",
3005                      team->t.t_id, team->t.t_max_argc));
3006       team->t.t_argv = &team->t.t_inline_argv[0];
3007       if (__kmp_storage_map) {
3008         __kmp_print_storage_map_gtid(
3009             -1, &team->t.t_inline_argv[0],
3010             &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3011             (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3012             team->t.t_id);
3013       }
3014     } else {
3015       /* allocate space for arguments in the heap */
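      // Size the heap allocation to at least KMP_MIN_MALLOC_ARGV_ENTRIES, or
      // to twice the requested argc, so that subsequent regions with a similar
      // argc are unlikely to force another reallocation.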
3016       team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3017                                ? KMP_MIN_MALLOC_ARGV_ENTRIES
3018                                : 2 * argc;
3019       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3020                      "argv entries\n",
3021                      team->t.t_id, team->t.t_max_argc));
3022       team->t.t_argv =
3023           (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3024       if (__kmp_storage_map) {
3025         __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3026                                      &team->t.t_argv[team->t.t_max_argc],
3027                                      sizeof(void *) * team->t.t_max_argc,
3028                                      "team_%d.t_argv", team->t.t_id);
3029       }
3030     }
3031   }
3032 }
3033 
3034 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3035   int i;
3036   int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3037   team->t.t_threads =
3038       (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3039   team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3040       sizeof(dispatch_shared_info_t) * num_disp_buff);
3041   team->t.t_dispatch =
3042       (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3043   team->t.t_implicit_task_taskdata =
3044       (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3045   team->t.t_max_nproc = max_nth;
3046 
3047   /* setup dispatch buffers */
3048   for (i = 0; i < num_disp_buff; ++i) {
3049     team->t.t_disp_buffer[i].buffer_index = i;
3050 #if OMP_45_ENABLED
3051     team->t.t_disp_buffer[i].doacross_buf_idx = i;
3052 #endif
3053   }
3054 }
3055 
3056 static void __kmp_free_team_arrays(kmp_team_t *team) {
3057   /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3058   int i;
3059   for (i = 0; i < team->t.t_max_nproc; ++i) {
3060     if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3061       __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3062       team->t.t_dispatch[i].th_disp_buffer = NULL;
3063     }
3064   }
3065   __kmp_free(team->t.t_threads);
3066   __kmp_free(team->t.t_disp_buffer);
3067   __kmp_free(team->t.t_dispatch);
3068   __kmp_free(team->t.t_implicit_task_taskdata);
3069   team->t.t_threads = NULL;
3070   team->t.t_disp_buffer = NULL;
3071   team->t.t_dispatch = NULL;
3072   team->t.t_implicit_task_taskdata = 0;
3073 }
3074 
3075 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3076   kmp_info_t **oldThreads = team->t.t_threads;
3077 
3078   __kmp_free(team->t.t_disp_buffer);
3079   __kmp_free(team->t.t_dispatch);
3080   __kmp_free(team->t.t_implicit_task_taskdata);
3081   __kmp_allocate_team_arrays(team, max_nth);
3082 
3083   KMP_MEMCPY(team->t.t_threads, oldThreads,
3084              team->t.t_nproc * sizeof(kmp_info_t *));
3085 
3086   __kmp_free(oldThreads);
3087 }
3088 
3089 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3090 
3091   kmp_r_sched_t r_sched =
3092       __kmp_get_schedule_global(); // get current state of scheduling globals
3093 
3094 #if OMP_40_ENABLED
3095   KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3096 #endif /* OMP_40_ENABLED */
3097 
3098   kmp_internal_control_t g_icvs = {
3099     0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3100     (kmp_int8)__kmp_dflt_nested, // int nested; //internal control
3101     // for nested parallelism (per thread)
3102     (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3103     // adjustment of threads (per thread)
3104     (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3105     // whether blocktime is explicitly set
3106     __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3107 #if KMP_USE_MONITOR
3108     __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3109 // intervals
3110 #endif
3111     __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3112     // next parallel region (per thread)
3113     // (use a max ub on value if __kmp_parallel_initialize not called yet)
3114     __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3115     // for max_active_levels
3116     r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3117 // {sched,chunk} pair
3118 #if OMP_40_ENABLED
3119     __kmp_nested_proc_bind.bind_types[0],
3120     __kmp_default_device,
3121 #endif /* OMP_40_ENABLED */
3122     NULL // struct kmp_internal_control *next;
3123   };
3124 
3125   return g_icvs;
3126 }
3127 
3128 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3129 
3130   kmp_internal_control_t gx_icvs;
3131   gx_icvs.serial_nesting_level =
3132       0; // probably =team->t.t_serial like in save_inter_controls
3133   copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3134   gx_icvs.next = NULL;
3135 
3136   return gx_icvs;
3137 }
3138 
3139 static void __kmp_initialize_root(kmp_root_t *root) {
3140   int f;
3141   kmp_team_t *root_team;
3142   kmp_team_t *hot_team;
3143   int hot_team_max_nth;
3144   kmp_r_sched_t r_sched =
3145       __kmp_get_schedule_global(); // get current state of scheduling globals
3146   kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3147   KMP_DEBUG_ASSERT(root);
3148   KMP_ASSERT(!root->r.r_begin);
3149 
3150   /* setup the root state structure */
3151   __kmp_init_lock(&root->r.r_begin_lock);
3152   root->r.r_begin = FALSE;
3153   root->r.r_active = FALSE;
3154   root->r.r_in_parallel = 0;
3155   root->r.r_blocktime = __kmp_dflt_blocktime;
3156   root->r.r_nested = __kmp_dflt_nested;
3157   root->r.r_cg_nthreads = 1;
3158 
3159   /* setup the root team for this task */
3160   /* allocate the root team structure */
3161   KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3162 
3163   root_team =
3164       __kmp_allocate_team(root,
3165                           1, // new_nproc
3166                           1, // max_nproc
3167 #if OMPT_SUPPORT
3168                           ompt_data_none, // root parallel id
3169 #endif
3170 #if OMP_40_ENABLED
3171                           __kmp_nested_proc_bind.bind_types[0],
3172 #endif
3173                           &r_icvs,
3174                           0 // argc
3175                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3176                           );
3177 #if USE_DEBUGGER
3178   // Non-NULL value should be assigned to make the debugger display the root
3179   // team.
3180   TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3181 #endif
3182 
3183   KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3184 
3185   root->r.r_root_team = root_team;
3186   root_team->t.t_control_stack_top = NULL;
3187 
3188   /* initialize root team */
3189   root_team->t.t_threads[0] = NULL;
3190   root_team->t.t_nproc = 1;
3191   root_team->t.t_serialized = 1;
3192   // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3193   root_team->t.t_sched.sched = r_sched.sched;
3194   KA_TRACE(
3195       20,
3196       ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3197        root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3198 
  /* setup the hot team for this task */
3200   /* allocate the hot team structure */
3201   KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3202 
3203   hot_team =
3204       __kmp_allocate_team(root,
3205                           1, // new_nproc
3206                           __kmp_dflt_team_nth_ub * 2, // max_nproc
3207 #if OMPT_SUPPORT
3208                           ompt_data_none, // root parallel id
3209 #endif
3210 #if OMP_40_ENABLED
3211                           __kmp_nested_proc_bind.bind_types[0],
3212 #endif
3213                           &r_icvs,
3214                           0 // argc
3215                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3216                           );
3217   KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3218 
3219   root->r.r_hot_team = hot_team;
3220   root_team->t.t_control_stack_top = NULL;
3221 
3222   /* first-time initialization */
3223   hot_team->t.t_parent = root_team;
3224 
3225   /* initialize hot team */
3226   hot_team_max_nth = hot_team->t.t_max_nproc;
3227   for (f = 0; f < hot_team_max_nth; ++f) {
3228     hot_team->t.t_threads[f] = NULL;
3229   }
3230   hot_team->t.t_nproc = 1;
3231   // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3232   hot_team->t.t_sched.sched = r_sched.sched;
3233   hot_team->t.t_size_changed = 0;
3234 }
3235 
3236 #ifdef KMP_DEBUG
3237 
3238 typedef struct kmp_team_list_item {
3239   kmp_team_p const *entry;
3240   struct kmp_team_list_item *next;
3241 } kmp_team_list_item_t;
3242 typedef kmp_team_list_item_t *kmp_team_list_t;
3243 
3244 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3245     kmp_team_list_t list, // List of teams.
3246     kmp_team_p const *team // Team to add.
3247     ) {
3248 
3249   // List must terminate with item where both entry and next are NULL.
3250   // Team is added to the list only once.
3251   // List is sorted in ascending order by team id.
3252   // Team id is *not* a key.
3253 
3254   kmp_team_list_t l;
3255 
3256   KMP_DEBUG_ASSERT(list != NULL);
3257   if (team == NULL) {
3258     return;
3259   }
3260 
3261   __kmp_print_structure_team_accum(list, team->t.t_parent);
3262   __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3263 
3264   // Search list for the team.
3265   l = list;
3266   while (l->next != NULL && l->entry != team) {
3267     l = l->next;
3268   }
3269   if (l->next != NULL) {
3270     return; // Team has been added before, exit.
3271   }
3272 
3273   // Team is not found. Search list again for insertion point.
3274   l = list;
3275   while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3276     l = l->next;
3277   }
3278 
3279   // Insert team.
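  // The new item is inserted *before* node l without re-walking the list:
  // copy l's contents into the new item, then overwrite l in place with the
  // team being added. This also keeps the terminating {NULL, NULL} item at
  // the tail of the list.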
3280   {
3281     kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3282         sizeof(kmp_team_list_item_t));
3283     *item = *l;
3284     l->entry = team;
3285     l->next = item;
3286   }
3287 }
3288 
static void __kmp_print_structure_team(char const *title,
                                       kmp_team_p const *team) {
3292   __kmp_printf("%s", title);
3293   if (team != NULL) {
3294     __kmp_printf("%2x %p\n", team->t.t_id, team);
3295   } else {
3296     __kmp_printf(" - (nil)\n");
3297   }
3298 }
3299 
3300 static void __kmp_print_structure_thread(char const *title,
3301                                          kmp_info_p const *thread) {
3302   __kmp_printf("%s", title);
3303   if (thread != NULL) {
3304     __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3305   } else {
3306     __kmp_printf(" - (nil)\n");
3307   }
3308 }
3309 
3310 void __kmp_print_structure(void) {
3311 
3312   kmp_team_list_t list;
3313 
3314   // Initialize list of teams.
3315   list =
3316       (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3317   list->entry = NULL;
3318   list->next = NULL;
3319 
3320   __kmp_printf("\n------------------------------\nGlobal Thread "
3321                "Table\n------------------------------\n");
3322   {
3323     int gtid;
3324     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3325       __kmp_printf("%2d", gtid);
3326       if (__kmp_threads != NULL) {
3327         __kmp_printf(" %p", __kmp_threads[gtid]);
3328       }
3329       if (__kmp_root != NULL) {
3330         __kmp_printf(" %p", __kmp_root[gtid]);
3331       }
3332       __kmp_printf("\n");
3333     }
3334   }
3335 
3336   // Print out __kmp_threads array.
3337   __kmp_printf("\n------------------------------\nThreads\n--------------------"
3338                "----------\n");
3339   if (__kmp_threads != NULL) {
3340     int gtid;
3341     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3342       kmp_info_t const *thread = __kmp_threads[gtid];
3343       if (thread != NULL) {
3344         __kmp_printf("GTID %2d %p:\n", gtid, thread);
3345         __kmp_printf("    Our Root:        %p\n", thread->th.th_root);
3346         __kmp_print_structure_team("    Our Team:     ", thread->th.th_team);
3347         __kmp_print_structure_team("    Serial Team:  ",
3348                                    thread->th.th_serial_team);
3349         __kmp_printf("    Threads:      %2d\n", thread->th.th_team_nproc);
3350         __kmp_print_structure_thread("    Master:       ",
3351                                      thread->th.th_team_master);
3352         __kmp_printf("    Serialized?:  %2d\n", thread->th.th_team_serialized);
3353         __kmp_printf("    Set NProc:    %2d\n", thread->th.th_set_nproc);
3354 #if OMP_40_ENABLED
3355         __kmp_printf("    Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3356 #endif
3357         __kmp_print_structure_thread("    Next in pool: ",
3358                                      thread->th.th_next_pool);
3359         __kmp_printf("\n");
3360         __kmp_print_structure_team_accum(list, thread->th.th_team);
3361         __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3362       }
3363     }
3364   } else {
3365     __kmp_printf("Threads array is not allocated.\n");
3366   }
3367 
3368   // Print out __kmp_root array.
3369   __kmp_printf("\n------------------------------\nUbers\n----------------------"
3370                "--------\n");
3371   if (__kmp_root != NULL) {
3372     int gtid;
3373     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3374       kmp_root_t const *root = __kmp_root[gtid];
3375       if (root != NULL) {
3376         __kmp_printf("GTID %2d %p:\n", gtid, root);
3377         __kmp_print_structure_team("    Root Team:    ", root->r.r_root_team);
3378         __kmp_print_structure_team("    Hot Team:     ", root->r.r_hot_team);
3379         __kmp_print_structure_thread("    Uber Thread:  ",
3380                                      root->r.r_uber_thread);
3381         __kmp_printf("    Active?:      %2d\n", root->r.r_active);
3382         __kmp_printf("    Nested?:      %2d\n", root->r.r_nested);
3383         __kmp_printf("    In Parallel:  %2d\n", root->r.r_in_parallel);
3384         __kmp_printf("\n");
3385         __kmp_print_structure_team_accum(list, root->r.r_root_team);
3386         __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3387       }
3388     }
3389   } else {
3390     __kmp_printf("Ubers array is not allocated.\n");
3391   }
3392 
3393   __kmp_printf("\n------------------------------\nTeams\n----------------------"
3394                "--------\n");
3395   while (list->next != NULL) {
3396     kmp_team_p const *team = list->entry;
3397     int i;
3398     __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3399     __kmp_print_structure_team("    Parent Team:      ", team->t.t_parent);
3400     __kmp_printf("    Master TID:       %2d\n", team->t.t_master_tid);
3401     __kmp_printf("    Max threads:      %2d\n", team->t.t_max_nproc);
3402     __kmp_printf("    Levels of serial: %2d\n", team->t.t_serialized);
3403     __kmp_printf("    Number threads:   %2d\n", team->t.t_nproc);
3404     for (i = 0; i < team->t.t_nproc; ++i) {
3405       __kmp_printf("    Thread %2d:      ", i);
3406       __kmp_print_structure_thread("", team->t.t_threads[i]);
3407     }
3408     __kmp_print_structure_team("    Next in pool:     ", team->t.t_next_pool);
3409     __kmp_printf("\n");
3410     list = list->next;
3411   }
3412 
3413   // Print out __kmp_thread_pool and __kmp_team_pool.
3414   __kmp_printf("\n------------------------------\nPools\n----------------------"
3415                "--------\n");
3416   __kmp_print_structure_thread("Thread pool:          ",
3417                                CCAST(kmp_info_t *, __kmp_thread_pool));
3418   __kmp_print_structure_team("Team pool:            ",
3419                              CCAST(kmp_team_t *, __kmp_team_pool));
3420   __kmp_printf("\n");
3421 
3422   // Free team list.
3423   while (list != NULL) {
3424     kmp_team_list_item_t *item = list;
3425     list = list->next;
3426     KMP_INTERNAL_FREE(item);
3427   }
3428 }
3429 
3430 #endif
3431 
3432 //---------------------------------------------------------------------------
3433 //  Stuff for per-thread fast random number generator
3434 //  Table of primes
3435 static const unsigned __kmp_primes[] = {
3436     0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3437     0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3438     0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3439     0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3440     0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3441     0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3442     0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3443     0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3444     0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3445     0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3446     0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3447 
3448 //---------------------------------------------------------------------------
3449 //  __kmp_get_random: Get a random number using a linear congruential method.
3450 unsigned short __kmp_get_random(kmp_info_t *thread) {
3451   unsigned x = thread->th.th_x;
3452   unsigned short r = x >> 16;
3453 
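  // Linear congruential update: x_{n+1} = a * x_n + 1 (mod 2^32, via unsigned
  // overflow). Only the high 16 bits are returned, since the low-order bits of
  // a power-of-two-modulus LCG have much shorter periods.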
3454   thread->th.th_x = x * thread->th.th_a + 1;
3455 
3456   KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3457                 thread->th.th_info.ds.ds_tid, r));
3458 
3459   return r;
3460 }
3461 //--------------------------------------------------------
3462 // __kmp_init_random: Initialize a random number generator
3463 void __kmp_init_random(kmp_info_t *thread) {
3464   unsigned seed = thread->th.th_info.ds.ds_tid;
3465 
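  // Pick a distinct LCG multiplier per thread from the primes table (indexed
  // by tid modulo the table size) so that per-thread streams differ.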
3466   thread->th.th_a =
3467       __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3468   thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3469   KA_TRACE(30,
3470            ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3471 }
3472 
3473 #if KMP_OS_WINDOWS
3474 /* reclaim array entries for root threads that are already dead, returns number
3475  * reclaimed */
3476 static int __kmp_reclaim_dead_roots(void) {
3477   int i, r = 0;
3478 
3479   for (i = 0; i < __kmp_threads_capacity; ++i) {
3480     if (KMP_UBER_GTID(i) &&
3481         !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3482         !__kmp_root[i]
3483              ->r.r_active) { // AC: reclaim only roots died in non-active state
3484       r += __kmp_unregister_root_other_thread(i);
3485     }
3486   }
3487   return r;
3488 }
3489 #endif
3490 
3491 /* This function attempts to create free entries in __kmp_threads and
3492    __kmp_root, and returns the number of free entries generated.
3493 
3494    For Windows* OS static library, the first mechanism used is to reclaim array
3495    entries for root threads that are already dead.
3496 
   On all platforms, expansion is attempted on the arrays __kmp_threads and
   __kmp_root, with appropriate update to __kmp_threads_capacity. Array
   capacity is increased by doubling, clipped to __kmp_sys_max_nth. If the
   threadprivate cache array has been created, it is resized (or
   __kmp_tp_capacity is raised) to match. Synchronization with
   __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3502 
3503    After any dead root reclamation, if the clipping value allows array expansion
3504    to result in the generation of a total of nNeed free slots, the function does
3505    that expansion. If not, nothing is done beyond the possible initial root
3506    thread reclamation.
3507 
3508    If any argument is negative, the behavior is undefined. */
3509 static int __kmp_expand_threads(int nNeed) {
3510   int added = 0;
3511   int minimumRequiredCapacity;
3512   int newCapacity;
3513   kmp_info_t **newThreads;
3514   kmp_root_t **newRoot;
3515 
3516 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3517 // resizing __kmp_threads does not need additional protection if foreign
3518 // threads are present
3519 
3520 #if KMP_OS_WINDOWS && !defined KMP_DYNAMIC_LIB
3521   /* only for Windows static library */
3522   /* reclaim array entries for root threads that are already dead */
3523   added = __kmp_reclaim_dead_roots();
3524 
3525   if (nNeed) {
3526     nNeed -= added;
3527     if (nNeed < 0)
3528       nNeed = 0;
3529   }
3530 #endif
3531   if (nNeed <= 0)
3532     return added;
3533 
3534   // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3535   // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3536   // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3537   // > __kmp_max_nth in one of two ways:
3538   //
3539   // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
  //    may not be reused by another thread, so we may need to increase
3541   //    __kmp_threads_capacity to __kmp_max_nth + 1.
3542   //
3543   // 2) New foreign root(s) are encountered.  We always register new foreign
3544   //    roots. This may cause a smaller # of threads to be allocated at
3545   //    subsequent parallel regions, but the worker threads hang around (and
3546   //    eventually go to sleep) and need slots in the __kmp_threads[] array.
3547   //
3548   // Anyway, that is the reason for moving the check to see if
3549   // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3550   // instead of having it performed here. -BB
3551 
3552   KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3553 
3554   /* compute expansion headroom to check if we can expand */
3555   if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3556     /* possible expansion too small -- give up */
3557     return added;
3558   }
3559   minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3560 
3561   newCapacity = __kmp_threads_capacity;
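  // Double the capacity until it covers the required minimum, clipping at
  // __kmp_sys_max_nth. For example, a capacity of 32 with nNeed == 40 grows
  // 32 -> 64 -> 128 (the first value >= 72).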
3562   do {
3563     newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3564                                                           : __kmp_sys_max_nth;
3565   } while (newCapacity < minimumRequiredCapacity);
3566   newThreads = (kmp_info_t **)__kmp_allocate(
3567       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3568   newRoot =
3569       (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
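  // Both arrays live in a single allocation: newRoot points just past the
  // newCapacity thread pointers, so one __kmp_free() releases them together.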
3570   KMP_MEMCPY(newThreads, __kmp_threads,
3571              __kmp_threads_capacity * sizeof(kmp_info_t *));
3572   KMP_MEMCPY(newRoot, __kmp_root,
3573              __kmp_threads_capacity * sizeof(kmp_root_t *));
3574 
3575   kmp_info_t **temp_threads = __kmp_threads;
3576   *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3577   *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3578   __kmp_free(temp_threads);
3579   added += newCapacity - __kmp_threads_capacity;
3580   *(volatile int *)&__kmp_threads_capacity = newCapacity;
3581 
3582   if (newCapacity > __kmp_tp_capacity) {
3583     __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3584     if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3585       __kmp_threadprivate_resize_cache(newCapacity);
3586     } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3587       *(volatile int *)&__kmp_tp_capacity = newCapacity;
3588     }
3589     __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3590   }
3591 
3592   return added;
3593 }
3594 
3595 /* Register the current thread as a root thread and obtain our gtid. We must
   have the __kmp_initz_lock held at this point. The argument is TRUE only if
   we are the thread that calls from __kmp_do_serial_initialize() */
3598 int __kmp_register_root(int initial_thread) {
3599   kmp_info_t *root_thread;
3600   kmp_root_t *root;
3601   int gtid;
3602   int capacity;
3603   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3604   KA_TRACE(20, ("__kmp_register_root: entered\n"));
3605   KMP_MB();
3606 
  /* 2007-03-02:
     If the initial thread has not yet invoked the OpenMP RTL, and this thread
     is not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity"
     condition does not work as expected -- it may return false (meaning there
     is at least one empty slot in the __kmp_threads array), but it is possible
     that the only free slot is #0, which is reserved for the initial thread
     and so cannot be used for this one. The following code works around this
     bug.

     However, the right solution seems to be not to reserve slot #0 for the
     initial thread, because:
     (1) there is no magic in slot #0,
     (2) we cannot detect the initial thread reliably (the first thread that
        performs serial initialization may not be the real initial thread).
  */
3621   capacity = __kmp_threads_capacity;
3622   if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3623     --capacity;
3624   }
3625 
3626   /* see if there are too many threads */
3627   if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3628     if (__kmp_tp_cached) {
3629       __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3630                   KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3631                   KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3632     } else {
3633       __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3634                   __kmp_msg_null);
3635     }
3636   }
3637 
3638   /* find an available thread slot */
3639   /* Don't reassign the zero slot since we need that to only be used by initial
3640      thread */
3641   for (gtid = (initial_thread ? 0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL;
3642        gtid++)
3643     ;
3644   KA_TRACE(1,
3645            ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3646   KMP_ASSERT(gtid < __kmp_threads_capacity);
3647 
3648   /* update global accounting */
3649   __kmp_all_nth++;
3650   TCW_4(__kmp_nth, __kmp_nth + 1);
3651 
3652   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3653   // numbers of procs, and method #2 (keyed API call) for higher numbers.
3654   if (__kmp_adjust_gtid_mode) {
3655     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3656       if (TCR_4(__kmp_gtid_mode) != 2) {
3657         TCW_4(__kmp_gtid_mode, 2);
3658       }
3659     } else {
3660       if (TCR_4(__kmp_gtid_mode) != 1) {
3661         TCW_4(__kmp_gtid_mode, 1);
3662       }
3663     }
3664   }
3665 
3666 #ifdef KMP_ADJUST_BLOCKTIME
3667   /* Adjust blocktime to zero if necessary            */
3668   /* Middle initialization might not have occurred yet */
3669   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3670     if (__kmp_nth > __kmp_avail_proc) {
3671       __kmp_zero_bt = TRUE;
3672     }
3673   }
3674 #endif /* KMP_ADJUST_BLOCKTIME */
3675 
3676   /* setup this new hierarchy */
3677   if (!(root = __kmp_root[gtid])) {
3678     root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3679     KMP_DEBUG_ASSERT(!root->r.r_root_team);
3680   }
3681 
3682 #if KMP_STATS_ENABLED
3683   // Initialize stats as soon as possible (right after gtid assignment).
3684   __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3685   KMP_START_EXPLICIT_TIMER(OMP_worker_thread_life);
3686   KMP_SET_THREAD_STATE(SERIAL_REGION);
3687   KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3688 #endif
3689   __kmp_initialize_root(root);
3690 
3691   /* setup new root thread structure */
3692   if (root->r.r_uber_thread) {
3693     root_thread = root->r.r_uber_thread;
3694   } else {
3695     root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3696     if (__kmp_storage_map) {
3697       __kmp_print_thread_storage_map(root_thread, gtid);
3698     }
3699     root_thread->th.th_info.ds.ds_gtid = gtid;
3700 #if OMPT_SUPPORT
3701     root_thread->th.ompt_thread_info.thread_data.ptr = NULL;
3702 #endif
3703     root_thread->th.th_root = root;
3704     if (__kmp_env_consistency_check) {
3705       root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3706     }
3707 #if USE_FAST_MEMORY
3708     __kmp_initialize_fast_memory(root_thread);
3709 #endif /* USE_FAST_MEMORY */
3710 
3711 #if KMP_USE_BGET
3712     KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3713     __kmp_initialize_bget(root_thread);
3714 #endif
3715     __kmp_init_random(root_thread); // Initialize random number generator
3716   }
3717 
3718   /* setup the serial team held in reserve by the root thread */
3719   if (!root_thread->th.th_serial_team) {
3720     kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3721     KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3722     root_thread->th.th_serial_team =
3723         __kmp_allocate_team(root, 1, 1,
3724 #if OMPT_SUPPORT
3725                             ompt_data_none, // root parallel id
3726 #endif
3727 #if OMP_40_ENABLED
3728                             proc_bind_default,
3729 #endif
3730                             &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3731   }
3732   KMP_ASSERT(root_thread->th.th_serial_team);
3733   KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3734                 root_thread->th.th_serial_team));
3735 
3736   /* drop root_thread into place */
3737   TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3738 
3739   root->r.r_root_team->t.t_threads[0] = root_thread;
3740   root->r.r_hot_team->t.t_threads[0] = root_thread;
3741   root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
  // AC: the team is created in reserve, not for execution (unused for now).
3743   root_thread->th.th_serial_team->t.t_serialized = 0;
3744   root->r.r_uber_thread = root_thread;
3745 
3746   /* initialize the thread, get it ready to go */
3747   __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3748   TCW_4(__kmp_init_gtid, TRUE);
3749 
3750   /* prepare the master thread for get_gtid() */
3751   __kmp_gtid_set_specific(gtid);
3752 
3753 #if USE_ITT_BUILD
3754   __kmp_itt_thread_name(gtid);
3755 #endif /* USE_ITT_BUILD */
3756 
3757 #ifdef KMP_TDATA_GTID
3758   __kmp_gtid = gtid;
3759 #endif
3760   __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3761   KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3762 
3763   KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3764                 "plain=%u\n",
3765                 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3766                 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3767                 KMP_INIT_BARRIER_STATE));
3768   { // Initialize barrier data.
3769     int b;
3770     for (b = 0; b < bs_last_barrier; ++b) {
3771       root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3772 #if USE_DEBUGGER
3773       root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3774 #endif
3775     }
3776   }
3777   KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3778                    KMP_INIT_BARRIER_STATE);
3779 
3780 #if KMP_AFFINITY_SUPPORTED
3781 #if OMP_40_ENABLED
3782   root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3783   root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3784   root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3785   root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3786 #endif
3787 
3788   if (TCR_4(__kmp_init_middle)) {
3789     __kmp_affinity_set_init_mask(gtid, TRUE);
3790   }
3791 #endif /* KMP_AFFINITY_SUPPORTED */
3792 
3793   __kmp_root_counter++;
3794 
3795 #if OMPT_SUPPORT
3796   if (!initial_thread && ompt_enabled.enabled) {
3797 
3798     ompt_thread_t *root_thread = ompt_get_thread();
3799 
3800     ompt_set_thread_state(root_thread, omp_state_overhead);
3801 
3802     if (ompt_enabled.ompt_callback_thread_begin) {
3803       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3804           ompt_thread_initial, __ompt_get_thread_data_internal());
3805     }
3806     ompt_data_t *task_data;
3807     __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
3808     if (ompt_enabled.ompt_callback_task_create) {
3809       ompt_callbacks.ompt_callback(ompt_callback_task_create)(
3810           NULL, NULL, task_data, ompt_task_initial, 0, NULL);
3811       // initial task has nothing to return to
3812     }
3813 
3814     ompt_set_thread_state(root_thread, omp_state_work_serial);
3815   }
3816 #endif
3817 
3818   KMP_MB();
3819   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3820 
3821   return gtid;
3822 }
3823 
3824 #if KMP_NESTED_HOT_TEAMS
3825 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3826                                 const int max_level) {
3827   int i, n, nth;
3828   kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3829   if (!hot_teams || !hot_teams[level].hot_team) {
3830     return 0;
3831   }
3832   KMP_DEBUG_ASSERT(level < max_level);
3833   kmp_team_t *team = hot_teams[level].hot_team;
3834   nth = hot_teams[level].hot_team_nth;
3835   n = nth - 1; // master is not freed
3836   if (level < max_level - 1) {
3837     for (i = 0; i < nth; ++i) {
3838       kmp_info_t *th = team->t.t_threads[i];
3839       n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3840       if (i > 0 && th->th.th_hot_teams) {
3841         __kmp_free(th->th.th_hot_teams);
3842         th->th.th_hot_teams = NULL;
3843       }
3844     }
3845   }
3846   __kmp_free_team(root, team, NULL);
3847   return n;
3848 }
3849 #endif
3850 
// Resets a root thread and clears its root and hot teams.
3852 // Returns the number of __kmp_threads entries directly and indirectly freed.
3853 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3854   kmp_team_t *root_team = root->r.r_root_team;
3855   kmp_team_t *hot_team = root->r.r_hot_team;
3856   int n = hot_team->t.t_nproc;
3857   int i;
3858 
3859   KMP_DEBUG_ASSERT(!root->r.r_active);
3860 
3861   root->r.r_root_team = NULL;
3862   root->r.r_hot_team = NULL;
3863   // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
  // before calling __kmp_free_team().
3865   __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3866 #if KMP_NESTED_HOT_TEAMS
3867   if (__kmp_hot_teams_max_level >
3868       0) { // need to free nested hot teams and their threads if any
3869     for (i = 0; i < hot_team->t.t_nproc; ++i) {
3870       kmp_info_t *th = hot_team->t.t_threads[i];
3871       if (__kmp_hot_teams_max_level > 1) {
3872         n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3873       }
3874       if (th->th.th_hot_teams) {
3875         __kmp_free(th->th.th_hot_teams);
3876         th->th.th_hot_teams = NULL;
3877       }
3878     }
3879   }
3880 #endif
3881   __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3882 
3883   // Before we can reap the thread, we need to make certain that all other
3884   // threads in the teams that had this root as ancestor have stopped trying to
3885   // steal tasks.
3886   if (__kmp_tasking_mode != tskm_immediate_exec) {
3887     __kmp_wait_to_unref_task_teams();
3888   }
3889 
3890 #if KMP_OS_WINDOWS
3891   /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3892   KA_TRACE(
3893       10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3894            "\n",
3895            (LPVOID) & (root->r.r_uber_thread->th),
3896            root->r.r_uber_thread->th.th_info.ds.ds_thread));
3897   __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3898 #endif /* KMP_OS_WINDOWS */
3899 
3900 #if OMPT_SUPPORT
3901   if (ompt_enabled.ompt_callback_thread_end) {
3902     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3903         &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3904   }
3905 #endif
3906 
3907   TCW_4(__kmp_nth,
3908         __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3909   root->r.r_cg_nthreads--;
3910 
3911   __kmp_reap_thread(root->r.r_uber_thread, 1);
3912 
  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
  // it instead of freeing it.
3915   root->r.r_uber_thread = NULL;
3916   /* mark root as no longer in use */
3917   root->r.r_begin = FALSE;
3918 
3919   return n;
3920 }
3921 
3922 void __kmp_unregister_root_current_thread(int gtid) {
3923   KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
3924   /* this lock should be ok, since unregister_root_current_thread is never
3925      called during an abort, only during a normal close. furthermore, if you
3926      have the forkjoin lock, you should never try to get the initz lock */
3927   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3928   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
3929     KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
3930                   "exiting T#%d\n",
3931                   gtid));
3932     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3933     return;
3934   }
3935   kmp_root_t *root = __kmp_root[gtid];
3936 
3937   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3938   KMP_ASSERT(KMP_UBER_GTID(gtid));
3939   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3940   KMP_ASSERT(root->r.r_active == FALSE);
3941 
3942   KMP_MB();
3943 
3944 #if OMP_45_ENABLED
3945   kmp_info_t *thread = __kmp_threads[gtid];
3946   kmp_team_t *team = thread->th.th_team;
3947   kmp_task_team_t *task_team = thread->th.th_task_team;
3948 
3949   // we need to wait for the proxy tasks before finishing the thread
3950   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
3951 #if OMPT_SUPPORT
3952     // the runtime is shutting down so we won't report any events
3953     thread->th.ompt_thread_info.state = omp_state_undefined;
3954 #endif
3955     __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
3956   }
3957 #endif
3958 
3959   __kmp_reset_root(gtid, root);
3960 
3961   /* free up this thread slot */
3962   __kmp_gtid_set_specific(KMP_GTID_DNE);
3963 #ifdef KMP_TDATA_GTID
3964   __kmp_gtid = KMP_GTID_DNE;
3965 #endif
3966 
3967   KMP_MB();
3968   KC_TRACE(10,
3969            ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
3970 
3971   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3972 }
3973 
3974 #if KMP_OS_WINDOWS
3975 /* __kmp_forkjoin_lock must be already held
3976    Unregisters a root thread that is not the current thread.  Returns the number
3977    of __kmp_threads entries freed as a result. */
3978 static int __kmp_unregister_root_other_thread(int gtid) {
3979   kmp_root_t *root = __kmp_root[gtid];
3980   int r;
3981 
3982   KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
3983   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3984   KMP_ASSERT(KMP_UBER_GTID(gtid));
3985   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3986   KMP_ASSERT(root->r.r_active == FALSE);
3987 
3988   r = __kmp_reset_root(gtid, root);
3989   KC_TRACE(10,
3990            ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
3991   return r;
3992 }
3993 #endif
3994 
3995 #if KMP_DEBUG
3996 void __kmp_task_info() {
3997 
3998   kmp_int32 gtid = __kmp_entry_gtid();
3999   kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4000   kmp_info_t *this_thr = __kmp_threads[gtid];
4001   kmp_team_t *steam = this_thr->th.th_serial_team;
4002   kmp_team_t *team = this_thr->th.th_team;
4003 
4004   __kmp_printf("__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p curtask=%p "
4005                "ptask=%p\n",
4006                gtid, tid, this_thr, team, this_thr->th.th_current_task,
4007                team->t.t_implicit_task_taskdata[tid].td_parent);
4008 }
4009 #endif // KMP_DEBUG
4010 
4011 /* TODO optimize with one big memclr, take out what isn't needed, split
4012    responsibility to workers as much as possible, and delay initialization of
4013    features as much as possible  */
4014 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4015                                   int tid, int gtid) {
4016   /* this_thr->th.th_info.ds.ds_gtid is setup in
4017      kmp_allocate_thread/create_worker.
4018      this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4019   kmp_info_t *master = team->t.t_threads[0];
4020   KMP_DEBUG_ASSERT(this_thr != NULL);
4021   KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4022   KMP_DEBUG_ASSERT(team);
4023   KMP_DEBUG_ASSERT(team->t.t_threads);
4024   KMP_DEBUG_ASSERT(team->t.t_dispatch);
4025   KMP_DEBUG_ASSERT(master);
4026   KMP_DEBUG_ASSERT(master->th.th_root);
4027 
4028   KMP_MB();
4029 
4030   TCW_SYNC_PTR(this_thr->th.th_team, team);
4031 
4032   this_thr->th.th_info.ds.ds_tid = tid;
4033   this_thr->th.th_set_nproc = 0;
4034   if (__kmp_tasking_mode != tskm_immediate_exec)
4035     // When tasking is possible, threads are not safe to reap until they are
4036     // done tasking; this will be set when tasking code is exited in wait
4037     this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4038   else // no tasking --> always safe to reap
4039     this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4040 #if OMP_40_ENABLED
4041   this_thr->th.th_set_proc_bind = proc_bind_default;
4042 #if KMP_AFFINITY_SUPPORTED
4043   this_thr->th.th_new_place = this_thr->th.th_current_place;
4044 #endif
4045 #endif
4046   this_thr->th.th_root = master->th.th_root;
4047 
4048   /* setup the thread's cache of the team structure */
4049   this_thr->th.th_team_nproc = team->t.t_nproc;
4050   this_thr->th.th_team_master = master;
4051   this_thr->th.th_team_serialized = team->t.t_serialized;
4052   TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4053 
4054   KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4055 
4056   KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4057                 tid, gtid, this_thr, this_thr->th.th_current_task));
4058 
4059   __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4060                            team, tid, TRUE);
4061 
4062   KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4063                 tid, gtid, this_thr, this_thr->th.th_current_task));
4064   // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4065   // __kmp_initialize_team()?
4066 
4067   /* TODO no worksharing in speculative threads */
4068   this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4069 
4070   this_thr->th.th_local.this_construct = 0;
4071 
4072   if (!this_thr->th.th_pri_common) {
4073     this_thr->th.th_pri_common =
4074         (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4075     if (__kmp_storage_map) {
4076       __kmp_print_storage_map_gtid(
4077           gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4078           sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4079     }
4080     this_thr->th.th_pri_head = NULL;
4081   }
4082 
4083   /* Initialize dynamic dispatch */
4084   {
4085     volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4086     // Use team max_nproc since this will never change for the team.
4087     size_t disp_size =
4088         sizeof(dispatch_private_info_t) *
4089         (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4090     KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4091                   team->t.t_max_nproc));
4092     KMP_ASSERT(dispatch);
4093     KMP_DEBUG_ASSERT(team->t.t_dispatch);
4094     KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4095 
4096     dispatch->th_disp_index = 0;
4097 #if OMP_45_ENABLED
4098     dispatch->th_doacross_buf_idx = 0;
4099 #endif
4100     if (!dispatch->th_disp_buffer) {
4101       dispatch->th_disp_buffer =
4102           (dispatch_private_info_t *)__kmp_allocate(disp_size);
4103 
4104       if (__kmp_storage_map) {
4105         __kmp_print_storage_map_gtid(
4106             gtid, &dispatch->th_disp_buffer[0],
4107             &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4108                                           ? 1
4109                                           : __kmp_dispatch_num_buffers],
4110             disp_size, "th_%d.th_dispatch.th_disp_buffer "
4111                        "(team_%d.t_dispatch[%d].th_disp_buffer)",
4112             gtid, team->t.t_id, gtid);
4113       }
4114     } else {
4115       memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4116     }
4117 
4118     dispatch->th_dispatch_pr_current = 0;
4119     dispatch->th_dispatch_sh_current = 0;
4120 
4121     dispatch->th_deo_fcn = 0; /* ORDERED     */
4122     dispatch->th_dxo_fcn = 0; /* END ORDERED */
4123   }
4124 
4125   this_thr->th.th_next_pool = NULL;
4126 
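  // Lazily allocate the per-thread task-state stack (initial depth of 4
  // entries), zero-initialized.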
4127   if (!this_thr->th.th_task_state_memo_stack) {
4128     size_t i;
4129     this_thr->th.th_task_state_memo_stack =
4130         (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4131     this_thr->th.th_task_state_top = 0;
4132     this_thr->th.th_task_state_stack_sz = 4;
4133     for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4134          ++i) // zero init the stack
4135       this_thr->th.th_task_state_memo_stack[i] = 0;
4136   }
4137 
4138   KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4139   KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4140 
4141   KMP_MB();
4142 }
4143 
/* Allocate a new thread for the requesting team. This is only called from
   within a forkjoin critical section. We will first try to get an available
   thread from the thread pool. If none is available, we will fork a new one,
   assuming we are able to create one. This should be assured, as the caller
   should have checked for that first. */
4149 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4150                                   int new_tid) {
4151   kmp_team_t *serial_team;
4152   kmp_info_t *new_thr;
4153   int new_gtid;
4154 
4155   KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4156   KMP_DEBUG_ASSERT(root && team);
4157 #if !KMP_NESTED_HOT_TEAMS
4158   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4159 #endif
4160   KMP_MB();
4161 
4162   /* first, try to get one from the thread pool */
4163   if (__kmp_thread_pool) {
4164 
4165     new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4166     __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4167     if (new_thr == __kmp_thread_pool_insert_pt) {
4168       __kmp_thread_pool_insert_pt = NULL;
4169     }
4170     TCW_4(new_thr->th.th_in_pool, FALSE);
4171     // Don't touch th_active_in_pool or th_active.
4172     // The worker thread adjusts those flags as it sleeps/awakens.
4173     __kmp_thread_pool_nth--;
4174 
4175     KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4176                   __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4177     KMP_ASSERT(!new_thr->th.th_team);
4178     KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4179     KMP_DEBUG_ASSERT(__kmp_thread_pool_nth >= 0);
4180 
4181     /* setup the thread structure */
4182     __kmp_initialize_info(new_thr, team, new_tid,
4183                           new_thr->th.th_info.ds.ds_gtid);
4184     KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4185 
4186     TCW_4(__kmp_nth, __kmp_nth + 1);
4187     root->r.r_cg_nthreads++;
4188 
4189     new_thr->th.th_task_state = 0;
4190     new_thr->th.th_task_state_top = 0;
4191     new_thr->th.th_task_state_stack_sz = 4;
4192 
4193 #ifdef KMP_ADJUST_BLOCKTIME
4194     /* Adjust blocktime back to zero if necessary */
4195     /* Middle initialization might not have occurred yet */
4196     if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4197       if (__kmp_nth > __kmp_avail_proc) {
4198         __kmp_zero_bt = TRUE;
4199       }
4200     }
4201 #endif /* KMP_ADJUST_BLOCKTIME */
4202 
4203 #if KMP_DEBUG
    // If the thread entered the pool via __kmp_free_thread, wait_flag should
    // not be KMP_BARRIER_PARENT_FLAG.
4206     int b;
4207     kmp_balign_t *balign = new_thr->th.th_bar;
4208     for (b = 0; b < bs_last_barrier; ++b)
4209       KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4210 #endif
4211 
4212     KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4213                   __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4214 
4215     KMP_MB();
4216     return new_thr;
4217   }
4218 
  /* no, we'll fork a new one */
4220   KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4221   KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4222 
4223 #if KMP_USE_MONITOR
4224   // If this is the first worker thread the RTL is creating, then also
4225   // launch the monitor thread.  We try to do this as early as possible.
4226   if (!TCR_4(__kmp_init_monitor)) {
4227     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4228     if (!TCR_4(__kmp_init_monitor)) {
4229       KF_TRACE(10, ("before __kmp_create_monitor\n"));
4230       TCW_4(__kmp_init_monitor, 1);
4231       __kmp_create_monitor(&__kmp_monitor);
4232       KF_TRACE(10, ("after __kmp_create_monitor\n"));
4233 #if KMP_OS_WINDOWS
      // AC: wait until the monitor has started. This is a fix for CQ232808.
      // The reason is that if the library is loaded/unloaded in a loop with
      // small (parallel) work in between, then there is a high probability
      // that the monitor thread starts only after the library has shut down.
      // At shutdown it is too late to cope with the problem, because when the
      // master is in DllMain (process detach) the monitor has no chance to
      // start (it is blocked), and the master has no means to inform the
      // monitor that the library has gone, because all the memory which the
      // monitor can access is going to be released/reset.
4243       while (TCR_4(__kmp_init_monitor) < 2) {
4244         KMP_YIELD(TRUE);
4245       }
4246       KF_TRACE(10, ("after monitor thread has started\n"));
4247 #endif
4248     }
4249     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4250   }
4251 #endif
4252 
4253   KMP_MB();
4254   for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) {
4255     KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4256   }
4257 
4258   /* allocate space for it. */
4259   new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4260 
4261   TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4262 
4263   if (__kmp_storage_map) {
4264     __kmp_print_thread_storage_map(new_thr, new_gtid);
4265   }
4266 
4267   // add the reserve serialized team, initialized from the team's master thread
4268   {
4269     kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4270     KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4271     new_thr->th.th_serial_team = serial_team =
4272         (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4273 #if OMPT_SUPPORT
4274                                           ompt_data_none, // root parallel id
4275 #endif
4276 #if OMP_40_ENABLED
4277                                           proc_bind_default,
4278 #endif
4279                                           &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
4280   }
4281   KMP_ASSERT(serial_team);
  serial_team->t.t_serialized = 0; // AC: the team is created in reserve, not
  // for execution (it is unused for now).
4284   serial_team->t.t_threads[0] = new_thr;
4285   KF_TRACE(10,
4286            ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4287             new_thr));
4288 
4289   /* setup the thread structures */
4290   __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4291 
4292 #if USE_FAST_MEMORY
4293   __kmp_initialize_fast_memory(new_thr);
4294 #endif /* USE_FAST_MEMORY */
4295 
4296 #if KMP_USE_BGET
4297   KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4298   __kmp_initialize_bget(new_thr);
4299 #endif
4300 
4301   __kmp_init_random(new_thr); // Initialize random number generator
4302 
4303   /* Initialize these only once when thread is grabbed for a team allocation */
4304   KA_TRACE(20,
4305            ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4306             __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4307 
4308   int b;
4309   kmp_balign_t *balign = new_thr->th.th_bar;
4310   for (b = 0; b < bs_last_barrier; ++b) {
4311     balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4312     balign[b].bb.team = NULL;
4313     balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4314     balign[b].bb.use_oncore_barrier = 0;
4315   }
4316 
4317   new_thr->th.th_spin_here = FALSE;
4318   new_thr->th.th_next_waiting = 0;
4319 
4320 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4321   new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4322   new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4323   new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4324   new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4325 #endif
4326 
4327   TCW_4(new_thr->th.th_in_pool, FALSE);
4328   new_thr->th.th_active_in_pool = FALSE;
4329   TCW_4(new_thr->th.th_active, TRUE);
4330 
4331   /* adjust the global counters */
4332   __kmp_all_nth++;
4333   __kmp_nth++;
4334 
4335   root->r.r_cg_nthreads++;
4336 
4337   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4338   // numbers of procs, and method #2 (keyed API call) for higher numbers.
4339   if (__kmp_adjust_gtid_mode) {
4340     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4341       if (TCR_4(__kmp_gtid_mode) != 2) {
4342         TCW_4(__kmp_gtid_mode, 2);
4343       }
4344     } else {
4345       if (TCR_4(__kmp_gtid_mode) != 1) {
4346         TCW_4(__kmp_gtid_mode, 1);
4347       }
4348     }
4349   }
4350 
4351 #ifdef KMP_ADJUST_BLOCKTIME
4352   /* Adjust blocktime back to zero if necessary       */
4353   /* Middle initialization might not have occurred yet */
4354   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4355     if (__kmp_nth > __kmp_avail_proc) {
4356       __kmp_zero_bt = TRUE;
4357     }
4358   }
4359 #endif /* KMP_ADJUST_BLOCKTIME */
4360 
4361   /* actually fork it and create the new worker thread */
4362   KF_TRACE(
4363       10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4364   __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4365   KF_TRACE(10,
4366            ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4367 
4368   KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4369                 new_gtid));
4370   KMP_MB();
4371   return new_thr;
4372 }
4373 
/* Reinitialize team for reuse.
   The hot team code calls this routine at every fork barrier, so the EPCC
   barrier tests are extremely sensitive to changes in it, esp. writes to the
   team struct, which cause a cache invalidation in all threads.
   IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4379 static void __kmp_reinitialize_team(kmp_team_t *team,
4380                                     kmp_internal_control_t *new_icvs,
4381                                     ident_t *loc) {
4382   KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4383                 team->t.t_threads[0], team));
4384   KMP_DEBUG_ASSERT(team && new_icvs);
4385   KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4386   KMP_CHECK_UPDATE(team->t.t_ident, loc);
4387 
4388   KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4389   // Copy ICVs to the master thread's implicit taskdata
4390   __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4391   copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4392 
4393   KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4394                 team->t.t_threads[0], team));
4395 }
4396 
4397 /* Initialize the team data structure.
4398    This assumes the t_threads and t_max_nproc are already set.
4399    Also, we don't touch the arguments */
4400 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4401                                   kmp_internal_control_t *new_icvs,
4402                                   ident_t *loc) {
4403   KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4404 
4405   /* verify */
4406   KMP_DEBUG_ASSERT(team);
4407   KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4408   KMP_DEBUG_ASSERT(team->t.t_threads);
4409   KMP_MB();
4410 
4411   team->t.t_master_tid = 0; /* not needed */
4412   /* team->t.t_master_bar;        not needed */
4413   team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4414   team->t.t_nproc = new_nproc;
4415 
4416   /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
4417   team->t.t_next_pool = NULL;
4418   /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4419    * up hot team */
4420 
4421   TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4422   team->t.t_invoke = NULL; /* not needed */
4423 
4424   // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
4425   team->t.t_sched.sched = new_icvs->sched.sched;
4426 
4427 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4428   team->t.t_fp_control_saved = FALSE; /* not needed */
4429   team->t.t_x87_fpu_control_word = 0; /* not needed */
4430   team->t.t_mxcsr = 0; /* not needed */
4431 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4432 
4433   team->t.t_construct = 0;
4434 
4435   team->t.t_ordered.dt.t_value = 0;
4436   team->t.t_master_active = FALSE;
4437 
4438   memset(&team->t.t_taskq, '\0', sizeof(kmp_taskq_t));
4439 
4440 #ifdef KMP_DEBUG
4441   team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4442 #endif
4443   team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4444 
4445   team->t.t_control_stack_top = NULL;
4446 
4447   __kmp_reinitialize_team(team, new_icvs, loc);
4448 
4449   KMP_MB();
4450   KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4451 }
4452 
4453 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
/* Sets the full mask for the thread; if old_mask is non-NULL, the thread's
   previous mask is returned in it. No changes to internal structures. */
4455 static void
4456 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4457   if (KMP_AFFINITY_CAPABLE()) {
4458     int status;
4459     if (old_mask != NULL) {
4460       status = __kmp_get_system_affinity(old_mask, TRUE);
4461       int error = errno;
4462       if (status != 0) {
4463         __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4464                     __kmp_msg_null);
4465       }
4466     }
4467     __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4468   }
4469 }
4470 #endif
4471 
4472 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4473 
// __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
// It calculates the worker + master thread's partition based upon the parent
// thread's partition, and binds each worker to a place in its partition.
// The master thread's partition should already include its current binding.
4478 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
  // Copy the master thread's place partition to the team struct
4480   kmp_info_t *master_th = team->t.t_threads[0];
4481   KMP_DEBUG_ASSERT(master_th != NULL);
4482   kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4483   int first_place = master_th->th.th_first_place;
4484   int last_place = master_th->th.th_last_place;
4485   int masters_place = master_th->th.th_current_place;
4486   team->t.t_first_place = first_place;
4487   team->t.t_last_place = last_place;
4488 
4489   KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4490                 "bound to place %d partition = [%d,%d]\n",
4491                 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4492                 team->t.t_id, masters_place, first_place, last_place));
4493 
4494   switch (proc_bind) {
4495 
4496   case proc_bind_default:
    // Serial teams might have the proc_bind policy set to proc_bind_default.
    // It doesn't matter, as we don't rebind the master thread for any
    // proc_bind policy.
4499     KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4500     break;
4501 
4502   case proc_bind_master: {
4503     int f;
4504     int n_th = team->t.t_nproc;
4505     for (f = 1; f < n_th; f++) {
4506       kmp_info_t *th = team->t.t_threads[f];
4507       KMP_DEBUG_ASSERT(th != NULL);
4508       th->th.th_first_place = first_place;
4509       th->th.th_last_place = last_place;
4510       th->th.th_new_place = masters_place;
4511 
4512       KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
4513                      "partition = [%d,%d]\n",
4514                      __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4515                      f, masters_place, first_place, last_place));
4516     }
4517   } break;
4518 
4519   case proc_bind_close: {
4520     int f;
4521     int n_th = team->t.t_nproc;
4522     int n_places;
4523     if (first_place <= last_place) {
4524       n_places = last_place - first_place + 1;
4525     } else {
4526       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4527     }
4528     if (n_th <= n_places) {
4529       int place = masters_place;
4530       for (f = 1; f < n_th; f++) {
4531         kmp_info_t *th = team->t.t_threads[f];
4532         KMP_DEBUG_ASSERT(th != NULL);
4533 
4534         if (place == last_place) {
4535           place = first_place;
4536         } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4537           place = 0;
4538         } else {
4539           place++;
4540         }
4541         th->th.th_first_place = first_place;
4542         th->th.th_last_place = last_place;
4543         th->th.th_new_place = place;
4544 
4545         KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4546                        "partition = [%d,%d]\n",
4547                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4548                        team->t.t_id, f, place, first_place, last_place));
4549       }
4550     } else {
4551       int S, rem, gap, s_count;
4552       S = n_th / n_places;
4553       s_count = 0;
4554       rem = n_th - (S * n_places);
4555       gap = rem > 0 ? n_places / rem : n_places;
4556       int place = masters_place;
4557       int gap_ct = gap;
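      // For example, assuming the master is on the first place of a 4-place
      // partition and n_th = 10: S = 2, rem = 2, gap = 2, and the loop below
      // assigns 3, 2, 3 and 2 threads to the four places in order, with the
      // place cursor ending back at the master's place.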
4558       for (f = 0; f < n_th; f++) {
4559         kmp_info_t *th = team->t.t_threads[f];
4560         KMP_DEBUG_ASSERT(th != NULL);
4561 
4562         th->th.th_first_place = first_place;
4563         th->th.th_last_place = last_place;
4564         th->th.th_new_place = place;
4565         s_count++;
4566 
4567         if ((s_count == S) && rem && (gap_ct == gap)) {
4568           // do nothing, add an extra thread to place on next iteration
4569         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4570           // we added an extra thread to this place; move to next place
4571           if (place == last_place) {
4572             place = first_place;
4573           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4574             place = 0;
4575           } else {
4576             place++;
4577           }
4578           s_count = 0;
4579           gap_ct = 1;
4580           rem--;
4581         } else if (s_count == S) { // place full; don't add extra
4582           if (place == last_place) {
4583             place = first_place;
4584           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4585             place = 0;
4586           } else {
4587             place++;
4588           }
4589           gap_ct++;
4590           s_count = 0;
4591         }
4592 
4593         KA_TRACE(100,
4594                  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4595                   "partition = [%d,%d]\n",
4596                   __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4597                   th->th.th_new_place, first_place, last_place));
4598       }
4599       KMP_DEBUG_ASSERT(place == masters_place);
4600     }
4601   } break;
4602 
4603   case proc_bind_spread: {
4604     int f;
4605     int n_th = team->t.t_nproc;
4606     int n_places;
4607     int thidx;
4608     if (first_place <= last_place) {
4609       n_places = last_place - first_place + 1;
4610     } else {
4611       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4612     }
4613     if (n_th <= n_places) {
4614       int place = -1;
4615 
4616       if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4617         int S = n_places / n_th;
4618         int s_count, rem, gap, gap_ct;
4619 
4620         place = masters_place;
4621         rem = n_places - n_th * S;
4622         gap = rem ? n_th / rem : 1;
4623         gap_ct = gap;
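        // For example, assuming the master is bound to place 0 of a 7-place
        // partition [0,6] and n_th = 3: S = 2, rem = 1, gap = 3, and the loop
        // below carves out sub-partitions [0,2], [3,4] and [5,6], binding
        // each thread to the first place of its sub-partition.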
4624         thidx = n_th;
4625         if (update_master_only == 1)
4626           thidx = 1;
4627         for (f = 0; f < thidx; f++) {
4628           kmp_info_t *th = team->t.t_threads[f];
4629           KMP_DEBUG_ASSERT(th != NULL);
4630 
4631           th->th.th_first_place = place;
4632           th->th.th_new_place = place;
4633           s_count = 1;
4634           while (s_count < S) {
4635             if (place == last_place) {
4636               place = first_place;
4637             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4638               place = 0;
4639             } else {
4640               place++;
4641             }
4642             s_count++;
4643           }
4644           if (rem && (gap_ct == gap)) {
4645             if (place == last_place) {
4646               place = first_place;
4647             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4648               place = 0;
4649             } else {
4650               place++;
4651             }
4652             rem--;
4653             gap_ct = 0;
4654           }
4655           th->th.th_last_place = place;
4656           gap_ct++;
4657 
4658           if (place == last_place) {
4659             place = first_place;
4660           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4661             place = 0;
4662           } else {
4663             place++;
4664           }
4665 
4666           KA_TRACE(100,
4667                    ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4668                     "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4669                     __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4670                     f, th->th.th_new_place, th->th.th_first_place,
4671                     th->th.th_last_place, __kmp_affinity_num_masks));
4672         }
4673       } else {
        /* With a uniform space of available computation places, we can create
           T partitions of roughly P/T places each and put each thread into
           the first place of its partition. */
4677         double current = static_cast<double>(masters_place);
4678         double spacing =
4679             (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
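        // For example, assuming masters_place = 0, n_places = 8 and n_th = 3:
        // spacing = 3.0, yielding partitions [0,2], [3,5] and [6,7] (the last
        // clamped to n_places - 1), with each thread bound to the first place
        // of its partition.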
4680         int first, last;
4681         kmp_info_t *th;
4682 
4683         thidx = n_th + 1;
4684         if (update_master_only == 1)
4685           thidx = 1;
4686         for (f = 0; f < thidx; f++) {
4687           first = static_cast<int>(current);
4688           last = static_cast<int>(current + spacing) - 1;
4689           KMP_DEBUG_ASSERT(last >= first);
4690           if (first >= n_places) {
4691             if (masters_place) {
4692               first -= n_places;
4693               last -= n_places;
4694               if (first == (masters_place + 1)) {
4695                 KMP_DEBUG_ASSERT(f == n_th);
4696                 first--;
4697               }
4698               if (last == masters_place) {
4699                 KMP_DEBUG_ASSERT(f == (n_th - 1));
4700                 last--;
4701               }
4702             } else {
4703               KMP_DEBUG_ASSERT(f == n_th);
4704               first = 0;
4705               last = 0;
4706             }
4707           }
4708           if (last >= n_places) {
4709             last = (n_places - 1);
4710           }
4711           place = first;
4712           current += spacing;
4713           if (f < n_th) {
4714             KMP_DEBUG_ASSERT(0 <= first);
4715             KMP_DEBUG_ASSERT(n_places > first);
4716             KMP_DEBUG_ASSERT(0 <= last);
4717             KMP_DEBUG_ASSERT(n_places > last);
4718             KMP_DEBUG_ASSERT(last_place >= first_place);
4719             th = team->t.t_threads[f];
4720             KMP_DEBUG_ASSERT(th);
4721             th->th.th_first_place = first;
4722             th->th.th_new_place = place;
4723             th->th.th_last_place = last;
4724 
4725             KA_TRACE(100,
4726                      ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4727                       "partition = [%d,%d], spacing = %.4f\n",
4728                       __kmp_gtid_from_thread(team->t.t_threads[f]),
4729                       team->t.t_id, f, th->th.th_new_place,
4730                       th->th.th_first_place, th->th.th_last_place, spacing));
4731           }
4732         }
4733       }
4734       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4735     } else {
4736       int S, rem, gap, s_count;
4737       S = n_th / n_places;
4738       s_count = 0;
4739       rem = n_th - (S * n_places);
4740       gap = rem > 0 ? n_places / rem : n_places;
4741       int place = masters_place;
4742       int gap_ct = gap;
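      // Same round-robin distribution as the overloaded proc_bind_close case
      // above, except that each thread's partition is narrowed to the single
      // place it is bound to (th_first_place == th_last_place == place).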
4743       thidx = n_th;
4744       if (update_master_only == 1)
4745         thidx = 1;
4746       for (f = 0; f < thidx; f++) {
4747         kmp_info_t *th = team->t.t_threads[f];
4748         KMP_DEBUG_ASSERT(th != NULL);
4749 
4750         th->th.th_first_place = place;
4751         th->th.th_last_place = place;
4752         th->th.th_new_place = place;
4753         s_count++;
4754 
4755         if ((s_count == S) && rem && (gap_ct == gap)) {
4756           // do nothing, add an extra thread to place on next iteration
4757         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4758           // we added an extra thread to this place; move on to next place
4759           if (place == last_place) {
4760             place = first_place;
4761           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4762             place = 0;
4763           } else {
4764             place++;
4765           }
4766           s_count = 0;
4767           gap_ct = 1;
4768           rem--;
4769         } else if (s_count == S) { // place is full; don't add extra thread
4770           if (place == last_place) {
4771             place = first_place;
4772           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4773             place = 0;
4774           } else {
4775             place++;
4776           }
4777           gap_ct++;
4778           s_count = 0;
4779         }
4780 
4781         KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4782                        "partition = [%d,%d]\n",
4783                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4784                        team->t.t_id, f, th->th.th_new_place,
4785                        th->th.th_first_place, th->th.th_last_place));
4786       }
4787       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4788     }
4789   } break;
4790 
4791   default:
4792     break;
4793   }
4794 
4795   KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4796 }
4797 
4798 #endif /* OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED */
4799 
4800 /* allocate a new team data structure to use.  take one off of the free pool if
4801    available */
4802 kmp_team_t *
4803 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4804 #if OMPT_SUPPORT
4805                     ompt_data_t ompt_parallel_data,
4806 #endif
4807 #if OMP_40_ENABLED
4808                     kmp_proc_bind_t new_proc_bind,
4809 #endif
4810                     kmp_internal_control_t *new_icvs,
4811                     int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4812   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4813   int f;
4814   kmp_team_t *team;
4815   int use_hot_team = !root->r.r_active;
4816   int level = 0;
4817 
4818   KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4819   KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4820   KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4821   KMP_MB();
4822 
4823 #if KMP_NESTED_HOT_TEAMS
4824   kmp_hot_team_ptr_t *hot_teams;
4825   if (master) {
4826     team = master->th.th_team;
4827     level = team->t.t_active_level;
4828     if (master->th.th_teams_microtask) { // in teams construct?
4829       if (master->th.th_teams_size.nteams > 1 &&
4830           ( // #teams > 1
4831               team->t.t_pkfn ==
4832                   (microtask_t)__kmp_teams_master || // inner fork of the teams
4833               master->th.th_teams_level <
4834                   team->t.t_level)) { // or nested parallel inside the teams
        ++level; // do not increment if #teams==1 or for the outer fork of the
        // teams; increment otherwise
4837       }
4838     }
4839     hot_teams = master->th.th_hot_teams;
4840     if (level < __kmp_hot_teams_max_level && hot_teams &&
4841         hot_teams[level]
4842             .hot_team) { // hot team has already been allocated for given level
4843       use_hot_team = 1;
4844     } else {
4845       use_hot_team = 0;
4846     }
4847   }
4848 #endif
4849   // Optimization to use a "hot" team
4850   if (use_hot_team && new_nproc > 1) {
4851     KMP_DEBUG_ASSERT(new_nproc == max_nproc);
4852 #if KMP_NESTED_HOT_TEAMS
4853     team = hot_teams[level].hot_team;
4854 #else
4855     team = root->r.r_hot_team;
4856 #endif
4857 #if KMP_DEBUG
4858     if (__kmp_tasking_mode != tskm_immediate_exec) {
4859       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
4860                     "task_team[1] = %p before reinit\n",
4861                     team->t.t_task_team[0], team->t.t_task_team[1]));
4862     }
4863 #endif
4864 
4865     // Has the number of threads changed?
4866     /* Let's assume the most common case is that the number of threads is
4867        unchanged, and put that case first. */
4868     if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
4869       KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
4870       // This case can mean that omp_set_num_threads() was called and the hot
4871       // team size was already reduced, so we check the special flag
4872       if (team->t.t_size_changed == -1) {
4873         team->t.t_size_changed = 1;
4874       } else {
4875         KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
4876       }
4877 
4878       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4879       kmp_r_sched_t new_sched = new_icvs->sched;
4880       // set master's schedule as new run-time schedule
4881       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
4882 
4883       __kmp_reinitialize_team(team, new_icvs,
4884                               root->r.r_uber_thread->th.th_ident);
4885 
4886       KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
4887                     team->t.t_threads[0], team));
4888       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
4889 
4890 #if OMP_40_ENABLED
4891 #if KMP_AFFINITY_SUPPORTED
4892       if ((team->t.t_size_changed == 0) &&
4893           (team->t.t_proc_bind == new_proc_bind)) {
4894         if (new_proc_bind == proc_bind_spread) {
4895           __kmp_partition_places(
4896               team, 1); // add flag to update only master for spread
4897         }
4898         KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
4899                        "proc_bind = %d, partition = [%d,%d]\n",
4900                        team->t.t_id, new_proc_bind, team->t.t_first_place,
4901                        team->t.t_last_place));
4902       } else {
4903         KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4904         __kmp_partition_places(team);
4905       }
4906 #else
4907       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4908 #endif /* KMP_AFFINITY_SUPPORTED */
4909 #endif /* OMP_40_ENABLED */
4910     } else if (team->t.t_nproc > new_nproc) {
4911       KA_TRACE(20,
4912                ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
4913                 new_nproc));
4914 
4915       team->t.t_size_changed = 1;
4916 #if KMP_NESTED_HOT_TEAMS
4917       if (__kmp_hot_teams_mode == 0) {
4918         // AC: saved number of threads should correspond to team's value in this
4919         // mode, can be bigger in mode 1, when hot team has threads in reserve
4920         KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
4921         hot_teams[level].hot_team_nth = new_nproc;
4922 #endif // KMP_NESTED_HOT_TEAMS
4923         /* release the extra threads we don't need any more */
4924         for (f = new_nproc; f < team->t.t_nproc; f++) {
4925           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
4926           if (__kmp_tasking_mode != tskm_immediate_exec) {
4927             // When decreasing team size, threads no longer in the team should
4928             // unref task team.
4929             team->t.t_threads[f]->th.th_task_team = NULL;
4930           }
4931           __kmp_free_thread(team->t.t_threads[f]);
4932           team->t.t_threads[f] = NULL;
4933         }
4934 #if KMP_NESTED_HOT_TEAMS
4935       } // (__kmp_hot_teams_mode == 0)
4936       else {
4937         // When keeping extra threads in team, switch threads to wait on own
4938         // b_go flag
4939         for (f = new_nproc; f < team->t.t_nproc; ++f) {
4940           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
4941           kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
4942           for (int b = 0; b < bs_last_barrier; ++b) {
4943             if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
4944               balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
4945             }
4946             KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
4947           }
4948         }
4949       }
4950 #endif // KMP_NESTED_HOT_TEAMS
4951       team->t.t_nproc = new_nproc;
4952       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4953       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
4954       __kmp_reinitialize_team(team, new_icvs,
4955                               root->r.r_uber_thread->th.th_ident);
4956 
4957       /* update the remaining threads */
4958       for (f = 0; f < new_nproc; ++f) {
4959         team->t.t_threads[f]->th.th_team_nproc = new_nproc;
4960       }
4961       // restore the current task state of the master thread: should be the
4962       // implicit task
4963       KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
4964                     team->t.t_threads[0], team));
4965 
4966       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
4967 
4968 #ifdef KMP_DEBUG
4969       for (f = 0; f < team->t.t_nproc; f++) {
4970         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
4971                          team->t.t_threads[f]->th.th_team_nproc ==
4972                              team->t.t_nproc);
4973       }
4974 #endif
4975 
4976 #if OMP_40_ENABLED
4977       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4978 #if KMP_AFFINITY_SUPPORTED
4979       __kmp_partition_places(team);
4980 #endif
4981 #endif
4982     } else { // team->t.t_nproc < new_nproc
4983 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4984       kmp_affin_mask_t *old_mask;
4985       if (KMP_AFFINITY_CAPABLE()) {
4986         KMP_CPU_ALLOC(old_mask);
4987       }
4988 #endif
4989 
4990       KA_TRACE(20,
4991                ("__kmp_allocate_team: increasing hot team thread count to %d\n",
4992                 new_nproc));
4993 
4994       team->t.t_size_changed = 1;
4995 
4996 #if KMP_NESTED_HOT_TEAMS
4997       int avail_threads = hot_teams[level].hot_team_nth;
4998       if (new_nproc < avail_threads)
4999         avail_threads = new_nproc;
5000       kmp_info_t **other_threads = team->t.t_threads;
5001       for (f = team->t.t_nproc; f < avail_threads; ++f) {
5002         // Adjust barrier data of reserved threads (if any) of the team
5003         // Other data will be set in __kmp_initialize_info() below.
5004         int b;
5005         kmp_balign_t *balign = other_threads[f]->th.th_bar;
5006         for (b = 0; b < bs_last_barrier; ++b) {
5007           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5008           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5009 #if USE_DEBUGGER
5010           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5011 #endif
5012         }
5013       }
5014       if (hot_teams[level].hot_team_nth >= new_nproc) {
        // We have all needed threads in reserve, no need to allocate any.
        // This is only possible in mode 1; there cannot be reserved threads in
        // mode 0.
5017         KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5018         team->t.t_nproc = new_nproc; // just get reserved threads involved
5019       } else {
5020         // we may have some threads in reserve, but not enough
5021         team->t.t_nproc =
5022             hot_teams[level]
5023                 .hot_team_nth; // get reserved threads involved if any
5024         hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5025 #endif // KMP_NESTED_HOT_TEAMS
5026         if (team->t.t_max_nproc < new_nproc) {
5027           /* reallocate larger arrays */
5028           __kmp_reallocate_team_arrays(team, new_nproc);
5029           __kmp_reinitialize_team(team, new_icvs, NULL);
5030         }
5031 
5032 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
        /* Temporarily set the full mask for the master thread before creating
           the workers. The reason is that workers inherit the affinity from
           the master, so if a lot of workers are created quickly on a single
           core, they may not get a chance to set their own affinity for a
           long time. */
5037         __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5038 #endif
5039 
5040         /* allocate new threads for the hot team */
5041         for (f = team->t.t_nproc; f < new_nproc; f++) {
5042           kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5043           KMP_DEBUG_ASSERT(new_worker);
5044           team->t.t_threads[f] = new_worker;
5045 
          KA_TRACE(20,
                   ("__kmp_allocate_team: team %d init T#%d(%d:%d) arrived: "
                    "join=%llu, plain=%llu\n",
                    team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
                    team->t.t_bar[bs_forkjoin_barrier].b_arrived,
                    team->t.t_bar[bs_plain_barrier].b_arrived));
5052 
5053           { // Initialize barrier data for new threads.
5054             int b;
5055             kmp_balign_t *balign = new_worker->th.th_bar;
5056             for (b = 0; b < bs_last_barrier; ++b) {
5057               balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5058               KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5059                                KMP_BARRIER_PARENT_FLAG);
5060 #if USE_DEBUGGER
5061               balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5062 #endif
5063             }
5064           }
5065         }
5066 
5067 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5068         if (KMP_AFFINITY_CAPABLE()) {
5069           /* Restore initial master thread's affinity mask */
5070           __kmp_set_system_affinity(old_mask, TRUE);
5071           KMP_CPU_FREE(old_mask);
5072         }
5073 #endif
5074 #if KMP_NESTED_HOT_TEAMS
5075       } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5076 #endif // KMP_NESTED_HOT_TEAMS
      /* make sure everyone is synchronized */
5078       int old_nproc = team->t.t_nproc; // save old value and use to update only
5079       // new threads below
5080       __kmp_initialize_team(team, new_nproc, new_icvs,
5081                             root->r.r_uber_thread->th.th_ident);
5082 
5083       /* reinitialize the threads */
5084       KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5085       for (f = 0; f < team->t.t_nproc; ++f)
5086         __kmp_initialize_info(team->t.t_threads[f], team, f,
5087                               __kmp_gtid_from_tid(f, team));
5088       if (level) { // set th_task_state for new threads in nested hot team
5089         // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5090         // only need to set the th_task_state for the new threads. th_task_state
5091         // for master thread will not be accurate until after this in
5092         // __kmp_fork_call(), so we look to the master's memo_stack to get the
5093         // correct value.
5094         for (f = old_nproc; f < team->t.t_nproc; ++f)
5095           team->t.t_threads[f]->th.th_task_state =
5096               team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5097       } else { // set th_task_state for new threads in non-nested hot team
5098         int old_state =
5099             team->t.t_threads[0]->th.th_task_state; // copy master's state
5100         for (f = old_nproc; f < team->t.t_nproc; ++f)
5101           team->t.t_threads[f]->th.th_task_state = old_state;
5102       }
5103 
5104 #ifdef KMP_DEBUG
5105       for (f = 0; f < team->t.t_nproc; ++f) {
5106         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5107                          team->t.t_threads[f]->th.th_team_nproc ==
5108                              team->t.t_nproc);
5109       }
5110 #endif
5111 
5112 #if OMP_40_ENABLED
5113       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5114 #if KMP_AFFINITY_SUPPORTED
5115       __kmp_partition_places(team);
5116 #endif
5117 #endif
5118     } // Check changes in number of threads
5119 
5120 #if OMP_40_ENABLED
5121     kmp_info_t *master = team->t.t_threads[0];
5122     if (master->th.th_teams_microtask) {
5123       for (f = 1; f < new_nproc; ++f) {
5124         // propagate teams construct specific info to workers
5125         kmp_info_t *thr = team->t.t_threads[f];
5126         thr->th.th_teams_microtask = master->th.th_teams_microtask;
5127         thr->th.th_teams_level = master->th.th_teams_level;
5128         thr->th.th_teams_size = master->th.th_teams_size;
5129       }
5130     }
5131 #endif /* OMP_40_ENABLED */
5132 #if KMP_NESTED_HOT_TEAMS
5133     if (level) {
5134       // Sync barrier state for nested hot teams, not needed for outermost hot
5135       // team.
5136       for (f = 1; f < new_nproc; ++f) {
5137         kmp_info_t *thr = team->t.t_threads[f];
5138         int b;
5139         kmp_balign_t *balign = thr->th.th_bar;
5140         for (b = 0; b < bs_last_barrier; ++b) {
5141           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5142           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5143 #if USE_DEBUGGER
5144           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5145 #endif
5146         }
5147       }
5148     }
5149 #endif // KMP_NESTED_HOT_TEAMS
5150 
5151     /* reallocate space for arguments if necessary */
5152     __kmp_alloc_argv_entries(argc, team, TRUE);
5153     KMP_CHECK_UPDATE(team->t.t_argc, argc);
5154     // The hot team re-uses the previous task team,
5155     // if untouched during the previous release->gather phase.
5156 
5157     KF_TRACE(10, (" hot_team = %p\n", team));
5158 
5159 #if KMP_DEBUG
5160     if (__kmp_tasking_mode != tskm_immediate_exec) {
5161       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5162                     "task_team[1] = %p after reinit\n",
5163                     team->t.t_task_team[0], team->t.t_task_team[1]));
5164     }
5165 #endif
5166 
5167 #if OMPT_SUPPORT
5168     __ompt_team_assign_id(team, ompt_parallel_data);
5169 #endif
5170 
5171     KMP_MB();
5172 
5173     return team;
5174   }
5175 
5176   /* next, let's try to take one from the team pool */
5177   KMP_MB();
5178   for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5179     /* TODO: consider resizing undersized teams instead of reaping them, now
5180        that we have a resizing mechanism */
5181     if (team->t.t_max_nproc >= max_nproc) {
5182       /* take this team from the team pool */
5183       __kmp_team_pool = team->t.t_next_pool;
5184 
5185       /* setup the team for fresh use */
5186       __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5187 
5188       KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5189                     "task_team[1] %p to NULL\n",
5190                     &team->t.t_task_team[0], &team->t.t_task_team[1]));
5191       team->t.t_task_team[0] = NULL;
5192       team->t.t_task_team[1] = NULL;
5193 
5194       /* reallocate space for arguments if necessary */
5195       __kmp_alloc_argv_entries(argc, team, TRUE);
5196       KMP_CHECK_UPDATE(team->t.t_argc, argc);
5197 
5198       KA_TRACE(
5199           20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5200                team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5201       { // Initialize barrier data.
5202         int b;
5203         for (b = 0; b < bs_last_barrier; ++b) {
5204           team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5205 #if USE_DEBUGGER
5206           team->t.t_bar[b].b_master_arrived = 0;
5207           team->t.t_bar[b].b_team_arrived = 0;
5208 #endif
5209         }
5210       }
5211 
5212 #if OMP_40_ENABLED
5213       team->t.t_proc_bind = new_proc_bind;
5214 #endif
5215 
5216       KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5217                     team->t.t_id));
5218 
5219 #if OMPT_SUPPORT
5220       __ompt_team_assign_id(team, ompt_parallel_data);
5221 #endif
5222 
5223       KMP_MB();
5224 
5225       return team;
5226     }
5227 
5228     /* reap team if it is too small, then loop back and check the next one */
    // not sure if this is wise, but it will be redone during the hot-teams
    // rewrite.
5231     /* TODO: Use technique to find the right size hot-team, don't reap them */
5232     team = __kmp_reap_team(team);
5233     __kmp_team_pool = team;
5234   }
5235 
5236   /* nothing available in the pool, no matter, make a new team! */
5237   KMP_MB();
5238   team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5239 
5240   /* and set it up */
5241   team->t.t_max_nproc = max_nproc;
5242   /* NOTE well, for some reason allocating one big buffer and dividing it up
5243      seems to really hurt performance a lot on the P4, so, let's not use this */
5244   __kmp_allocate_team_arrays(team, max_nproc);
5245 
5246   KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5247   __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5248 
5249   KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5250                 "%p to NULL\n",
5251                 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5252   team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5253   // memory, no need to duplicate
5254   team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5255   // memory, no need to duplicate
5256 
5257   if (__kmp_storage_map) {
5258     __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5259   }
5260 
5261   /* allocate space for arguments */
5262   __kmp_alloc_argv_entries(argc, team, FALSE);
5263   team->t.t_argc = argc;
5264 
5265   KA_TRACE(20,
5266            ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5267             team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5268   { // Initialize barrier data.
5269     int b;
5270     for (b = 0; b < bs_last_barrier; ++b) {
5271       team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5272 #if USE_DEBUGGER
5273       team->t.t_bar[b].b_master_arrived = 0;
5274       team->t.t_bar[b].b_team_arrived = 0;
5275 #endif
5276     }
5277   }
5278 
5279 #if OMP_40_ENABLED
5280   team->t.t_proc_bind = new_proc_bind;
5281 #endif
5282 
5283 #if OMPT_SUPPORT
5284   __ompt_team_assign_id(team, ompt_parallel_data);
5285   team->t.ompt_serialized_team_info = NULL;
5286 #endif
5287 
5288   KMP_MB();
5289 
5290   KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5291                 team->t.t_id));
5292 
5293   return team;
5294 }
5295 
5296 /* TODO implement hot-teams at all levels */
5297 /* TODO implement lazy thread release on demand (disband request) */
5298 
5299 /* free the team.  return it to the team pool.  release all the threads
5300  * associated with it */
5301 void __kmp_free_team(kmp_root_t *root,
5302                      kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5303   int f;
5304   KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5305                 team->t.t_id));
5306 
5307   /* verify state */
5308   KMP_DEBUG_ASSERT(root);
5309   KMP_DEBUG_ASSERT(team);
5310   KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5311   KMP_DEBUG_ASSERT(team->t.t_threads);
5312 
5313   int use_hot_team = team == root->r.r_hot_team;
5314 #if KMP_NESTED_HOT_TEAMS
5315   int level;
5316   kmp_hot_team_ptr_t *hot_teams;
5317   if (master) {
5318     level = team->t.t_active_level - 1;
5319     if (master->th.th_teams_microtask) { // in teams construct?
5320       if (master->th.th_teams_size.nteams > 1) {
5321         ++level; // level was not increased in teams construct for
5322         // team_of_masters
5323       }
5324       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5325           master->th.th_teams_level == team->t.t_level) {
5326         ++level; // level was not increased in teams construct for
5327         // team_of_workers before the parallel
5328       } // team->t.t_level will be increased inside parallel
5329     }
5330     hot_teams = master->th.th_hot_teams;
5331     if (level < __kmp_hot_teams_max_level) {
5332       KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5333       use_hot_team = 1;
5334     }
5335   }
5336 #endif // KMP_NESTED_HOT_TEAMS
5337 
5338   /* team is done working */
5339   TCW_SYNC_PTR(team->t.t_pkfn,
5340                NULL); // Important for Debugging Support Library.
5341   team->t.t_copyin_counter = 0; // init counter for possible reuse
5342   // Do not reset pointer to parent team to NULL for hot teams.
5343 
5344   /* if we are non-hot team, release our threads */
5345   if (!use_hot_team) {
5346     if (__kmp_tasking_mode != tskm_immediate_exec) {
5347       // Wait for threads to reach reapable state
5348       for (f = 1; f < team->t.t_nproc; ++f) {
5349         KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5350         kmp_info_t *th = team->t.t_threads[f];
5351         volatile kmp_uint32 *state = &th->th.th_reap_state;
5352         while (*state != KMP_SAFE_TO_REAP) {
5353 #if KMP_OS_WINDOWS
5354           // On Windows a thread can be killed at any time, check this
5355           DWORD ecode;
5356           if (!__kmp_is_thread_alive(th, &ecode)) {
5357             *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5358             break;
5359           }
5360 #endif
5361           // first check if thread is sleeping
5362           kmp_flag_64 fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5363           if (fl.is_sleeping())
5364             fl.resume(__kmp_gtid_from_thread(th));
5365           KMP_CPU_PAUSE();
5366         }
5367       }
5368 
5369       // Delete task teams
5370       int tt_idx;
5371       for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5372         kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5373         if (task_team != NULL) {
5374           for (f = 0; f < team->t.t_nproc;
5375                ++f) { // Have all threads unref task teams
5376             team->t.t_threads[f]->th.th_task_team = NULL;
5377           }
5378           KA_TRACE(
5379               20,
5380               ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5381                __kmp_get_gtid(), task_team, team->t.t_id));
5382 #if KMP_NESTED_HOT_TEAMS
5383           __kmp_free_task_team(master, task_team);
5384 #endif
5385           team->t.t_task_team[tt_idx] = NULL;
5386         }
5387       }
5388     }
5389 
5390     // Reset pointer to parent team only for non-hot teams.
5391     team->t.t_parent = NULL;
5392     team->t.t_level = 0;
5393     team->t.t_active_level = 0;
5394 
5395     /* free the worker threads */
5396     for (f = 1; f < team->t.t_nproc; ++f) {
5397       KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5398       __kmp_free_thread(team->t.t_threads[f]);
5399       team->t.t_threads[f] = NULL;
5400     }
5401 
5402     /* put the team back in the team pool */
5403     /* TODO limit size of team pool, call reap_team if pool too large */
5404     team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5405     __kmp_team_pool = (volatile kmp_team_t *)team;
5406   }
5407 
5408   KMP_MB();
5409 }
5410 
5411 /* reap the team.  destroy it, reclaim all its resources and free its memory */
5412 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5413   kmp_team_t *next_pool = team->t.t_next_pool;
5414 
5415   KMP_DEBUG_ASSERT(team);
5416   KMP_DEBUG_ASSERT(team->t.t_dispatch);
5417   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5418   KMP_DEBUG_ASSERT(team->t.t_threads);
5419   KMP_DEBUG_ASSERT(team->t.t_argv);
5420 
5421   /* TODO clean the threads that are a part of this? */
5422 
5423   /* free stuff */
5424   __kmp_free_team_arrays(team);
5425   if (team->t.t_argv != &team->t.t_inline_argv[0])
5426     __kmp_free((void *)team->t.t_argv);
5427   __kmp_free(team);
5428 
5429   KMP_MB();
5430   return next_pool;
5431 }
5432 
5433 // Free the thread.  Don't reap it, just place it on the pool of available
5434 // threads.
5435 //
5436 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5437 // binding for the affinity mechanism to be useful.
5438 //
5439 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5440 // However, we want to avoid a potential performance problem by always
5441 // scanning through the list to find the correct point at which to insert
5442 // the thread (potential N**2 behavior).  To do this we keep track of the
5443 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5444 // With single-level parallelism, threads will always be added to the tail
5445 // of the list, kept track of by __kmp_thread_pool_insert_pt.  With nested
5446 // parallelism, all bets are off and we may need to scan through the entire
5447 // free list.
5448 //
5449 // This change also has a potentially large performance benefit, for some
5450 // applications.  Previously, as threads were freed from the hot team, they
5451 // would be placed back on the free list in inverse order.  If the hot team
// grew back to its original size, then the freed threads would be placed
// back on the hot team in reverse order.  This could cause bad cache
// locality problems on programs where the size of the hot team regularly
// grew and shrank.
5456 //
// Now, for single-level parallelism, the OMP tid is always == gtid.
5458 void __kmp_free_thread(kmp_info_t *this_th) {
5459   int gtid;
5460   kmp_info_t **scan;
5461   kmp_root_t *root = this_th->th.th_root;
5462 
5463   KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5464                 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5465 
5466   KMP_DEBUG_ASSERT(this_th);
5467 
5468   // When moving thread to pool, switch thread to wait on own b_go flag, and
5469   // uninitialized (NULL team).
5470   int b;
5471   kmp_balign_t *balign = this_th->th.th_bar;
5472   for (b = 0; b < bs_last_barrier; ++b) {
5473     if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5474       balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5475     balign[b].bb.team = NULL;
5476     balign[b].bb.leaf_kids = 0;
5477   }
5478   this_th->th.th_task_state = 0;
5479   this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5480 
5481   /* put thread back on the free pool */
5482   TCW_PTR(this_th->th.th_team, NULL);
5483   TCW_PTR(this_th->th.th_root, NULL);
5484   TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5485 
  /* If the implicit task assigned to this thread can be used by other threads,
   * then multiple threads may share the data and try to free the task in
   * __kmp_reap_thread at exit. This duplicate use of the task data can happen
   * with higher probability when the hot team is disabled, but it can occur
   * even when the hot team is enabled. */
5491   __kmp_free_implicit_task(this_th);
5492   this_th->th.th_current_task = NULL;
5493 
5494   // If the __kmp_thread_pool_insert_pt is already past the new insert
5495   // point, then we need to re-scan the entire list.
5496   gtid = this_th->th.th_info.ds.ds_gtid;
5497   if (__kmp_thread_pool_insert_pt != NULL) {
5498     KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5499     if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5500       __kmp_thread_pool_insert_pt = NULL;
5501     }
5502   }
5503 
5504   // Scan down the list to find the place to insert the thread.
5505   // scan is the address of a link in the list, possibly the address of
5506   // __kmp_thread_pool itself.
5507   //
  // In the absence of nested parallelism, the for loop will have 0 iterations.
5509   if (__kmp_thread_pool_insert_pt != NULL) {
5510     scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5511   } else {
5512     scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5513   }
5514   for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5515        scan = &((*scan)->th.th_next_pool))
5516     ;
5517 
5518   // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5519   // to its address.
5520   TCW_PTR(this_th->th.th_next_pool, *scan);
5521   __kmp_thread_pool_insert_pt = *scan = this_th;
5522   KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5523                    (this_th->th.th_info.ds.ds_gtid <
5524                     this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5525   TCW_4(this_th->th.th_in_pool, TRUE);
5526   __kmp_thread_pool_nth++;
5527 
5528   TCW_4(__kmp_nth, __kmp_nth - 1);
5529   root->r.r_cg_nthreads--;
5530 
5531 #ifdef KMP_ADJUST_BLOCKTIME
5532   /* Adjust blocktime back to user setting or default if necessary */
5533   /* Middle initialization might never have occurred                */
5534   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5535     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5536     if (__kmp_nth <= __kmp_avail_proc) {
5537       __kmp_zero_bt = FALSE;
5538     }
5539   }
5540 #endif /* KMP_ADJUST_BLOCKTIME */
5541 
5542   KMP_MB();
5543 }
5544 
5545 /* ------------------------------------------------------------------------ */
5546 
5547 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5548   int gtid = this_thr->th.th_info.ds.ds_gtid;
5549   /*    void                 *stack_data;*/
5550   kmp_team_t *(*volatile pteam);
5551 
5552   KMP_MB();
5553   KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5554 
5555   if (__kmp_env_consistency_check) {
5556     this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5557   }
5558 
5559 #if OMPT_SUPPORT
5560   ompt_data_t *thread_data;
5561   if (ompt_enabled.enabled) {
5562     thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5563     thread_data->ptr = NULL;
5564 
5565     this_thr->th.ompt_thread_info.state = omp_state_overhead;
5566     this_thr->th.ompt_thread_info.wait_id = 0;
5567     this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5568     if (ompt_enabled.ompt_callback_thread_begin) {
5569       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5570           ompt_thread_worker, thread_data);
5571     }
5572   }
5573 #endif
5574 
5575 #if OMPT_SUPPORT
5576   if (ompt_enabled.enabled) {
5577     this_thr->th.ompt_thread_info.state = omp_state_idle;
5578   }
5579 #endif
5580   /* This is the place where threads wait for work */
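  /* Worker main loop: sleep in the fork barrier until a master releases this
     thread with a new team, run the team's microtask via t_invoke, pass
     through the join barrier, and return to waiting -- repeating until
     __kmp_global.g.g_done signals library shutdown. */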
5581   while (!TCR_4(__kmp_global.g.g_done)) {
5582     KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5583     KMP_MB();
5584 
5585     /* wait for work to do */
5586     KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5587 
5588     /* No tid yet since not part of a team */
5589     __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5590 
5591 #if OMPT_SUPPORT
5592     if (ompt_enabled.enabled) {
5593       this_thr->th.ompt_thread_info.state = omp_state_overhead;
5594     }
5595 #endif
5596 
5597     pteam = (kmp_team_t * (*))(&this_thr->th.th_team);
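    // pteam points at this thread's th_team field so that the (volatile) team
    // pointer can be re-read with TCR_SYNC_PTR on each check below.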
5598 
    /* have we been assigned a team? */
5600     if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5601       /* we were just woken up, so run our new task */
5602       if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5603         int rc;
5604         KA_TRACE(20,
5605                  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5606                   gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5607                   (*pteam)->t.t_pkfn));
5608 
5609         updateHWFPControl(*pteam);
5610 
5611 #if OMPT_SUPPORT
5612         if (ompt_enabled.enabled) {
5613           this_thr->th.ompt_thread_info.state = omp_state_work_parallel;
5614         }
5615 #endif
5616 
5617         {
5618           KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
5619           KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
5620           rc = (*pteam)->t.t_invoke(gtid);
5621         }
5622         KMP_ASSERT(rc);
5623 
5624         KMP_MB();
5625         KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5626                       gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5627                       (*pteam)->t.t_pkfn));
5628       }
5629 #if OMPT_SUPPORT
5630       if (ompt_enabled.enabled) {
5631         /* no frame set while outside task */
5632         __ompt_get_task_info_object(0)->frame.exit_frame = NULL;
5633 
5634         this_thr->th.ompt_thread_info.state = omp_state_overhead;
5635       }
5636 #endif
5637       /* join barrier after parallel region */
5638       __kmp_join_barrier(gtid);
5639     }
5640   }
5641   TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5642 
5643 #if OMPT_SUPPORT
5644   if (ompt_enabled.ompt_callback_thread_end) {
5645     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5646   }
5647 #endif
5648 
5649   this_thr->th.th_task_team = NULL;
5650   /* run the destructors for the threadprivate data for this thread */
5651   __kmp_common_destroy_gtid(gtid);
5652 
5653   KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5654   KMP_MB();
5655   return this_thr;
5656 }
5657 
5658 /* ------------------------------------------------------------------------ */
5659 
5660 void __kmp_internal_end_dest(void *specific_gtid) {
5661 #if KMP_COMPILER_ICC
5662 #pragma warning(push)
5663 #pragma warning(disable : 810) // conversion from "void *" to "int" may lose
5664 // significant bits
5665 #endif
5666   // Make sure no significant bits are lost
5667   int gtid = (kmp_intptr_t)specific_gtid - 1;
5668 #if KMP_COMPILER_ICC
5669 #pragma warning(pop)
5670 #endif
5671 
5672   KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
  /* NOTE: the gtid is stored as gtid+1 in the thread-local storage
   * because 0 is reserved for the nothing-stored case */
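  /* For example (values illustrative), a thread with gtid 4 stores 5 in its
     thread-local slot, and the subtraction above recovers 4; a stored value of
     0 therefore unambiguously means "no gtid stored". */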
5675 
5676   /* josh: One reason for setting the gtid specific data even when it is being
5677      destroyed by pthread is to allow gtid lookup through thread specific data
5678      (__kmp_gtid_get_specific).  Some of the code, especially stat code,
5679      that gets executed in the call to __kmp_internal_end_thread, actually
5680      gets the gtid through the thread specific data.  Setting it here seems
5681      rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread
5682      to run smoothly.
5683      todo: get rid of this after we remove the dependence on
5684      __kmp_gtid_get_specific  */
5685   if (gtid >= 0 && KMP_UBER_GTID(gtid))
5686     __kmp_gtid_set_specific(gtid);
5687 #ifdef KMP_TDATA_GTID
5688   __kmp_gtid = gtid;
5689 #endif
5690   __kmp_internal_end_thread(gtid);
5691 }
5692 
5693 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5694 
// 2009-09-08 (lev): It looks like the destructor does not work. In simple test
// cases destructors work perfectly, but in the real libomp.so I have no
// evidence it is ever called. However, the -fini linker option in makefile.mk
// works fine.
5698 
5699 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5700   __kmp_internal_end_atexit();
5701 }
5702 
5703 void __kmp_internal_end_fini(void) { __kmp_internal_end_atexit(); }
5704 
5705 #endif
5706 
5707 /* [Windows] josh: when the atexit handler is called, there may still be more
5708    than one thread alive */
5709 void __kmp_internal_end_atexit(void) {
5710   KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5711   /* [Windows]
     josh: ideally, we want to completely shut down the library in this atexit
     handler, but stat code that depends on thread specific data for gtid fails
     because that data becomes unavailable at some point during the shutdown, so
     we call __kmp_internal_end_thread instead. We should eventually remove the
     dependency on __kmp_get_specific_gtid in the stat code and use
     __kmp_internal_end_library to shut down the library cleanly.
5718 
5719      // TODO: Can some of this comment about GVS be removed?
5720      I suspect that the offending stat code is executed when the calling thread
5721      tries to clean up a dead root thread's data structures, resulting in GVS
5722      code trying to close the GVS structures for that thread, but since the stat
5723      code uses __kmp_get_specific_gtid to get the gtid with the assumption that
     the calling thread is cleaning up itself instead of another thread, it gets
     confused. This happens because allowing a thread to unregister and clean up
5726      another thread is a recent modification for addressing an issue.
5727      Based on the current design (20050722), a thread may end up
5728      trying to unregister another thread only if thread death does not trigger
5729      the calling of __kmp_internal_end_thread.  For Linux* OS, there is the
5730      thread specific data destructor function to detect thread death. For
5731      Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
     is nothing.  Thus, the workaround is applicable only for the Windows static
     stat library. */
5734   __kmp_internal_end_library(-1);
5735 #if KMP_OS_WINDOWS
5736   __kmp_close_console();
5737 #endif
5738 }
5739 
5740 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5741   // It is assumed __kmp_forkjoin_lock is acquired.
5742 
5743   int gtid;
5744 
5745   KMP_DEBUG_ASSERT(thread != NULL);
5746 
5747   gtid = thread->th.th_info.ds.ds_gtid;
5748 
5749   if (!is_root) {
5750 
5751     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5752       /* Assume the threads are at the fork barrier here */
5753       KA_TRACE(
5754           20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5755                gtid));
5756       /* Need release fence here to prevent seg faults for tree forkjoin barrier
5757        * (GEH) */
5758       ANNOTATE_HAPPENS_BEFORE(thread);
5759       kmp_flag_64 flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
5760       __kmp_release_64(&flag);
5761     }
5762 
5763     // Terminate OS thread.
5764     __kmp_reap_worker(thread);
5765 
5766     // The thread was killed asynchronously.  If it was actively
5767     // spinning in the thread pool, decrement the global count.
5768     //
5769     // There is a small timing hole here - if the worker thread was just waking
    // up after sleeping in the pool, had reset its th_active_in_pool flag but
5771     // not decremented the global counter __kmp_thread_pool_active_nth yet, then
5772     // the global counter might not get updated.
5773     //
5774     // Currently, this can only happen as the library is unloaded,
5775     // so there are no harmful side effects.
5776     if (thread->th.th_active_in_pool) {
5777       thread->th.th_active_in_pool = FALSE;
5778       KMP_TEST_THEN_DEC32(&__kmp_thread_pool_active_nth);
5779       KMP_DEBUG_ASSERT(TCR_4(__kmp_thread_pool_active_nth) >= 0);
5780     }
5781 
5782     // Decrement # of [worker] threads in the pool.
5783     KMP_DEBUG_ASSERT(__kmp_thread_pool_nth > 0);
5784     --__kmp_thread_pool_nth;
5785   }
5786 
5787   __kmp_free_implicit_task(thread);
5788 
5789 // Free the fast memory for tasking
5790 #if USE_FAST_MEMORY
5791   __kmp_free_fast_memory(thread);
5792 #endif /* USE_FAST_MEMORY */
5793 
5794   __kmp_suspend_uninitialize_thread(thread);
5795 
5796   KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
5797   TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5798 
5799   --__kmp_all_nth;
// __kmp_nth was decremented when the thread was added to the pool.
5801 
5802 #ifdef KMP_ADJUST_BLOCKTIME
5803   /* Adjust blocktime back to user setting or default if necessary */
5804   /* Middle initialization might never have occurred                */
5805   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5806     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5807     if (__kmp_nth <= __kmp_avail_proc) {
5808       __kmp_zero_bt = FALSE;
5809     }
5810   }
5811 #endif /* KMP_ADJUST_BLOCKTIME */
5812 
5813   /* free the memory being used */
5814   if (__kmp_env_consistency_check) {
5815     if (thread->th.th_cons) {
5816       __kmp_free_cons_stack(thread->th.th_cons);
5817       thread->th.th_cons = NULL;
5818     }
5819   }
5820 
5821   if (thread->th.th_pri_common != NULL) {
5822     __kmp_free(thread->th.th_pri_common);
5823     thread->th.th_pri_common = NULL;
5824   }
5825 
5826   if (thread->th.th_task_state_memo_stack != NULL) {
5827     __kmp_free(thread->th.th_task_state_memo_stack);
5828     thread->th.th_task_state_memo_stack = NULL;
5829   }
5830 
5831 #if KMP_USE_BGET
5832   if (thread->th.th_local.bget_data != NULL) {
5833     __kmp_finalize_bget(thread);
5834   }
5835 #endif
5836 
5837 #if KMP_AFFINITY_SUPPORTED
5838   if (thread->th.th_affin_mask != NULL) {
5839     KMP_CPU_FREE(thread->th.th_affin_mask);
5840     thread->th.th_affin_mask = NULL;
5841   }
5842 #endif /* KMP_AFFINITY_SUPPORTED */
5843 
5844   __kmp_reap_team(thread->th.th_serial_team);
5845   thread->th.th_serial_team = NULL;
5846   __kmp_free(thread);
5847 
5848   KMP_MB();
5849 
5850 } // __kmp_reap_thread
5851 
5852 static void __kmp_internal_end(void) {
5853   int i;
5854 
5855   /* First, unregister the library */
5856   __kmp_unregister_library();
5857 
5858 #if KMP_OS_WINDOWS
5859   /* In Win static library, we can't tell when a root actually dies, so we
5860      reclaim the data structures for any root threads that have died but not
5861      unregistered themselves, in order to shut down cleanly.
5862      In Win dynamic library we also can't tell when a thread dies.  */
5863   __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
5864 // dead roots
5865 #endif
5866 
5867   for (i = 0; i < __kmp_threads_capacity; i++)
5868     if (__kmp_root[i])
5869       if (__kmp_root[i]->r.r_active)
5870         break;
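  // After this scan, i indexes the first still-active root, or equals
  // __kmp_threads_capacity if no roots are active; the branch below keys off
  // that distinction.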
5871   KMP_MB(); /* Flush all pending memory write invalidates.  */
5872   TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
5873 
5874   if (i < __kmp_threads_capacity) {
5875 #if KMP_USE_MONITOR
5876     // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
5877     KMP_MB(); /* Flush all pending memory write invalidates.  */
5878 
    // Need to check that the monitor was initialized before reaping it. If we
    // are called from __kmp_atfork_child (which sets __kmp_init_parallel = 0),
    // then __kmp_monitor will appear to contain valid data, but it is only
    // valid in the parent process, not the child.
5883     // New behavior (201008): instead of keying off of the flag
5884     // __kmp_init_parallel, the monitor thread creation is keyed off
5885     // of the new flag __kmp_init_monitor.
5886     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
5887     if (TCR_4(__kmp_init_monitor)) {
5888       __kmp_reap_monitor(&__kmp_monitor);
5889       TCW_4(__kmp_init_monitor, 0);
5890     }
5891     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
5892     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
5893 #endif // KMP_USE_MONITOR
5894   } else {
5895 /* TODO move this to cleanup code */
5896 #ifdef KMP_DEBUG
5897     /* make sure that everything has properly ended */
5898     for (i = 0; i < __kmp_threads_capacity; i++) {
5899       if (__kmp_root[i]) {
        // KMP_ASSERT(!KMP_UBER_GTID(i)); // AC: there can be uber threads
        // alive here
5902         KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
5903       }
5904     }
5905 #endif
5906 
5907     KMP_MB();
5908 
5909     // Reap the worker threads.
5910     // This is valid for now, but be careful if threads are reaped sooner.
    while (__kmp_thread_pool != NULL) { // Loop over all threads in the pool.
5912       // Get the next thread from the pool.
5913       kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
5914       __kmp_thread_pool = thread->th.th_next_pool;
5915       // Reap it.
5916       KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
5917       thread->th.th_next_pool = NULL;
5918       thread->th.th_in_pool = FALSE;
5919       __kmp_reap_thread(thread, 0);
5920     }
5921     __kmp_thread_pool_insert_pt = NULL;
5922 
5923     // Reap teams.
    while (__kmp_team_pool != NULL) { // Loop over all teams in the pool.
5925       // Get the next team from the pool.
5926       kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
5927       __kmp_team_pool = team->t.t_next_pool;
5928       // Reap it.
5929       team->t.t_next_pool = NULL;
5930       __kmp_reap_team(team);
5931     }
5932 
5933     __kmp_reap_task_teams();
5934 
5935     for (i = 0; i < __kmp_threads_capacity; ++i) {
5936       // TBD: Add some checking...
5937       // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
5938     }
5939 
5940     /* Make sure all threadprivate destructors get run by joining with all
5941        worker threads before resetting this flag */
5942     TCW_SYNC_4(__kmp_init_common, FALSE);
5943 
5944     KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
5945     KMP_MB();
5946 
5947 #if KMP_USE_MONITOR
5948     // See note above: One of the possible fixes for CQ138434 / CQ140126
5949     //
5950     // FIXME: push both code fragments down and CSE them?
5951     // push them into __kmp_cleanup() ?
5952     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
5953     if (TCR_4(__kmp_init_monitor)) {
5954       __kmp_reap_monitor(&__kmp_monitor);
5955       TCW_4(__kmp_init_monitor, 0);
5956     }
5957     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
5958     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
5959 #endif
5960   } /* else !__kmp_global.t_active */
5961   TCW_4(__kmp_init_gtid, FALSE);
5962   KMP_MB(); /* Flush all pending memory write invalidates.  */
5963 
5964   __kmp_cleanup();
5965 #if OMPT_SUPPORT
5966   ompt_fini();
5967 #endif
5968 }
5969 
5970 void __kmp_internal_end_library(int gtid_req) {
5971   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
5972   /* this shouldn't be a race condition because __kmp_internal_end() is the
5973      only place to clear __kmp_serial_init */
5974   /* we'll check this later too, after we get the lock */
5975   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
  // redundant, because the next check will work in any case.
5977   if (__kmp_global.g.g_abort) {
5978     KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
5979     /* TODO abort? */
5980     return;
5981   }
5982   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
5983     KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
5984     return;
5985   }
5986 
5987   KMP_MB(); /* Flush all pending memory write invalidates.  */
5988 
5989   /* find out who we are and what we should do */
5990   {
5991     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
5992     KA_TRACE(
5993         10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req));
5994     if (gtid == KMP_GTID_SHUTDOWN) {
5995       KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
5996                     "already shutdown\n"));
5997       return;
5998     } else if (gtid == KMP_GTID_MONITOR) {
5999       KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6000                     "registered, or system shutdown\n"));
6001       return;
6002     } else if (gtid == KMP_GTID_DNE) {
6003       KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6004                     "shutdown\n"));
      /* we don't know who we are, but we may still shut down the library */
6006     } else if (KMP_UBER_GTID(gtid)) {
6007       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6008       if (__kmp_root[gtid]->r.r_active) {
6009         __kmp_global.g.g_abort = -1;
6010         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6011         KA_TRACE(10,
6012                  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6013                   gtid));
6014         return;
6015       } else {
6016         KA_TRACE(
6017             10,
6018             ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6019         __kmp_unregister_root_current_thread(gtid);
6020       }
6021     } else {
6022 /* worker threads may call this function through the atexit handler, if they
6023  * call exit() */
6024 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6025    TODO: do a thorough shutdown instead */
6026 #ifdef DUMP_DEBUG_ON_EXIT
6027       if (__kmp_debug_buf)
6028         __kmp_dump_debug_buffer();
6029 #endif
6030       return;
6031     }
6032   }
6033   /* synchronize the termination process */
6034   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6035 
6036   /* have we already finished */
6037   if (__kmp_global.g.g_abort) {
6038     KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6039     /* TODO abort? */
6040     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6041     return;
6042   }
6043   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6044     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6045     return;
6046   }
6047 
  /* We need this lock to enforce mutual exclusion between this reading of
6049      __kmp_threads_capacity and the writing by __kmp_register_root.
6050      Alternatively, we can use a counter of roots that is atomically updated by
6051      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6052      __kmp_internal_end_*.  */
6053   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6054 
6055   /* now we can safely conduct the actual termination */
6056   __kmp_internal_end();
6057 
6058   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6059   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6060 
6061   KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6062 
6063 #ifdef DUMP_DEBUG_ON_EXIT
6064   if (__kmp_debug_buf)
6065     __kmp_dump_debug_buffer();
6066 #endif
6067 
6068 #if KMP_OS_WINDOWS
6069   __kmp_close_console();
6070 #endif
6071 
6072   __kmp_fini_allocator();
6073 
6074 } // __kmp_internal_end_library
6075 
6076 void __kmp_internal_end_thread(int gtid_req) {
6077   int i;
6078 
6079   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6080   /* this shouldn't be a race condition because __kmp_internal_end() is the
6081    * only place to clear __kmp_serial_init */
6082   /* we'll check this later too, after we get the lock */
6083   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6084   // redundant, because the next check will work in any case.
6085   if (__kmp_global.g.g_abort) {
6086     KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6087     /* TODO abort? */
6088     return;
6089   }
6090   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6091     KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6092     return;
6093   }
6094 
6095   KMP_MB(); /* Flush all pending memory write invalidates.  */
6096 
6097   /* find out who we are and what we should do */
6098   {
6099     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6100     KA_TRACE(10,
6101              ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req));
6102     if (gtid == KMP_GTID_SHUTDOWN) {
6103       KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6104                     "already shutdown\n"));
6105       return;
6106     } else if (gtid == KMP_GTID_MONITOR) {
6107       KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6108                     "registered, or system shutdown\n"));
6109       return;
6110     } else if (gtid == KMP_GTID_DNE) {
6111       KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6112                     "shutdown\n"));
6113       return;
6114       /* we don't know who we are */
6115     } else if (KMP_UBER_GTID(gtid)) {
6116       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6117       if (__kmp_root[gtid]->r.r_active) {
6118         __kmp_global.g.g_abort = -1;
6119         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6120         KA_TRACE(10,
6121                  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6122                   gtid));
6123         return;
6124       } else {
6125         KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6126                       gtid));
6127         __kmp_unregister_root_current_thread(gtid);
6128       }
6129     } else {
6130       /* just a worker thread, let's leave */
6131       KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6132 
6133       if (gtid >= 0) {
6134         __kmp_threads[gtid]->th.th_task_team = NULL;
6135       }
6136 
6137       KA_TRACE(10,
6138                ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6139                 gtid));
6140       return;
6141     }
6142   }
6143 #if defined KMP_DYNAMIC_LIB
  // AC: let's not shut down the Linux* OS dynamic library at the exit of an
  // uber thread, because it is better to shut down later in the library
  // destructor. The reason for this change is a performance problem when a
  // non-OpenMP thread repeatedly forks and joins many OpenMP threads in a loop.
  // We can save a lot of time by keeping worker threads alive until program
  // shutdown.
  // OM: Removed the Linux* OS restriction to fix the crash on OS X*
  // (DPD200239966) and Windows (DPD200287443) that occurs when using critical
  // sections from foreign threads.
6152   KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6153   return;
6154 #endif
6155   /* synchronize the termination process */
6156   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6157 
6158   /* have we already finished */
6159   if (__kmp_global.g.g_abort) {
6160     KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6161     /* TODO abort? */
6162     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6163     return;
6164   }
6165   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6166     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6167     return;
6168   }
6169 
  /* We need this lock to enforce mutual exclusion between this reading of
6171      __kmp_threads_capacity and the writing by __kmp_register_root.
6172      Alternatively, we can use a counter of roots that is atomically updated by
6173      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6174      __kmp_internal_end_*.  */
6175 
6176   /* should we finish the run-time?  are all siblings done? */
6177   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6178 
6179   for (i = 0; i < __kmp_threads_capacity; ++i) {
6180     if (KMP_UBER_GTID(i)) {
6181       KA_TRACE(
6182           10,
6183           ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6184       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6185       __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6186       return;
6187     }
6188   }
6189 
6190   /* now we can safely conduct the actual termination */
6191 
6192   __kmp_internal_end();
6193 
6194   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6195   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6196 
6197   KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6198 
6199 #ifdef DUMP_DEBUG_ON_EXIT
6200   if (__kmp_debug_buf)
6201     __kmp_dump_debug_buffer();
6202 #endif
6203 } // __kmp_internal_end_thread
6204 
6205 // -----------------------------------------------------------------------------
6206 // Library registration stuff.
6207 
6208 static long __kmp_registration_flag = 0;
6209 // Random value used to indicate library initialization.
6210 static char *__kmp_registration_str = NULL;
6211 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6212 
6213 static inline char *__kmp_reg_status_name() {
6214   /* On RHEL 3u5 if linked statically, getpid() returns different values in
6215      each thread. If registration and unregistration go in different threads
     (omp_misc_other_root_exit.cpp test case), the registered_lib_env env var
     cannot be found, because its name will contain a different pid. */
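  // For example, a process whose pid is 12345 (an illustrative value) would
  // use the environment variable name __KMP_REGISTERED_LIB_12345.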
6218   return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
} // __kmp_reg_status_name
6220 
6221 void __kmp_register_library_startup(void) {
6222 
6223   char *name = __kmp_reg_status_name(); // Name of the environment variable.
6224   int done = 0;
6225   union {
6226     double dtime;
6227     long ltime;
6228   } time;
6229 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6230   __kmp_initialize_system_tick();
6231 #endif
6232   __kmp_read_system_time(&time.dtime);
6233   __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6234   __kmp_registration_str =
6235       __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6236                        __kmp_registration_flag, KMP_LIBRARY_FILE);
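  // The registration string therefore has the form
  // "<flag address>-<flag value in hex>-<library file>", e.g.
  // "0x7f12a3b40010-cafe1a2b-libomp.so" (values illustrative). The
  // conflict-detection path below splits it back apart on '-'.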
6237 
6238   KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6239                 __kmp_registration_str));
6240 
6241   while (!done) {
6242 
6243     char *value = NULL; // Actual value of the environment variable.
6244 
    // Set the environment variable, but do not overwrite it if it exists.
6246     __kmp_env_set(name, __kmp_registration_str, 0);
    // Check that the variable was actually written.
6248     value = __kmp_env_get(name);
6249     if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6250 
6251       done = 1; // Ok, environment variable set successfully, exit the loop.
6252 
6253     } else {
6254 
6255       // Oops. Write failed. Another copy of OpenMP RTL is in memory.
      // Check whether it is alive or dead.
6257       int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6258       char *tail = value;
6259       char *flag_addr_str = NULL;
6260       char *flag_val_str = NULL;
6261       char const *file_name = NULL;
6262       __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6263       __kmp_str_split(tail, '-', &flag_val_str, &tail);
6264       file_name = tail;
6265       if (tail != NULL) {
6266         long *flag_addr = 0;
6267         long flag_val = 0;
6268         KMP_SSCANF(flag_addr_str, "%p", &flag_addr);
6269         KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6270         if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6271           // First, check whether environment-encoded address is mapped into
6272           // addr space.
6273           // If so, dereference it to see if it still has the right value.
6274           if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6275             neighbor = 1;
6276           } else {
6277             // If not, then we know the other copy of the library is no longer
6278             // running.
6279             neighbor = 2;
6280           }
6281         }
6282       }
6283       switch (neighbor) {
6284       case 0: // Cannot parse environment variable -- neighbor status unknown.
        // Assume it is the incompatible format of a future version of the
        // library. Assume the other library is alive.
6287         // WARN( ... ); // TODO: Issue a warning.
6288         file_name = "unknown library";
      // Attention! Falling through to the next case. That's intentional.
6290       case 1: { // Neighbor is alive.
        // Check whether that is allowed.
6292         char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6293         if (!__kmp_str_match_true(duplicate_ok)) {
6294           // That's not allowed. Issue fatal error.
6295           __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6296                       KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6297         }
6298         KMP_INTERNAL_FREE(duplicate_ok);
6299         __kmp_duplicate_library_ok = 1;
6300         done = 1; // Exit the loop.
6301       } break;
6302       case 2: { // Neighbor is dead.
6303         // Clear the variable and try to register library again.
6304         __kmp_env_unset(name);
6305       } break;
6306       default: { KMP_DEBUG_ASSERT(0); } break;
6307       }
6308     }
6309     KMP_INTERNAL_FREE((void *)value);
6310   }
6311   KMP_INTERNAL_FREE((void *)name);
6312 
6313 } // func __kmp_register_library_startup
6314 
6315 void __kmp_unregister_library(void) {
6316 
6317   char *name = __kmp_reg_status_name();
6318   char *value = __kmp_env_get(name);
6319 
6320   KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6321   KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6322   if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6323     // Ok, this is our variable. Delete it.
6324     __kmp_env_unset(name);
6325   }
6326 
6327   KMP_INTERNAL_FREE(__kmp_registration_str);
6328   KMP_INTERNAL_FREE(value);
6329   KMP_INTERNAL_FREE(name);
6330 
6331   __kmp_registration_flag = 0;
6332   __kmp_registration_str = NULL;
6333 
6334 } // __kmp_unregister_library
6335 
6336 // End of Library registration stuff.
6337 // -----------------------------------------------------------------------------
6338 
6339 #if KMP_MIC_SUPPORTED
6340 
6341 static void __kmp_check_mic_type() {
6342   kmp_cpuid_t cpuid_state = {0};
6343   kmp_cpuid_t *cs_p = &cpuid_state;
6344   __kmp_x86_cpuid(1, 0, cs_p);
6345   // We don't support mic1 at the moment
6346   if ((cs_p->eax & 0xff0) == 0xB10) {
6347     __kmp_mic_type = mic2;
6348   } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6349     __kmp_mic_type = mic3;
6350   } else {
6351     __kmp_mic_type = non_mic;
6352   }
6353 }
6354 
6355 #endif /* KMP_MIC_SUPPORTED */
6356 
6357 static void __kmp_do_serial_initialize(void) {
6358   int i, gtid;
6359   int size;
6360 
6361   KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6362 
6363   KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6364   KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6365   KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6366   KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6367   KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6368 
6369 #if OMPT_SUPPORT
6370   ompt_pre_init();
6371 #endif
6372 
6373   __kmp_validate_locks();
6374 
6375   /* Initialize internal memory allocator */
6376   __kmp_init_allocator();
6377 
6378   /* Register the library startup via an environment variable and check to see
6379      whether another copy of the library is already registered. */
6380 
6381   __kmp_register_library_startup();
6382 
6383   /* TODO reinitialization of library */
6384   if (TCR_4(__kmp_global.g.g_done)) {
6385     KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6386   }
6387 
6388   __kmp_global.g.g_abort = 0;
6389   TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6390 
6391 /* initialize the locks */
6392 #if KMP_USE_ADAPTIVE_LOCKS
6393 #if KMP_DEBUG_ADAPTIVE_LOCKS
6394   __kmp_init_speculative_stats();
6395 #endif
6396 #endif
6397 #if KMP_STATS_ENABLED
6398   __kmp_stats_init();
6399 #endif
6400   __kmp_init_lock(&__kmp_global_lock);
6401   __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6402   __kmp_init_lock(&__kmp_debug_lock);
6403   __kmp_init_atomic_lock(&__kmp_atomic_lock);
6404   __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6405   __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6406   __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6407   __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6408   __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6409   __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6410   __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6411   __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6412   __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6413   __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6414   __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6415   __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6416   __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6417   __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6418 #if KMP_USE_MONITOR
6419   __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6420 #endif
6421   __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6422 
6423   /* conduct initialization and initial setup of configuration */
6424 
6425   __kmp_runtime_initialize();
6426 
6427 #if KMP_MIC_SUPPORTED
6428   __kmp_check_mic_type();
6429 #endif
6430 
6431 // Some global variable initialization moved here from kmp_env_initialize()
6432 #ifdef KMP_DEBUG
6433   kmp_diag = 0;
6434 #endif
6435   __kmp_abort_delay = 0;
6436 
6437   // From __kmp_init_dflt_team_nth()
6438   /* assume the entire machine will be used */
6439   __kmp_dflt_team_nth_ub = __kmp_xproc;
6440   if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6441     __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6442   }
6443   if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6444     __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6445   }
6446   __kmp_max_nth = __kmp_sys_max_nth;
6447   __kmp_cg_max_nth = __kmp_sys_max_nth;
6448   __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6449   if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6450     __kmp_teams_max_nth = __kmp_sys_max_nth;
6451   }
6452 
6453   // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6454   // part
6455   __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6456 #if KMP_USE_MONITOR
6457   __kmp_monitor_wakeups =
6458       KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6459   __kmp_bt_intervals =
6460       KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6461 #endif
6462   // From "KMP_LIBRARY" part of __kmp_env_initialize()
6463   __kmp_library = library_throughput;
6464   // From KMP_SCHEDULE initialization
6465   __kmp_static = kmp_sch_static_balanced;
// AC: do not use analytical here, because it is non-monotonic
6467 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6468 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6469 // need to repeat assignment
6470 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6471 // bit control and barrier method control parts
6472 #if KMP_FAST_REDUCTION_BARRIER
6473 #define kmp_reduction_barrier_gather_bb ((int)1)
6474 #define kmp_reduction_barrier_release_bb ((int)1)
6475 #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6476 #define kmp_reduction_barrier_release_pat bp_hyper_bar
6477 #endif // KMP_FAST_REDUCTION_BARRIER
6478   for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6479     __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6480     __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6481     __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6482     __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6483 #if KMP_FAST_REDUCTION_BARRIER
    if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only
      // (lin_64): hyper,1
6486       __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6487       __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6488       __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6489       __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6490     }
6491 #endif // KMP_FAST_REDUCTION_BARRIER
6492   }
6493 #if KMP_FAST_REDUCTION_BARRIER
6494 #undef kmp_reduction_barrier_release_pat
6495 #undef kmp_reduction_barrier_gather_pat
6496 #undef kmp_reduction_barrier_release_bb
6497 #undef kmp_reduction_barrier_gather_bb
6498 #endif // KMP_FAST_REDUCTION_BARRIER
6499 #if KMP_MIC_SUPPORTED
6500   if (__kmp_mic_type == mic2) { // KNC
    // AC: plain=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6502     __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6503     __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6504         1; // forkjoin release
6505     __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6506     __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6507   }
6508 #if KMP_FAST_REDUCTION_BARRIER
6509   if (__kmp_mic_type == mic2) { // KNC
6510     __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6511     __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6512   }
6513 #endif // KMP_FAST_REDUCTION_BARRIER
6514 #endif // KMP_MIC_SUPPORTED
6515 
6516 // From KMP_CHECKS initialization
6517 #ifdef KMP_DEBUG
6518   __kmp_env_checks = TRUE; /* development versions have the extra checks */
6519 #else
6520   __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6521 #endif
6522 
6523   // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6524   __kmp_foreign_tp = TRUE;
6525 
6526   __kmp_global.g.g_dynamic = FALSE;
6527   __kmp_global.g.g_dynamic_mode = dynamic_default;
6528 
6529   __kmp_env_initialize(NULL);
6530 
6531 // Print all messages in message catalog for testing purposes.
6532 #ifdef KMP_DEBUG
6533   char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6534   if (__kmp_str_match_true(val)) {
6535     kmp_str_buf_t buffer;
6536     __kmp_str_buf_init(&buffer);
6537     __kmp_i18n_dump_catalog(&buffer);
6538     __kmp_printf("%s", buffer.str);
6539     __kmp_str_buf_free(&buffer);
6540   }
6541   __kmp_env_free(&val);
6542 #endif
6543 
6544   __kmp_threads_capacity =
6545       __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6546   // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6547   __kmp_tp_capacity = __kmp_default_tp_capacity(
6548       __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6549 
6550   // If the library is shut down properly, both pools must be NULL. Just in
6551   // case, set them to NULL -- some memory may leak, but subsequent code will
6552   // work even if pools are not freed.
6553   KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6554   KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6555   KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6556   __kmp_thread_pool = NULL;
6557   __kmp_thread_pool_insert_pt = NULL;
6558   __kmp_team_pool = NULL;
6559 
6560   /* Allocate all of the variable sized records */
6561   /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6562    * expandable */
6563   /* Since allocation is cache-aligned, just add extra padding at the end */
6564   size =
6565       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6566       CACHE_LINE;
6567   __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6568   __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6569                                sizeof(kmp_info_t *) * __kmp_threads_capacity);
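  // Layout of the single cache-aligned allocation (cap == __kmp_threads_capacity):
  //   [ __kmp_threads[0 .. cap-1] | __kmp_root[0 .. cap-1] | CACHE_LINE pad ]
  // __kmp_root simply points into the same block, just past the thread entries.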
6570 
6571   /* init thread counts */
6572   KMP_DEBUG_ASSERT(__kmp_all_nth ==
6573                    0); // Asserts fail if the library is reinitializing and
6574   KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
6575   __kmp_all_nth = 0;
6576   __kmp_nth = 0;
6577 
6578   /* setup the uber master thread and hierarchy */
6579   gtid = __kmp_register_root(TRUE);
6580   KA_TRACE(10, ("__kmp_do_serial_initialize  T#%d\n", gtid));
6581   KMP_ASSERT(KMP_UBER_GTID(gtid));
6582   KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6583 
6584   KMP_MB(); /* Flush all pending memory write invalidates.  */
6585 
6586   __kmp_common_initialize();
6587 
6588 #if KMP_OS_UNIX
6589   /* invoke the child fork handler */
6590   __kmp_register_atfork();
6591 #endif
6592 
6593 #if !defined KMP_DYNAMIC_LIB
6594   {
6595     /* Invoke the exit handler when the program finishes, only for static
6596        library. For dynamic library, we already have _fini and DllMain. */
6597     int rc = atexit(__kmp_internal_end_atexit);
6598     if (rc != 0) {
6599       __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6600                   __kmp_msg_null);
6601     }
6602   }
6603 #endif
6604 
6605 #if KMP_HANDLE_SIGNALS
6606 #if KMP_OS_UNIX
6607   /* NOTE: make sure that this is called before the user installs their own
6608      signal handlers so that the user handlers are called first. this way they
6609      can return false, not call our handler, avoid terminating the library, and
6610      continue execution where they left off. */
6611   __kmp_install_signals(FALSE);
6612 #endif /* KMP_OS_UNIX */
6613 #if KMP_OS_WINDOWS
6614   __kmp_install_signals(TRUE);
6615 #endif /* KMP_OS_WINDOWS */
6616 #endif
6617 
6618   /* we have finished the serial initialization */
6619   __kmp_init_counter++;
6620 
6621   __kmp_init_serial = TRUE;
6622 
6623   if (__kmp_settings) {
6624     __kmp_env_print();
6625   }
6626 
6627 #if OMP_40_ENABLED
6628   if (__kmp_display_env || __kmp_display_env_verbose) {
6629     __kmp_env_print_2();
6630   }
6631 #endif // OMP_40_ENABLED
6632 
6633 #if OMPT_SUPPORT
6634   ompt_post_init();
6635 #endif
6636 
6637   KMP_MB();
6638 
6639   KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
6640 }
6641 
6642 void __kmp_serial_initialize(void) {
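  // Double-checked initialization: the unlocked test keeps the common
  // already-initialized path cheap, and the second test under __kmp_initz_lock
  // guards against a racing initializer.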
6643   if (__kmp_init_serial) {
6644     return;
6645   }
6646   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6647   if (__kmp_init_serial) {
6648     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6649     return;
6650   }
6651   __kmp_do_serial_initialize();
6652   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6653 }
6654 
6655 static void __kmp_do_middle_initialize(void) {
6656   int i, j;
6657   int prev_dflt_team_nth;
6658 
6659   if (!__kmp_init_serial) {
6660     __kmp_do_serial_initialize();
6661   }
6662 
6663   KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
6664 
6665   // Save the previous value for the __kmp_dflt_team_nth so that
6666   // we can avoid some reinitialization if it hasn't changed.
6667   prev_dflt_team_nth = __kmp_dflt_team_nth;
6668 
6669 #if KMP_AFFINITY_SUPPORTED
6670   // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6671   // number of cores on the machine.
6672   __kmp_affinity_initialize();
6673 
6674   // Run through the __kmp_threads array and set the affinity mask
6675   // for each root thread that is currently registered with the RTL.
6676   for (i = 0; i < __kmp_threads_capacity; i++) {
6677     if (TCR_PTR(__kmp_threads[i]) != NULL) {
6678       __kmp_affinity_set_init_mask(i, TRUE);
6679     }
6680   }
6681 #endif /* KMP_AFFINITY_SUPPORTED */
6682 
6683   KMP_ASSERT(__kmp_xproc > 0);
6684   if (__kmp_avail_proc == 0) {
6685     __kmp_avail_proc = __kmp_xproc;
6686   }
6687 
6688   // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
6689   // correct them now
6690   j = 0;
6691   while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
6692     __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
6693         __kmp_avail_proc;
6694     j++;
6695   }
6696 
6697   if (__kmp_dflt_team_nth == 0) {
6698 #ifdef KMP_DFLT_NTH_CORES
6699     // Default #threads = #cores
6700     __kmp_dflt_team_nth = __kmp_ncores;
6701     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6702                   "__kmp_ncores (%d)\n",
6703                   __kmp_dflt_team_nth));
6704 #else
6705     // Default #threads = #available OS procs
6706     __kmp_dflt_team_nth = __kmp_avail_proc;
6707     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6708                   "__kmp_avail_proc(%d)\n",
6709                   __kmp_dflt_team_nth));
6710 #endif /* KMP_DFLT_NTH_CORES */
6711   }
6712 
6713   if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
6714     __kmp_dflt_team_nth = KMP_MIN_NTH;
6715   }
6716   if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
6717     __kmp_dflt_team_nth = __kmp_sys_max_nth;
6718   }
6719 
6720   // There's no harm in continuing if the following check fails,
6721   // but it indicates an error in the previous logic.
6722   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
6723 
6724   if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
6725     // Run through the __kmp_threads array and set the num threads icv for each
6726     // root thread that is currently registered with the RTL (which has not
6727     // already explicitly set its nthreads-var with a call to
6728     // omp_set_num_threads()).
6729     for (i = 0; i < __kmp_threads_capacity; i++) {
6730       kmp_info_t *thread = __kmp_threads[i];
6731       if (thread == NULL)
6732         continue;
6733       if (thread->th.th_current_task->td_icvs.nproc != 0)
6734         continue;
6735 
6736       set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
6737     }
6738   }
6739   KA_TRACE(
6740       20,
6741       ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
6742        __kmp_dflt_team_nth));
6743 
6744 #ifdef KMP_ADJUST_BLOCKTIME
6745   /* Adjust blocktime to zero if necessary  now that __kmp_avail_proc is set */
6746   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6747     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6748     if (__kmp_nth > __kmp_avail_proc) {
6749       __kmp_zero_bt = TRUE;
6750     }
6751   }
6752 #endif /* KMP_ADJUST_BLOCKTIME */
6753 
6754   /* we have finished middle initialization */
6755   TCW_SYNC_4(__kmp_init_middle, TRUE);
6756 
6757   KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
6758 }
6759 
6760 void __kmp_middle_initialize(void) {
6761   if (__kmp_init_middle) {
6762     return;
6763   }
6764   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6765   if (__kmp_init_middle) {
6766     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6767     return;
6768   }
6769   __kmp_do_middle_initialize();
6770   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6771 }
6772 
6773 void __kmp_parallel_initialize(void) {
6774   int gtid = __kmp_entry_gtid(); // this might be a new root
6775 
6776   /* synchronize parallel initialization (for sibling) */
6777   if (TCR_4(__kmp_init_parallel))
6778     return;
6779   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6780   if (TCR_4(__kmp_init_parallel)) {
6781     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6782     return;
6783   }
6784 
6785   /* TODO reinitialization after we have already shut down */
6786   if (TCR_4(__kmp_global.g.g_done)) {
6787     KA_TRACE(
6788         10,
6789         ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
6790     __kmp_infinite_loop();
6791   }
6792 
6793   /* jc: The lock __kmp_initz_lock is already held, so calling
6794      __kmp_serial_initialize would cause a deadlock.  So we call
6795      __kmp_do_serial_initialize directly. */
6796   if (!__kmp_init_middle) {
6797     __kmp_do_middle_initialize();
6798   }
6799 
6800   /* begin initialization */
6801   KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
6802   KMP_ASSERT(KMP_UBER_GTID(gtid));
6803 
6804 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6805   // Save the FP control regs.
6806   // Worker threads will set theirs to these values at thread startup.
6807   __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
6808   __kmp_store_mxcsr(&__kmp_init_mxcsr);
6809   __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
6810 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
6811 
6812 #if KMP_OS_UNIX
6813 #if KMP_HANDLE_SIGNALS
6814   /*  must be after __kmp_serial_initialize  */
6815   __kmp_install_signals(TRUE);
6816 #endif
6817 #endif
6818 
6819   __kmp_suspend_initialize();
6820 
6821 #if defined(USE_LOAD_BALANCE)
6822   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6823     __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
6824   }
6825 #else
6826   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6827     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
6828   }
6829 #endif
6830 
6831   if (__kmp_version) {
6832     __kmp_print_version_2();
6833   }
6834 
6835   /* we have finished parallel initialization */
6836   TCW_SYNC_4(__kmp_init_parallel, TRUE);
6837 
6838   KMP_MB();
6839   KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
6840 
6841   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6842 }
6843 
6844 /* ------------------------------------------------------------------------ */
6845 
6846 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
6847                                    kmp_team_t *team) {
6848   kmp_disp_t *dispatch;
6849 
6850   KMP_MB();
6851 
6852   /* none of the threads have encountered any constructs, yet. */
6853   this_thr->th.th_local.this_construct = 0;
6854 #if KMP_CACHE_MANAGE
6855   KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
6856 #endif /* KMP_CACHE_MANAGE */
6857   dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
6858   KMP_DEBUG_ASSERT(dispatch);
6859   KMP_DEBUG_ASSERT(team->t.t_dispatch);
6860   // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
6861   // this_thr->th.th_info.ds.ds_tid ] );
6862 
6863   dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
6864 #if OMP_45_ENABLED
6865   dispatch->th_doacross_buf_idx =
6866       0; /* reset the doacross dispatch buffer counter */
6867 #endif
6868   if (__kmp_env_consistency_check)
6869     __kmp_push_parallel(gtid, team->t.t_ident);
6870 
6871   KMP_MB(); /* Flush all pending memory write invalidates.  */
6872 }
6873 
6874 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
6875                                   kmp_team_t *team) {
6876   if (__kmp_env_consistency_check)
6877     __kmp_pop_parallel(gtid, team->t.t_ident);
6878 
6879   __kmp_finish_implicit_task(this_thr);
6880 }
6881 
6882 int __kmp_invoke_task_func(int gtid) {
6883   int rc;
6884   int tid = __kmp_tid_from_gtid(gtid);
6885   kmp_info_t *this_thr = __kmp_threads[gtid];
6886   kmp_team_t *team = this_thr->th.th_team;
6887 
6888   __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
6889 #if USE_ITT_BUILD
6890   if (__itt_stack_caller_create_ptr) {
6891     __kmp_itt_stack_callee_enter(
6892         (__itt_caller)
6893             team->t.t_stack_id); // inform ittnotify about entering user's code
6894   }
6895 #endif /* USE_ITT_BUILD */
6896 #if INCLUDE_SSC_MARKS
6897   SSC_MARK_INVOKING();
6898 #endif
6899 
6900 #if OMPT_SUPPORT
6901   void *dummy;
6902   void **exit_runtime_p;
6903   ompt_data_t *my_task_data;
6904   ompt_data_t *my_parallel_data;
6905   int ompt_team_size;
6906 
6907   if (ompt_enabled.enabled) {
6908     exit_runtime_p = &(
6909         team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_frame);
6910   } else {
6911     exit_runtime_p = &dummy;
6912   }
6913 
6914   my_task_data =
6915       &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
6916   my_parallel_data = &(team->t.ompt_team_info.parallel_data);
6917   if (ompt_enabled.ompt_callback_implicit_task) {
6918     ompt_team_size = team->t.t_nproc;
6919     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
6920         ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
6921         __kmp_tid_from_gtid(gtid));
6922   }
6923 #endif
6924 
6925   {
6926     KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
6927     KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
6928     rc =
6929         __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
6930                                tid, (int)team->t.t_argc, (void **)team->t.t_argv
6931 #if OMPT_SUPPORT
6932                                ,
6933                                exit_runtime_p
6934 #endif
6935                                );
6936 #if OMPT_SUPPORT
6937     *exit_runtime_p = NULL;
6938 #endif
6939   }
6940 
6941 #if USE_ITT_BUILD
6942   if (__itt_stack_caller_create_ptr) {
6943     __kmp_itt_stack_callee_leave(
6944         (__itt_caller)
6945             team->t.t_stack_id); // inform ittnotify about leaving user's code
6946   }
6947 #endif /* USE_ITT_BUILD */
6948   __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
6949 
6950   return rc;
6951 }
6952 
6953 #if OMP_40_ENABLED
6954 void __kmp_teams_master(int gtid) {
6955   // This routine is called by all master threads in teams construct
6956   kmp_info_t *thr = __kmp_threads[gtid];
6957   kmp_team_t *team = thr->th.th_team;
6958   ident_t *loc = team->t.t_ident;
6959   thr->th.th_set_nproc = thr->th.th_teams_size.nth;
6960   KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
6961   KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
6962   KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
6963                 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
// Launch the league of teams now, but do not let the workers execute
// (they hang on the fork barrier until the next parallel region)
6966 #if INCLUDE_SSC_MARKS
6967   SSC_MARK_FORKING();
6968 #endif
6969   __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
6970                   (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
6971                   VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
6972 #if INCLUDE_SSC_MARKS
6973   SSC_MARK_JOINING();
6974 #endif
6975 
  // AC: last parameter "1" eliminates the join barrier, which won't work here
  // because worker threads are in a fork barrier waiting for more parallel
  // regions
6978   __kmp_join_call(loc, gtid
6979 #if OMPT_SUPPORT
6980                   ,
6981                   fork_context_intel
6982 #endif
6983                   ,
6984                   1);
6985 }
6986 
6987 int __kmp_invoke_teams_master(int gtid) {
6988   kmp_info_t *this_thr = __kmp_threads[gtid];
6989   kmp_team_t *team = this_thr->th.th_team;
6990 #if KMP_DEBUG
6991   if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
6992     KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
6993                      (void *)__kmp_teams_master);
6994 #endif
6995   __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
6996   __kmp_teams_master(gtid);
6997   __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
6998   return 1;
6999 }
7000 #endif /* OMP_40_ENABLED */
7001 
/* This sets the requested number of threads for the next parallel region
   encountered by this team. Since this should be enclosed in the forkjoin
   critical section, it should avoid race conditions with asymmetrical nested
   parallelism. */
7006 
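/* Informal example (illustration only): for

     #pragma omp parallel num_threads(4)

   the compiler typically emits the equivalent of
   __kmpc_push_num_threads(loc, gtid, 4), which lands here right before the
   fork call, so the very next parallel region requests 4 threads. */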
7007 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7008   kmp_info_t *thr = __kmp_threads[gtid];
7009 
7010   if (num_threads > 0)
7011     thr->th.th_set_nproc = num_threads;
7012 }
7013 
7014 #if OMP_40_ENABLED
7015 
/* This sets the requested number of teams for the teams region and/or
   the number of threads for the next parallel region encountered. */
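/* Informal example (illustration only): for

     #pragma omp teams num_teams(2) thread_limit(8)

   the compiler typically emits the equivalent of
   __kmpc_push_num_teams(loc, gtid, 2, 8), which arrives here and records
   nteams = 2 and nth = 8 in th_teams_size for the upcoming teams region
   (subject to the __kmp_teams_max_nth clipping below). */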
7018 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7019                           int num_threads) {
7020   kmp_info_t *thr = __kmp_threads[gtid];
7021   KMP_DEBUG_ASSERT(num_teams >= 0);
7022   KMP_DEBUG_ASSERT(num_threads >= 0);
7023 
7024   if (num_teams == 0)
7025     num_teams = 1; // default number of teams is 1.
  if (num_teams > __kmp_teams_max_nth) { // were too many teams requested?
7027     if (!__kmp_reserve_warn) {
7028       __kmp_reserve_warn = 1;
7029       __kmp_msg(kmp_ms_warning,
7030                 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7031                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7032     }
7033     num_teams = __kmp_teams_max_nth;
7034   }
7035   // Set number of teams (number of threads in the outer "parallel" of the
7036   // teams)
7037   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7038 
7039   // Remember the number of threads for inner parallel regions
7040   if (num_threads == 0) {
7041     if (!TCR_4(__kmp_init_middle))
7042       __kmp_middle_initialize(); // get __kmp_avail_proc calculated
7043     num_threads = __kmp_avail_proc / num_teams;
7044     if (num_teams * num_threads > __kmp_teams_max_nth) {
      // adjust num_threads without a warning since it is not a user setting
7046       num_threads = __kmp_teams_max_nth / num_teams;
7047     }
7048   } else {
7049     if (num_teams * num_threads > __kmp_teams_max_nth) {
7050       int new_threads = __kmp_teams_max_nth / num_teams;
      if (!__kmp_reserve_warn) {
        // The user asked for too many threads; this conflicts with
        // KMP_TEAMS_THREAD_LIMIT.
        __kmp_reserve_warn = 1;
7053         __kmp_msg(kmp_ms_warning,
7054                   KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7055                   KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7056       }
7057       num_threads = new_threads;
7058     }
7059   }
7060   thr->th.th_teams_size.nth = num_threads;
7061 }
7062 
7063 // Set the proc_bind var to use in the following parallel region.
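// Informal example (illustration only): for
//
//   #pragma omp parallel proc_bind(close)
//
// the compiler typically emits the equivalent of
// __kmpc_push_proc_bind(loc, gtid, proc_bind_close) ahead of the fork call,
// which ends up here and stores the bind policy in th_set_proc_bind.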
7064 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7065   kmp_info_t *thr = __kmp_threads[gtid];
7066   thr->th.th_set_proc_bind = proc_bind;
7067 }
7068 
7069 #endif /* OMP_40_ENABLED */
7070 
7071 /* Launch the worker threads into the microtask. */
7072 
7073 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7074   kmp_info_t *this_thr = __kmp_threads[gtid];
7075 
7076 #ifdef KMP_DEBUG
7077   int f;
7078 #endif /* KMP_DEBUG */
7079 
7080   KMP_DEBUG_ASSERT(team);
7081   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7082   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7083   KMP_MB(); /* Flush all pending memory write invalidates.  */
7084 
7085   team->t.t_construct = 0; /* no single directives seen yet */
7086   team->t.t_ordered.dt.t_value =
7087       0; /* thread 0 enters the ordered section first */
7088 
7089   /* Reset the identifiers on the dispatch buffer */
7090   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7091   if (team->t.t_max_nproc > 1) {
7092     int i;
7093     for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7094       team->t.t_disp_buffer[i].buffer_index = i;
7095 #if OMP_45_ENABLED
7096       team->t.t_disp_buffer[i].doacross_buf_idx = i;
7097 #endif
7098     }
7099   } else {
7100     team->t.t_disp_buffer[0].buffer_index = 0;
7101 #if OMP_45_ENABLED
7102     team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7103 #endif
7104   }
7105 
7106   KMP_MB(); /* Flush all pending memory write invalidates.  */
7107   KMP_ASSERT(this_thr->th.th_team == team);
7108 
7109 #ifdef KMP_DEBUG
7110   for (f = 0; f < team->t.t_nproc; f++) {
7111     KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7112                      team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7113   }
7114 #endif /* KMP_DEBUG */
7115 
7116   /* release the worker threads so they may begin working */
7117   __kmp_fork_barrier(gtid, 0);
7118 }
7119 
7120 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7121   kmp_info_t *this_thr = __kmp_threads[gtid];
7122 
7123   KMP_DEBUG_ASSERT(team);
7124   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7125   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7126   KMP_MB(); /* Flush all pending memory write invalidates.  */
7127 
7128 /* Join barrier after fork */
7129 
7130 #ifdef KMP_DEBUG
7131   if (__kmp_threads[gtid] &&
7132       __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7133     __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7134                  __kmp_threads[gtid]);
7135     __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7136                  "team->t.t_nproc=%d\n",
7137                  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7138                  team->t.t_nproc);
7139     __kmp_print_structure();
7140   }
7141   KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7142                    __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7143 #endif /* KMP_DEBUG */
7144 
7145   __kmp_join_barrier(gtid); /* wait for everyone */
7146 #if OMPT_SUPPORT
7147   if (ompt_enabled.enabled &&
7148       this_thr->th.ompt_thread_info.state == omp_state_wait_barrier_implicit) {
7149     int ds_tid = this_thr->th.th_info.ds.ds_tid;
7150     ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7151     this_thr->th.ompt_thread_info.state = omp_state_overhead;
7152 #if OMPT_OPTIONAL
7153     void *codeptr = NULL;
7154     if (KMP_MASTER_TID(ds_tid) &&
7155         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7156          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7157       codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7158 
7159     if (ompt_enabled.ompt_callback_sync_region_wait) {
7160       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7161           ompt_sync_region_barrier, ompt_scope_end, NULL, task_data, codeptr);
7162     }
7163     if (ompt_enabled.ompt_callback_sync_region) {
7164       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7165           ompt_sync_region_barrier, ompt_scope_end, NULL, task_data, codeptr);
7166     }
7167 #endif
7168     if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7169       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7170           ompt_scope_end, NULL, task_data, 0, ds_tid);
7171     }
7172   }
7173 #endif
7174 
7175   KMP_MB(); /* Flush all pending memory write invalidates.  */
7176   KMP_ASSERT(this_thr->th.th_team == team);
7177 }
7178 
7179 /* ------------------------------------------------------------------------ */
7180 
7181 #ifdef USE_LOAD_BALANCE
7182 
// Return the number of worker threads actively spinning in the hot team if we
// are at the outermost level of parallelism. Otherwise, return 0.
7185 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7186   int i;
7187   int retval;
7188   kmp_team_t *hot_team;
7189 
7190   if (root->r.r_active) {
7191     return 0;
7192   }
7193   hot_team = root->r.r_hot_team;
7194   if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7195     return hot_team->t.t_nproc - 1; // Don't count master thread
7196   }
7197 
7198   // Skip the master thread - it is accounted for elsewhere.
7199   retval = 0;
7200   for (i = 1; i < hot_team->t.t_nproc; i++) {
7201     if (hot_team->t.t_threads[i]->th.th_active) {
7202       retval++;
7203     }
7204   }
7205   return retval;
7206 }
7207 
7208 // Perform an automatic adjustment to the number of
7209 // threads used by the next parallel region.
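// Informal worked example (numbers are illustrative only): with
// __kmp_avail_proc = 8, pool_active = 1 and hot_team_active = 1 we get
// team_curr_active = 1 + 1 + 1 = 3; if __kmp_get_load_balance() then reports
// system_active = 6, the result is retval = 8 - 6 + 3 = 5, clipped to the
// range [KMP_MIN_NTH, set_nproc].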
7210 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7211   int retval;
7212   int pool_active;
7213   int hot_team_active;
7214   int team_curr_active;
7215   int system_active;
7216 
7217   KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7218                 set_nproc));
7219   KMP_DEBUG_ASSERT(root);
7220   KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7221                        ->th.th_current_task->td_icvs.dynamic == TRUE);
7222   KMP_DEBUG_ASSERT(set_nproc > 1);
7223 
7224   if (set_nproc == 1) {
7225     KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7226     return 1;
7227   }
7228 
7229   // Threads that are active in the thread pool, active in the hot team for this
7230   // particular root (if we are at the outer par level), and the currently
7231   // executing thread (to become the master) are available to add to the new
7232   // team, but are currently contributing to the system load, and must be
7233   // accounted for.
7234   pool_active = TCR_4(__kmp_thread_pool_active_nth);
7235   hot_team_active = __kmp_active_hot_team_nproc(root);
7236   team_curr_active = pool_active + hot_team_active + 1;
7237 
7238   // Check the system load.
7239   system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7240   KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7241                 "hot team active = %d\n",
7242                 system_active, pool_active, hot_team_active));
7243 
7244   if (system_active < 0) {
7245     // There was an error reading the necessary info from /proc, so use the
7246     // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7247     // = dynamic_thread_limit, we shouldn't wind up getting back here.
7248     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7249     KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7250 
7251     // Make this call behave like the thread limit algorithm.
7252     retval = __kmp_avail_proc - __kmp_nth +
7253              (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7254     if (retval > set_nproc) {
7255       retval = set_nproc;
7256     }
7257     if (retval < KMP_MIN_NTH) {
7258       retval = KMP_MIN_NTH;
7259     }
7260 
7261     KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7262                   retval));
7263     return retval;
7264   }
7265 
  // There is a slight delay in the load balance algorithm in detecting newly
  // running procs. The real system load at this instant should be at least as
  // large as the number of active OMP threads available to add to the team.
7269   if (system_active < team_curr_active) {
7270     system_active = team_curr_active;
7271   }
7272   retval = __kmp_avail_proc - system_active + team_curr_active;
7273   if (retval > set_nproc) {
7274     retval = set_nproc;
7275   }
7276   if (retval < KMP_MIN_NTH) {
7277     retval = KMP_MIN_NTH;
7278   }
7279 
7280   KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7281   return retval;
7282 } // __kmp_load_balance_nproc()
7283 
7284 #endif /* USE_LOAD_BALANCE */
7285 
7286 /* ------------------------------------------------------------------------ */
7287 
7288 /* NOTE: this is called with the __kmp_init_lock held */
7289 void __kmp_cleanup(void) {
7290   int f;
7291 
7292   KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7293 
7294   if (TCR_4(__kmp_init_parallel)) {
7295 #if KMP_HANDLE_SIGNALS
7296     __kmp_remove_signals();
7297 #endif
7298     TCW_4(__kmp_init_parallel, FALSE);
7299   }
7300 
7301   if (TCR_4(__kmp_init_middle)) {
7302 #if KMP_AFFINITY_SUPPORTED
7303     __kmp_affinity_uninitialize();
7304 #endif /* KMP_AFFINITY_SUPPORTED */
7305     __kmp_cleanup_hierarchy();
7306     TCW_4(__kmp_init_middle, FALSE);
7307   }
7308 
7309   KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7310 
7311   if (__kmp_init_serial) {
7312     __kmp_runtime_destroy();
7313     __kmp_init_serial = FALSE;
7314   }
7315 
7316   __kmp_cleanup_threadprivate_caches();
7317 
7318   for (f = 0; f < __kmp_threads_capacity; f++) {
7319     if (__kmp_root[f] != NULL) {
7320       __kmp_free(__kmp_root[f]);
7321       __kmp_root[f] = NULL;
7322     }
7323   }
7324   __kmp_free(__kmp_threads);
  // __kmp_threads and __kmp_root were allocated at once, as a single block,
  // so there is no need to free __kmp_root separately.
7327   __kmp_threads = NULL;
7328   __kmp_root = NULL;
7329   __kmp_threads_capacity = 0;
7330 
7331 #if KMP_USE_DYNAMIC_LOCK
7332   __kmp_cleanup_indirect_user_locks();
7333 #else
7334   __kmp_cleanup_user_locks();
7335 #endif
7336 
7337 #if KMP_AFFINITY_SUPPORTED
7338   KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7339   __kmp_cpuinfo_file = NULL;
7340 #endif /* KMP_AFFINITY_SUPPORTED */
7341 
7342 #if KMP_USE_ADAPTIVE_LOCKS
7343 #if KMP_DEBUG_ADAPTIVE_LOCKS
7344   __kmp_print_speculative_stats();
7345 #endif
7346 #endif
7347   KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7348   __kmp_nested_nth.nth = NULL;
7349   __kmp_nested_nth.size = 0;
7350   __kmp_nested_nth.used = 0;
7351   KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7352   __kmp_nested_proc_bind.bind_types = NULL;
7353   __kmp_nested_proc_bind.size = 0;
7354   __kmp_nested_proc_bind.used = 0;
7355 
7356   __kmp_i18n_catclose();
7357 
7358 #if KMP_STATS_ENABLED
7359   __kmp_stats_fini();
7360 #endif
7361 
7362   KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7363 }
7364 
7365 /* ------------------------------------------------------------------------ */
7366 
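/* Informal note (illustration only): the two helpers below consult the
   KMP_IGNORE_MPPBEG / KMP_IGNORE_MPPEND environment variables. For example,
   running with

     KMP_IGNORE_MPPBEG=false ./app

   is expected to make __kmp_ignore_mppbeg() return FALSE, so that
   __kmpc_begin() performs its work instead of being treated as a no-op. */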
7367 int __kmp_ignore_mppbeg(void) {
7368   char *env;
7369 
7370   if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7371     if (__kmp_str_match_false(env))
7372       return FALSE;
7373   }
  // By default __kmpc_begin() is a no-op.
7375   return TRUE;
7376 }
7377 
7378 int __kmp_ignore_mppend(void) {
7379   char *env;
7380 
7381   if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7382     if (__kmp_str_match_false(env))
7383       return FALSE;
7384   }
  // By default __kmpc_end() is a no-op.
7386   return TRUE;
7387 }
7388 
7389 void __kmp_internal_begin(void) {
7390   int gtid;
7391   kmp_root_t *root;
7392 
7393   /* this is a very important step as it will register new sibling threads
7394      and assign these new uber threads a new gtid */
7395   gtid = __kmp_entry_gtid();
7396   root = __kmp_threads[gtid]->th.th_root;
7397   KMP_ASSERT(KMP_UBER_GTID(gtid));
7398 
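  // Double-checked locking: test r_begin without the lock first (fast path),
  // then re-test under r_begin_lock so only one thread marks this root as
  // begun.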
7399   if (root->r.r_begin)
7400     return;
7401   __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7402   if (root->r.r_begin) {
7403     __kmp_release_lock(&root->r.r_begin_lock, gtid);
7404     return;
7405   }
7406 
7407   root->r.r_begin = TRUE;
7408 
7409   __kmp_release_lock(&root->r.r_begin_lock, gtid);
7410 }
7411 
7412 /* ------------------------------------------------------------------------ */
7413 
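/* Informal example (illustration only): a user-level call such as

     kmp_set_library_throughput();

   typically routes into __kmp_user_set_library(library_throughput) below;
   the KMP_LIBRARY=throughput environment setting selects the same execution
   mode at startup. */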
7414 void __kmp_user_set_library(enum library_type arg) {
7415   int gtid;
7416   kmp_root_t *root;
7417   kmp_info_t *thread;
7418 
7419   /* first, make sure we are initialized so we can get our gtid */
7420 
7421   gtid = __kmp_entry_gtid();
7422   thread = __kmp_threads[gtid];
7423 
7424   root = thread->th.th_root;
7425 
7426   KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7427                 library_serial));
7428   if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7429                                   thread */
7430     KMP_WARNING(SetLibraryIncorrectCall);
7431     return;
7432   }
7433 
7434   switch (arg) {
7435   case library_serial:
7436     thread->th.th_set_nproc = 0;
7437     set__nproc(thread, 1);
7438     break;
7439   case library_turnaround:
7440     thread->th.th_set_nproc = 0;
7441     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7442                                            : __kmp_dflt_team_nth_ub);
7443     break;
7444   case library_throughput:
7445     thread->th.th_set_nproc = 0;
7446     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7447                                            : __kmp_dflt_team_nth_ub);
7448     break;
7449   default:
7450     KMP_FATAL(UnknownLibraryType, arg);
7451   }
7452 
7453   __kmp_aux_set_library(arg);
7454 }
7455 
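/* Informal example (illustration only): a user-level call such as

     kmp_set_stacksize_s(4 * 1024 * 1024);

   typically routes into __kmp_aux_set_stacksize() below; the request only
   takes effect before the first parallel region, and the value is clipped to
   the range [__kmp_sys_min_stksize, KMP_MAX_STKSIZE]. */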
7456 void __kmp_aux_set_stacksize(size_t arg) {
7457   if (!__kmp_init_serial)
7458     __kmp_serial_initialize();
7459 
7460 #if KMP_OS_DARWIN
7461   if (arg & (0x1000 - 1)) {
7462     arg &= ~(0x1000 - 1);
7463     if (arg + 0x1000) /* check for overflow if we round up */
7464       arg += 0x1000;
7465   }
7466 #endif
7467   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7468 
7469   /* only change the default stacksize before the first parallel region */
7470   if (!TCR_4(__kmp_init_parallel)) {
7471     size_t value = arg; /* argument is in bytes */
7472 
7473     if (value < __kmp_sys_min_stksize)
7474       value = __kmp_sys_min_stksize;
7475     else if (value > KMP_MAX_STKSIZE)
7476       value = KMP_MAX_STKSIZE;
7477 
7478     __kmp_stksize = value;
7479 
7480     __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
7481   }
7482 
7483   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7484 }
7485 
7486 /* set the behaviour of the runtime library */
7487 /* TODO this can cause some odd behaviour with sibling parallelism... */
7488 void __kmp_aux_set_library(enum library_type arg) {
7489   __kmp_library = arg;
7490 
7491   switch (__kmp_library) {
7492   case library_serial: {
7493     KMP_INFORM(LibraryIsSerial);
7494     (void)__kmp_change_library(TRUE);
7495   } break;
7496   case library_turnaround:
7497     (void)__kmp_change_library(TRUE);
7498     break;
7499   case library_throughput:
7500     (void)__kmp_change_library(FALSE);
7501     break;
7502   default:
7503     KMP_FATAL(UnknownLibraryType, arg);
7504   }
7505 }
7506 
7507 /* ------------------------------------------------------------------------ */
7508 
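/* Informal example (illustration only): a user-level call such as

     kmp_set_blocktime(0); // let idle threads sleep immediately

   typically routes into __kmp_aux_set_blocktime(0, thread, tid) below, as
   does KMP_BLOCKTIME=0 at startup; the value is clamped to the range
   [KMP_MIN_BLOCKTIME, KMP_MAX_BLOCKTIME]. */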
7509 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
7510   int blocktime = arg; /* argument is in milliseconds */
7511 #if KMP_USE_MONITOR
7512   int bt_intervals;
7513 #endif
7514   int bt_set;
7515 
7516   __kmp_save_internal_controls(thread);
7517 
7518   /* Normalize and set blocktime for the teams */
7519   if (blocktime < KMP_MIN_BLOCKTIME)
7520     blocktime = KMP_MIN_BLOCKTIME;
7521   else if (blocktime > KMP_MAX_BLOCKTIME)
7522     blocktime = KMP_MAX_BLOCKTIME;
7523 
7524   set__blocktime_team(thread->th.th_team, tid, blocktime);
7525   set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
7526 
7527 #if KMP_USE_MONITOR
7528   /* Calculate and set blocktime intervals for the teams */
7529   bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
7530 
7531   set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
7532   set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
7533 #endif
7534 
  /* Record that blocktime has been explicitly set */
7536   bt_set = TRUE;
7537 
7538   set__bt_set_team(thread->th.th_team, tid, bt_set);
7539   set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
7540 #if KMP_USE_MONITOR
7541   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
7542                 "bt_intervals=%d, monitor_updates=%d\n",
7543                 __kmp_gtid_from_tid(tid, thread->th.th_team),
7544                 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
7545                 __kmp_monitor_wakeups));
7546 #else
7547   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
7548                 __kmp_gtid_from_tid(tid, thread->th.th_team),
7549                 thread->th.th_team->t.t_id, tid, blocktime));
7550 #endif
7551 }
7552 
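/* Informal example (illustration only): a user-level call such as

     kmp_set_defaults("KMP_BLOCKTIME=0");

   typically routes into __kmp_aux_set_defaults() below, which re-runs
   environment-style initialization on the supplied string. */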
7553 void __kmp_aux_set_defaults(char const *str, int len) {
7554   if (!__kmp_init_serial) {
7555     __kmp_serial_initialize();
7556   }
7557   __kmp_env_initialize(str);
7558 
7559   if (__kmp_settings
7560 #if OMP_40_ENABLED
7561       || __kmp_display_env || __kmp_display_env_verbose
7562 #endif // OMP_40_ENABLED
7563       ) {
7564     __kmp_env_print();
7565   }
7566 } // __kmp_aux_set_defaults
7567 
7568 /* ------------------------------------------------------------------------ */
7569 /* internal fast reduction routines */
7570 
7571 PACKED_REDUCTION_METHOD_T
7572 __kmp_determine_reduction_method(
7573     ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
7574     void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
7575     kmp_critical_name *lck) {
7576 
  // Default reduction method: critical construct (lck != NULL, as in the
  // current PAROPT).
  // If (reduce_data != NULL && reduce_func != NULL), the tree-reduction method
  // can be selected by the RTL.
  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
  // can be selected by the RTL.
  // Finally, it is up to the OpenMP RTL to decide which of the methods
  // generated by PAROPT to select.
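  // Informal illustration (follows the tuning below, not authoritative): on an
  // x86_64 Linux host with both the atomic and tree methods generated, a team
  // of 4 threads (at or below the cutoff) would pick atomic_reduce_block,
  // while a team of 16 would pick TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
  // a serialized team (team_size == 1) always gets empty_reduce_block.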
7585 
7586   PACKED_REDUCTION_METHOD_T retval;
7587 
7588   int team_size;
7589 
7590   KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
7591   KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
7592 
7593 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
7594   ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
7595 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
7596 
7597   retval = critical_reduce_block;
7598 
  // Another way of getting the team size (with one dynamic dereference) is
  // slower.
7600   team_size = __kmp_get_team_num_threads(global_tid);
7601   if (team_size == 1) {
7602 
7603     retval = empty_reduce_block;
7604 
7605   } else {
7606 
7607     int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
7608     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
7609 
7610 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64
7611 
7612 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_WINDOWS ||       \
7613     KMP_OS_DARWIN
7614 
7615     int teamsize_cutoff = 4;
7616 
7617 #if KMP_MIC_SUPPORTED
7618     if (__kmp_mic_type != non_mic) {
7619       teamsize_cutoff = 8;
7620     }
7621 #endif
7622     if (tree_available) {
7623       if (team_size <= teamsize_cutoff) {
7624         if (atomic_available) {
7625           retval = atomic_reduce_block;
7626         }
7627       } else {
7628         retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
7629       }
7630     } else if (atomic_available) {
7631       retval = atomic_reduce_block;
7632     }
7633 #else
7634 #error "Unknown or unsupported OS"
7635 #endif // KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_WINDOWS ||
7636 // KMP_OS_DARWIN
7637 
7638 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
7639 
7640 #if KMP_OS_LINUX || KMP_OS_WINDOWS
7641 
7642     // basic tuning
7643 
7644     if (atomic_available) {
7645       if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
7646         retval = atomic_reduce_block;
7647       }
7648     } // otherwise: use critical section
7649 
7650 #elif KMP_OS_DARWIN
7651 
7652     if (atomic_available && (num_vars <= 3)) {
7653       retval = atomic_reduce_block;
7654     } else if (tree_available) {
7655       if ((reduce_size > (9 * sizeof(kmp_real64))) &&
7656           (reduce_size < (2000 * sizeof(kmp_real64)))) {
7657         retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
7658       }
7659     } // otherwise: use critical section
7660 
7661 #else
7662 #error "Unknown or unsupported OS"
7663 #endif
7664 
7665 #else
7666 #error "Unknown or unsupported architecture"
7667 #endif
7668   }
7669 
7670   // KMP_FORCE_REDUCTION
7671 
7672   // If the team is serialized (team_size == 1), ignore the forced reduction
7673   // method and stay with the unsynchronized method (empty_reduce_block)
7674   if (__kmp_force_reduction_method != reduction_method_not_defined &&
7675       team_size != 1) {
7676 
7677     PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
7678 
7679     int atomic_available, tree_available;
7680 
7681     switch ((forced_retval = __kmp_force_reduction_method)) {
7682     case critical_reduce_block:
7683       KMP_ASSERT(lck); // lck should be != 0
7684       break;
7685 
7686     case atomic_reduce_block:
7687       atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
7688       if (!atomic_available) {
7689         KMP_WARNING(RedMethodNotSupported, "atomic");
7690         forced_retval = critical_reduce_block;
7691       }
7692       break;
7693 
7694     case tree_reduce_block:
7695       tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
7696       if (!tree_available) {
7697         KMP_WARNING(RedMethodNotSupported, "tree");
7698         forced_retval = critical_reduce_block;
7699       } else {
7700 #if KMP_FAST_REDUCTION_BARRIER
7701         forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
7702 #endif
7703       }
7704       break;
7705 
7706     default:
7707       KMP_ASSERT(0); // "unsupported method specified"
7708     }
7709 
7710     retval = forced_retval;
7711   }
7712 
7713   KA_TRACE(10, ("reduction method selected=%08x\n", retval));
7714 
7715 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
7716 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
7717 
7718   return (retval);
7719 }
7720 
// This function is for testing the set/get/determine reduce method machinery.
7722 kmp_int32 __kmp_get_reduce_method(void) {
7723   return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
7724 }
7725