1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 //                     The LLVM Compiler Infrastructure
8 //
9 // This file is dual licensed under the MIT and the University of Illinois Open
10 // Source Licenses. See LICENSE.txt for details.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "kmp.h"
15 #include "kmp_affinity.h"
16 #include "kmp_atomic.h"
17 #include "kmp_environment.h"
18 #include "kmp_error.h"
19 #include "kmp_i18n.h"
20 #include "kmp_io.h"
21 #include "kmp_itt.h"
22 #include "kmp_settings.h"
23 #include "kmp_stats.h"
24 #include "kmp_str.h"
25 #include "kmp_wait_release.h"
26 #include "kmp_wrapper_getpid.h"
27 
28 #if OMPT_SUPPORT
29 #include "ompt-specific.h"
30 #endif
31 
32 /* these are temporary issues to be dealt with */
33 #define KMP_USE_PRCTL 0
34 
35 #if KMP_OS_WINDOWS
36 #include <process.h>
37 #endif
38 
39 #include "tsan_annotations.h"
40 
41 #if defined(KMP_GOMP_COMPAT)
42 char const __kmp_version_alt_comp[] =
43     KMP_VERSION_PREFIX "alternative compiler support: yes";
44 #endif /* defined(KMP_GOMP_COMPAT) */
45 
46 char const __kmp_version_omp_api[] = KMP_VERSION_PREFIX "API version: "
47 #if OMP_50_ENABLED
48                                                         "5.0 (201611)";
49 #elif OMP_45_ENABLED
50                                                         "4.5 (201511)";
51 #elif OMP_40_ENABLED
52                                                         "4.0 (201307)";
53 #else
54                                                         "3.1 (201107)";
55 #endif
56 
57 #ifdef KMP_DEBUG
58 char const __kmp_version_lock[] =
59     KMP_VERSION_PREFIX "lock type: run time selectable";
60 #endif /* KMP_DEBUG */
61 
62 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
63 
64 /* ------------------------------------------------------------------------ */
65 
66 kmp_info_t __kmp_monitor;
67 
68 /* Forward declarations */
69 
70 void __kmp_cleanup(void);
71 
72 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
73                                   int gtid);
74 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
75                                   kmp_internal_control_t *new_icvs,
76                                   ident_t *loc);
77 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
78 static void __kmp_partition_places(kmp_team_t *team,
79                                    int update_master_only = 0);
80 #endif
81 static void __kmp_do_serial_initialize(void);
82 void __kmp_fork_barrier(int gtid, int tid);
83 void __kmp_join_barrier(int gtid);
84 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
85                           kmp_internal_control_t *new_icvs, ident_t *loc);
86 
87 #ifdef USE_LOAD_BALANCE
88 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
89 #endif
90 
91 static int __kmp_expand_threads(int nWish, int nNeed);
92 #if KMP_OS_WINDOWS
93 static int __kmp_unregister_root_other_thread(int gtid);
94 #endif
95 static void __kmp_unregister_library(void); // called by __kmp_internal_end()
96 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
97 static kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
98 
99 /* Calculate the identifier of the current thread */
/* A fast (and somewhat portable) way to get a unique identifier for the
   executing thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
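/* Three lookup strategies are used, in decreasing order of speed: when
   __kmp_gtid_mode >= 3 the gtid is read from a thread-local variable
   (KMP_TDATA_GTID); when __kmp_gtid_mode >= 2 it is read from keyed TLS via
   __kmp_gtid_get_specific(); otherwise it is recovered by matching the
   current stack address against the recorded stack window of each registered
   thread. */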
102 int __kmp_get_global_thread_id() {
103   int i;
104   kmp_info_t **other_threads;
105   size_t stack_data;
106   char *stack_addr;
107   size_t stack_size;
108   char *stack_base;
109 
110   KA_TRACE(
111       1000,
112       ("*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
113        __kmp_nth, __kmp_all_nth));
114 
  /* JPH - To handle the case where __kmpc_end(0) is called immediately prior
     to a parallel region, this returns KMP_GTID_DNE to force the caller to
     run serial_initialize. KMP_GTID_DNE must then be handled at all call
     sites, or else __kmp_init_gtid must be guaranteed, for this to work. */
119 
120   if (!TCR_4(__kmp_init_gtid))
121     return KMP_GTID_DNE;
122 
123 #ifdef KMP_TDATA_GTID
124   if (TCR_4(__kmp_gtid_mode) >= 3) {
125     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
126     return __kmp_gtid;
127   }
128 #endif
129   if (TCR_4(__kmp_gtid_mode) >= 2) {
130     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
131     return __kmp_gtid_get_specific();
132   }
133   KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
134 
135   stack_addr = (char *)&stack_data;
136   other_threads = __kmp_threads;
137 
138   /* ATT: The code below is a source of potential bugs due to unsynchronized
139      access to __kmp_threads array. For example:
140      1. Current thread loads other_threads[i] to thr and checks it, it is
141         non-NULL.
142      2. Current thread is suspended by OS.
143      3. Another thread unregisters and finishes (debug versions of free()
144         may fill memory with something like 0xEF).
145      4. Current thread is resumed.
146      5. Current thread reads junk from *thr.
147      TODO: Fix it.  --ln  */
148 
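  /* A registered thread owns the stack window
     [ds_stackbase - ds_stacksize, ds_stackbase] (the stack grows down), so if
     the current stack address falls inside a thread's window, we must be
     running on that thread. */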
149   for (i = 0; i < __kmp_threads_capacity; i++) {
150 
151     kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
152     if (!thr)
153       continue;
154 
155     stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
156     stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
157 
158     /* stack grows down -- search through all of the active threads */
159 
160     if (stack_addr <= stack_base) {
161       size_t stack_diff = stack_base - stack_addr;
162 
163       if (stack_diff <= stack_size) {
        /* The only way we can be closer than the allocated stack size is if
           we are running on this thread. */
166         KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
167         return i;
168       }
169     }
170   }
171 
  /* use keyed TLS (get_specific) to try to determine our gtid */
173   KA_TRACE(1000,
174            ("*** __kmp_get_global_thread_id: internal alg. failed to find "
175             "thread, using TLS\n"));
176   i = __kmp_gtid_get_specific();
177 
178   /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */
179 
  /* if we haven't been assigned a gtid, then return the error code */
181   if (i < 0)
182     return i;
183 
184   /* dynamically updated stack window for uber threads to avoid get_specific
185      call */
186   if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
187     KMP_FATAL(StackOverflow, i);
188   }
189 
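  /* We were found only via keyed TLS, so widen this uber thread's recorded
     stack window to cover the current stack address; later calls can then
     resolve the gtid via the stack search, avoiding the get_specific call. */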
190   stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
191   if (stack_addr > stack_base) {
192     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
193     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
194             other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
195                 stack_base);
196   } else {
197     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
198             stack_base - stack_addr);
199   }
200 
201   /* Reprint stack bounds for ubermaster since they have been refined */
202   if (__kmp_storage_map) {
203     char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
204     char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
205     __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
206                                  other_threads[i]->th.th_info.ds.ds_stacksize,
207                                  "th_%d stack (refinement)", i);
208   }
209   return i;
210 }
211 
212 int __kmp_get_global_thread_id_reg() {
213   int gtid;
214 
215   if (!__kmp_init_serial) {
216     gtid = KMP_GTID_DNE;
217   } else
218 #ifdef KMP_TDATA_GTID
219       if (TCR_4(__kmp_gtid_mode) >= 3) {
220     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
221     gtid = __kmp_gtid;
222   } else
223 #endif
224       if (TCR_4(__kmp_gtid_mode) >= 2) {
225     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
226     gtid = __kmp_gtid_get_specific();
227   } else {
228     KA_TRACE(1000,
229              ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
230     gtid = __kmp_get_global_thread_id();
231   }
232 
233   /* we must be a new uber master sibling thread */
234   if (gtid == KMP_GTID_DNE) {
235     KA_TRACE(10,
236              ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
237               "Registering a new gtid.\n"));
238     __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
239     if (!__kmp_init_serial) {
240       __kmp_do_serial_initialize();
241       gtid = __kmp_gtid_get_specific();
242     } else {
243       gtid = __kmp_register_root(FALSE);
244     }
245     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
246     /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
247   }
248 
249   KMP_DEBUG_ASSERT(gtid >= 0);
250 
251   return gtid;
252 }
253 
254 /* caller must hold forkjoin_lock */
255 void __kmp_check_stack_overlap(kmp_info_t *th) {
256   int f;
257   char *stack_beg = NULL;
258   char *stack_end = NULL;
259   int gtid;
260 
261   KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
262   if (__kmp_storage_map) {
263     stack_end = (char *)th->th.th_info.ds.ds_stackbase;
264     stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
265 
266     gtid = __kmp_gtid_from_thread(th);
267 
268     if (gtid == KMP_GTID_MONITOR) {
269       __kmp_print_storage_map_gtid(
270           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
271           "th_%s stack (%s)", "mon",
272           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
273     } else {
274       __kmp_print_storage_map_gtid(
275           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
276           "th_%d stack (%s)", gtid,
277           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
278     }
279   }
280 
281   /* No point in checking ubermaster threads since they use refinement and
282    * cannot overlap */
283   gtid = __kmp_gtid_from_thread(th);
284   if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
285     KA_TRACE(10,
286              ("__kmp_check_stack_overlap: performing extensive checking\n"));
287     if (stack_beg == NULL) {
288       stack_end = (char *)th->th.th_info.ds.ds_stackbase;
289       stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
290     }
291 
292     for (f = 0; f < __kmp_threads_capacity; f++) {
293       kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
294 
295       if (f_th && f_th != th) {
296         char *other_stack_end =
297             (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
298         char *other_stack_beg =
299             other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
300         if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
301             (stack_end > other_stack_beg && stack_end < other_stack_end)) {
302 
303           /* Print the other stack values before the abort */
304           if (__kmp_storage_map)
305             __kmp_print_storage_map_gtid(
306                 -1, other_stack_beg, other_stack_end,
307                 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
308                 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
309 
310           __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
311                       __kmp_msg_null);
312         }
313       }
314     }
315   }
316   KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
317 }
318 
319 /* ------------------------------------------------------------------------ */
320 
321 void __kmp_infinite_loop(void) {
322   static int done = FALSE;
323 
324   while (!done) {
325     KMP_YIELD(1);
326   }
327 }
328 
329 #define MAX_MESSAGE 512
330 
331 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
332                                   char const *format, ...) {
333   char buffer[MAX_MESSAGE];
334   va_list ap;
335 
336   va_start(ap, format);
337   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
338                p2, (unsigned long)size, format);
339   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
340   __kmp_vprintf(kmp_err, buffer, ap);
341 #if KMP_PRINT_DATA_PLACEMENT
342   int node;
343   if (gtid >= 0) {
344     if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
345       if (__kmp_storage_map_verbose) {
346         node = __kmp_get_host_node(p1);
347         if (node < 0) /* doesn't work, so don't try this next time */
348           __kmp_storage_map_verbose = FALSE;
349         else {
350           char *last;
351           int lastNode;
352           int localProc = __kmp_get_cpu_from_gtid(gtid);
353 
354           const int page_size = KMP_GET_PAGE_SIZE();
355 
356           p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
357           p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
358           if (localProc >= 0)
359             __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
360                                  localProc >> 1);
361           else
362             __kmp_printf_no_lock("  GTID %d\n", gtid);
363 #if KMP_USE_PRCTL
364           /* The more elaborate format is disabled for now because of the prctl
365            * hanging bug. */
366           do {
367             last = p1;
368             lastNode = node;
369             /* This loop collates adjacent pages with the same host node. */
370             do {
              p1 = (char *)p1 + page_size;
372             } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
373             __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
374                                  lastNode);
375           } while (p1 <= p2);
376 #else
377           __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
378                                (char *)p1 + (page_size - 1),
379                                __kmp_get_host_node(p1));
380           if (p1 < p2) {
381             __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
382                                  (char *)p2 + (page_size - 1),
383                                  __kmp_get_host_node(p2));
384           }
385 #endif
386         }
387       }
388     } else
389       __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
390   }
391 #endif /* KMP_PRINT_DATA_PLACEMENT */
  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
  va_end(ap);
}
394 
395 void __kmp_warn(char const *format, ...) {
396   char buffer[MAX_MESSAGE];
397   va_list ap;
398 
399   if (__kmp_generate_warnings == kmp_warnings_off) {
400     return;
401   }
402 
403   va_start(ap, format);
404 
405   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
406   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
407   __kmp_vprintf(kmp_err, buffer, ap);
408   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
409 
410   va_end(ap);
411 }
412 
413 void __kmp_abort_process() {
414   // Later threads may stall here, but that's ok because abort() will kill them.
415   __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
416 
417   if (__kmp_debug_buf) {
418     __kmp_dump_debug_buffer();
419   }
420 
421   if (KMP_OS_WINDOWS) {
422     // Let other threads know of abnormal termination and prevent deadlock
423     // if abort happened during library initialization or shutdown
424     __kmp_global.g.g_abort = SIGABRT;
425 
    /* On Windows* OS, abort() by default causes a pop-up error box, which
       stalls nightly testing. Unfortunately, we cannot reliably suppress these
       pop-up error boxes. _set_abort_behavior() works well, but this function
       is not available in VS7 (this is not a problem for the DLL, but it is a
       problem for the static OpenMP RTL). SetErrorMode (and so the timelimit
       utility) does not help, at least in some versions of the MS C RTL.

       It seems the following sequence is the only way to simulate abort() and
       avoid the pop-up error box. */
435     raise(SIGABRT);
436     _exit(3); // Just in case, if signal ignored, exit anyway.
437   } else {
438     abort();
439   }
440 
441   __kmp_infinite_loop();
442   __kmp_release_bootstrap_lock(&__kmp_exit_lock);
443 
444 } // __kmp_abort_process
445 
446 void __kmp_abort_thread(void) {
447   // TODO: Eliminate g_abort global variable and this function.
448   // In case of abort just call abort(), it will kill all the threads.
449   __kmp_infinite_loop();
450 } // __kmp_abort_thread
451 
452 /* Print out the storage map for the major kmp_info_t thread data structures
453    that are allocated together. */
454 
455 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
456   __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
457                                gtid);
458 
459   __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
460                                sizeof(kmp_desc_t), "th_%d.th_info", gtid);
461 
462   __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
463                                sizeof(kmp_local_t), "th_%d.th_local", gtid);
464 
465   __kmp_print_storage_map_gtid(
466       gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
467       sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
468 
469   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
470                                &thr->th.th_bar[bs_plain_barrier + 1],
471                                sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
472                                gtid);
473 
474   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
475                                &thr->th.th_bar[bs_forkjoin_barrier + 1],
476                                sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
477                                gtid);
478 
479 #if KMP_FAST_REDUCTION_BARRIER
480   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
481                                &thr->th.th_bar[bs_reduction_barrier + 1],
482                                sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
483                                gtid);
484 #endif // KMP_FAST_REDUCTION_BARRIER
485 }
486 
487 /* Print out the storage map for the major kmp_team_t team data structures
488    that are allocated together. */
489 
490 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
491                                          int team_id, int num_thr) {
492   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
493   __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
494                                header, team_id);
495 
496   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
497                                &team->t.t_bar[bs_last_barrier],
498                                sizeof(kmp_balign_team_t) * bs_last_barrier,
499                                "%s_%d.t_bar", header, team_id);
500 
501   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
502                                &team->t.t_bar[bs_plain_barrier + 1],
503                                sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
504                                header, team_id);
505 
506   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
507                                &team->t.t_bar[bs_forkjoin_barrier + 1],
508                                sizeof(kmp_balign_team_t),
509                                "%s_%d.t_bar[forkjoin]", header, team_id);
510 
511 #if KMP_FAST_REDUCTION_BARRIER
512   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
513                                &team->t.t_bar[bs_reduction_barrier + 1],
514                                sizeof(kmp_balign_team_t),
515                                "%s_%d.t_bar[reduction]", header, team_id);
516 #endif // KMP_FAST_REDUCTION_BARRIER
517 
518   __kmp_print_storage_map_gtid(
519       -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
520       sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
521 
522   __kmp_print_storage_map_gtid(
523       -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
524       sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
525 
526   __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
527                                &team->t.t_disp_buffer[num_disp_buff],
528                                sizeof(dispatch_shared_info_t) * num_disp_buff,
529                                "%s_%d.t_disp_buffer", header, team_id);
530 
531   __kmp_print_storage_map_gtid(-1, &team->t.t_taskq, &team->t.t_copypriv_data,
532                                sizeof(kmp_taskq_t), "%s_%d.t_taskq", header,
533                                team_id);
534 }
535 
536 static void __kmp_init_allocator() {}
537 static void __kmp_fini_allocator() {}
538 
539 /* ------------------------------------------------------------------------ */
540 
541 #ifdef KMP_DYNAMIC_LIB
542 #if KMP_OS_WINDOWS
543 
544 static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
545   // TODO: Change to __kmp_break_bootstrap_lock().
546   __kmp_init_bootstrap_lock(lck); // make the lock released
547 }
548 
549 static void __kmp_reset_locks_on_process_detach(int gtid_req) {
550   int i;
551   int thread_count;
552 
  // PROCESS_DETACH is expected to be called by a thread that executes
  // ProcessExit() or FreeLibrary(). The OS terminates the other threads
  // (except the one calling ProcessExit or FreeLibrary), so it might seem safe
  // to access __kmp_threads[] without taking the forkjoin_lock. In fact,
  // however, some threads may still be alive here, although they are about to
  // be terminated; the entries in the array with ds_thread==0 are the most
  // suspicious. So it may not actually be safe to access __kmp_threads[].
560 
561   // TODO: does it make sense to check __kmp_roots[] ?
562 
  // Wait until no other threads registered with the OpenMP library are still
  // alive.
565   while (1) {
566     thread_count = 0;
567     for (i = 0; i < __kmp_threads_capacity; ++i) {
568       if (!__kmp_threads)
569         continue;
570       kmp_info_t *th = __kmp_threads[i];
571       if (th == NULL)
572         continue;
573       int gtid = th->th.th_info.ds.ds_gtid;
574       if (gtid == gtid_req)
575         continue;
576       if (gtid < 0)
577         continue;
578       DWORD exit_val;
579       int alive = __kmp_is_thread_alive(th, &exit_val);
580       if (alive) {
581         ++thread_count;
582       }
583     }
584     if (thread_count == 0)
585       break; // success
586   }
587 
588   // Assume that I'm alone. Now it might be safe to check and reset locks.
589   // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
590   __kmp_reset_lock(&__kmp_forkjoin_lock);
591 #ifdef KMP_DEBUG
592   __kmp_reset_lock(&__kmp_stdio_lock);
593 #endif // KMP_DEBUG
594 }
595 
596 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
597   //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
598 
599   switch (fdwReason) {
600 
601   case DLL_PROCESS_ATTACH:
602     KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
603 
604     return TRUE;
605 
606   case DLL_PROCESS_DETACH:
607     KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
608 
609     if (lpReserved != NULL) {
      // lpReserved distinguishes the two cases:
      //   lpReserved == NULL when FreeLibrary() was called,
      //   lpReserved != NULL when the process terminates.
613       // When FreeLibrary() is called, worker threads remain alive. So they will
614       // release the forkjoin lock by themselves. When the process terminates,
615       // worker threads disappear triggering the problem of unreleased forkjoin
616       // lock as described below.
617 
618       // A worker thread can take the forkjoin lock. The problem comes up if
619       // that worker thread becomes dead before it releases the forkjoin lock.
620       // The forkjoin lock remains taken, while the thread executing
621       // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try
622       // to take the forkjoin lock and will always fail, so that the application
623       // will never finish [normally]. This scenario is possible if
      // __kmpc_end() has not been executed. This is not just a corner case;
      // common causes are:
      // - the main function was compiled by an alternative compiler;
      // - the main function was compiled by icl but without /Qopenmp
      //   (application with plugins);
      // - the application terminates by calling C exit(), Fortran CALL EXIT(),
      //   or Fortran STOP;
      // - an alive foreign thread prevented __kmpc_end from doing cleanup.
632       //
633       // This is a hack to work around the problem.
634       // TODO: !!! figure out something better.
635       __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
636     }
637 
638     __kmp_internal_end_library(__kmp_gtid_get_specific());
639 
640     return TRUE;
641 
642   case DLL_THREAD_ATTACH:
643     KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
644 
    /* if we wanted to register new sibling threads on every thread attach, we
     * would call __kmp_get_gtid() here */
647     return TRUE;
648 
649   case DLL_THREAD_DETACH:
650     KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
651 
652     __kmp_internal_end_thread(__kmp_gtid_get_specific());
653     return TRUE;
654   }
655 
656   return TRUE;
657 }
658 
659 #endif /* KMP_OS_WINDOWS */
660 #endif /* KMP_DYNAMIC_LIB */
661 
662 /* Change the library type to "status" and return the old type */
663 /* called from within initialization routines where __kmp_initz_lock is held */
664 int __kmp_change_library(int status) {
665   int old_status;
666 
667   old_status = __kmp_yield_init &
668                1; // check whether KMP_LIBRARY=throughput (even init count)
669 
670   if (status) {
671     __kmp_yield_init |= 1; // throughput => turnaround (odd init count)
672   } else {
673     __kmp_yield_init &= ~1; // turnaround => throughput (even init count)
674   }
675 
676   return old_status; // return previous setting of whether
677   // KMP_LIBRARY=throughput
678 }
679 
680 /* __kmp_parallel_deo -- Wait until it's our turn. */
681 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
682   int gtid = *gtid_ref;
683 #ifdef BUILD_PARALLEL_ORDERED
684   kmp_team_t *team = __kmp_team_from_gtid(gtid);
685 #endif /* BUILD_PARALLEL_ORDERED */
686 
687   if (__kmp_env_consistency_check) {
688     if (__kmp_threads[gtid]->th.th_root->r.r_active)
689 #if KMP_USE_DYNAMIC_LOCK
690       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
691 #else
692       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
693 #endif
694   }
695 #ifdef BUILD_PARALLEL_ORDERED
696   if (!team->t.t_serialized) {
697     KMP_MB();
698     KMP_WAIT_YIELD(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid),
699                    KMP_EQ, NULL);
700     KMP_MB();
701   }
702 #endif /* BUILD_PARALLEL_ORDERED */
703 }
704 
705 /* __kmp_parallel_dxo -- Signal the next task. */
706 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
707   int gtid = *gtid_ref;
708 #ifdef BUILD_PARALLEL_ORDERED
709   int tid = __kmp_tid_from_gtid(gtid);
710   kmp_team_t *team = __kmp_team_from_gtid(gtid);
711 #endif /* BUILD_PARALLEL_ORDERED */
712 
713   if (__kmp_env_consistency_check) {
714     if (__kmp_threads[gtid]->th.th_root->r.r_active)
715       __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
716   }
717 #ifdef BUILD_PARALLEL_ORDERED
718   if (!team->t.t_serialized) {
719     KMP_MB(); /* Flush all pending memory write invalidates.  */
720 
721     /* use the tid of the next thread in this team */
722     /* TODO replace with general release procedure */
723     team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
724 
725 #if OMPT_SUPPORT && OMPT_BLAME
726     if (ompt_enabled &&
727         ompt_callbacks.ompt_callback(ompt_event_release_ordered)) {
728       /* accept blame for "ordered" waiting */
729       kmp_info_t *this_thread = __kmp_threads[gtid];
730       ompt_callbacks.ompt_callback(ompt_event_release_ordered)(
731           this_thread->th.ompt_thread_info.wait_id);
732     }
733 #endif
734 
735     KMP_MB(); /* Flush all pending memory write invalidates.  */
736   }
737 #endif /* BUILD_PARALLEL_ORDERED */
738 }
739 
740 /* ------------------------------------------------------------------------ */
741 /* The BARRIER for a SINGLE process section is always explicit   */
742 
743 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
744   int status;
745   kmp_info_t *th;
746   kmp_team_t *team;
747 
748   if (!TCR_4(__kmp_init_parallel))
749     __kmp_parallel_initialize();
750 
751   th = __kmp_threads[gtid];
752   team = th->th.th_team;
753   status = 0;
754 
755   th->th.th_ident = id_ref;
756 
757   if (team->t.t_serialized) {
758     status = 1;
759   } else {
760     kmp_int32 old_this = th->th.th_local.this_construct;
761 
762     ++th->th.th_local.this_construct;
763     /* try to set team count to thread count--success means thread got the
764        single block */
765     /* TODO: Should this be acquire or release? */
766     if (team->t.t_construct == old_this) {
767       status = KMP_COMPARE_AND_STORE_ACQ32(&team->t.t_construct, old_this,
768                                            th->th.th_local.this_construct);
769     }
770 #if USE_ITT_BUILD
771     if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
772         KMP_MASTER_GTID(gtid) &&
773 #if OMP_40_ENABLED
774         th->th.th_teams_microtask == NULL &&
775 #endif
776         team->t.t_active_level ==
777             1) { // Only report metadata by master of active team at level 1
778       __kmp_itt_metadata_single(id_ref);
779     }
780 #endif /* USE_ITT_BUILD */
781   }
782 
783   if (__kmp_env_consistency_check) {
784     if (status && push_ws) {
785       __kmp_push_workshare(gtid, ct_psingle, id_ref);
786     } else {
787       __kmp_check_workshare(gtid, ct_psingle, id_ref);
788     }
789   }
790 #if USE_ITT_BUILD
791   if (status) {
792     __kmp_itt_single_start(gtid);
793   }
794 #endif /* USE_ITT_BUILD */
795   return status;
796 }
797 
798 void __kmp_exit_single(int gtid) {
799 #if USE_ITT_BUILD
800   __kmp_itt_single_end(gtid);
801 #endif /* USE_ITT_BUILD */
802   if (__kmp_env_consistency_check)
803     __kmp_pop_workshare(gtid, ct_psingle, NULL);
804 }
805 
/* Determine if we can go parallel or must use a serialized parallel region,
 * and how many threads we can use.
 * set_nthreads is the number of threads requested for the team.
 * Returns 1 if we should serialize or only use one thread,
 * otherwise the number of threads to use.
 * The forkjoin lock is held by the caller. */
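/* The requested thread count is reduced in stages: first by the dynamic
   adjustment mode (load balance, thread limit, or random), then by
   KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT (__kmp_max_nth), then by
   OMP_THREAD_LIMIT (__kmp_cg_max_nth), and finally by the capacity of the
   __kmp_threads[] array if it cannot be expanded far enough. */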
812 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
813                                  int master_tid, int set_nthreads
814 #if OMP_40_ENABLED
815                                  ,
816                                  int enter_teams
817 #endif /* OMP_40_ENABLED */
818                                  ) {
819   int capacity;
820   int new_nthreads;
821   KMP_DEBUG_ASSERT(__kmp_init_serial);
822   KMP_DEBUG_ASSERT(root && parent_team);
823 
824   // If dyn-var is set, dynamically adjust the number of desired threads,
825   // according to the method specified by dynamic_mode.
826   new_nthreads = set_nthreads;
827   if (!get__dynamic_2(parent_team, master_tid)) {
828     ;
829   }
830 #ifdef USE_LOAD_BALANCE
831   else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
832     new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
833     if (new_nthreads == 1) {
834       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
835                     "reservation to 1 thread\n",
836                     master_tid));
837       return 1;
838     }
839     if (new_nthreads < set_nthreads) {
840       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
841                     "reservation to %d threads\n",
842                     master_tid, new_nthreads));
843     }
844   }
845 #endif /* USE_LOAD_BALANCE */
846   else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
847     new_nthreads = __kmp_avail_proc - __kmp_nth +
848                    (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
849     if (new_nthreads <= 1) {
850       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
851                     "reservation to 1 thread\n",
852                     master_tid));
853       return 1;
854     }
855     if (new_nthreads < set_nthreads) {
856       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
857                     "reservation to %d threads\n",
858                     master_tid, new_nthreads));
859     } else {
860       new_nthreads = set_nthreads;
861     }
862   } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
863     if (set_nthreads > 2) {
864       new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
865       new_nthreads = (new_nthreads % set_nthreads) + 1;
866       if (new_nthreads == 1) {
867         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
868                       "reservation to 1 thread\n",
869                       master_tid));
870         return 1;
871       }
872       if (new_nthreads < set_nthreads) {
873         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
874                       "reservation to %d threads\n",
875                       master_tid, new_nthreads));
876       }
877     }
878   } else {
879     KMP_ASSERT(0);
880   }
881 
882   // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
883   if (__kmp_nth + new_nthreads -
884           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
885       __kmp_max_nth) {
886     int tl_nthreads = __kmp_max_nth - __kmp_nth +
887                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
888     if (tl_nthreads <= 0) {
889       tl_nthreads = 1;
890     }
891 
892     // If dyn-var is false, emit a 1-time warning.
893     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
894       __kmp_reserve_warn = 1;
895       __kmp_msg(kmp_ms_warning,
896                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
897                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
898     }
899     if (tl_nthreads == 1) {
900       KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
901                     "reduced reservation to 1 thread\n",
902                     master_tid));
903       return 1;
904     }
905     KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
906                   "reservation to %d threads\n",
907                   master_tid, tl_nthreads));
908     new_nthreads = tl_nthreads;
909   }
910 
911   // Respect OMP_THREAD_LIMIT
912   if (root->r.r_cg_nthreads + new_nthreads -
913           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
914       __kmp_cg_max_nth) {
915     int tl_nthreads = __kmp_cg_max_nth - root->r.r_cg_nthreads +
916                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
917     if (tl_nthreads <= 0) {
918       tl_nthreads = 1;
919     }
920 
921     // If dyn-var is false, emit a 1-time warning.
922     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
923       __kmp_reserve_warn = 1;
924       __kmp_msg(kmp_ms_warning,
925                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
926                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
927     }
928     if (tl_nthreads == 1) {
929       KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
930                     "reduced reservation to 1 thread\n",
931                     master_tid));
932       return 1;
933     }
934     KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
935                   "reservation to %d threads\n",
936                   master_tid, tl_nthreads));
937     new_nthreads = tl_nthreads;
938   }
939 
940   // Check if the threads array is large enough, or needs expanding.
941   // See comment in __kmp_register_root() about the adjustment if
942   // __kmp_threads[0] == NULL.
943   capacity = __kmp_threads_capacity;
944   if (TCR_PTR(__kmp_threads[0]) == NULL) {
945     --capacity;
946   }
947   if (__kmp_nth + new_nthreads -
948           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
949       capacity) {
950     // Expand the threads array.
951     int slotsRequired = __kmp_nth + new_nthreads -
952                         (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
953                         capacity;
954     int slotsAdded = __kmp_expand_threads(slotsRequired, slotsRequired);
955     if (slotsAdded < slotsRequired) {
956       // The threads array was not expanded enough.
957       new_nthreads -= (slotsRequired - slotsAdded);
958       KMP_ASSERT(new_nthreads >= 1);
959 
960       // If dyn-var is false, emit a 1-time warning.
961       if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
962         __kmp_reserve_warn = 1;
963         if (__kmp_tp_cached) {
964           __kmp_msg(kmp_ms_warning,
965                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
966                     KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
967                     KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
968         } else {
969           __kmp_msg(kmp_ms_warning,
970                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
971                     KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
972         }
973       }
974     }
975   }
976 
977 #ifdef KMP_DEBUG
978   if (new_nthreads == 1) {
979     KC_TRACE(10,
980              ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
981               "dead roots and rechecking; requested %d threads\n",
982               __kmp_get_gtid(), set_nthreads));
983   } else {
984     KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
985                   " %d threads\n",
986                   __kmp_get_gtid(), new_nthreads, set_nthreads));
987   }
988 #endif // KMP_DEBUG
989   return new_nthreads;
990 }
991 
/* Allocate threads from the thread pool and assign them to the new team. We
   are assured that there are enough threads available, because we checked on
   that earlier while holding the forkjoin lock. */
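/* If the team is a reusable hot team the workers are already in place and only
   the master thread's cached team fields need updating; otherwise each worker
   is taken from the pool (or forked) and its per-barrier arrived counters are
   synchronized with the team's. */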
995 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
996                                     kmp_info_t *master_th, int master_gtid) {
997   int i;
998   int use_hot_team;
999 
1000   KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
1001   KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
1002   KMP_MB();
1003 
1004   /* first, let's setup the master thread */
1005   master_th->th.th_info.ds.ds_tid = 0;
1006   master_th->th.th_team = team;
1007   master_th->th.th_team_nproc = team->t.t_nproc;
1008   master_th->th.th_team_master = master_th;
1009   master_th->th.th_team_serialized = FALSE;
1010   master_th->th.th_dispatch = &team->t.t_dispatch[0];
1011 
1012 /* make sure we are not the optimized hot team */
1013 #if KMP_NESTED_HOT_TEAMS
1014   use_hot_team = 0;
1015   kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
1016   if (hot_teams) { // hot teams array is not allocated if
1017     // KMP_HOT_TEAMS_MAX_LEVEL=0
1018     int level = team->t.t_active_level - 1; // index in array of hot teams
1019     if (master_th->th.th_teams_microtask) { // are we inside the teams?
1020       if (master_th->th.th_teams_size.nteams > 1) {
1021         ++level; // level was not increased in teams construct for
1022         // team_of_masters
1023       }
1024       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1025           master_th->th.th_teams_level == team->t.t_level) {
1026         ++level; // level was not increased in teams construct for
1027         // team_of_workers before the parallel
1028       } // team->t.t_level will be increased inside parallel
1029     }
1030     if (level < __kmp_hot_teams_max_level) {
1031       if (hot_teams[level].hot_team) {
1032         // hot team has already been allocated for given level
1033         KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
1034         use_hot_team = 1; // the team is ready to use
1035       } else {
1036         use_hot_team = 0; // AC: threads are not allocated yet
1037         hot_teams[level].hot_team = team; // remember new hot team
1038         hot_teams[level].hot_team_nth = team->t.t_nproc;
1039       }
1040     } else {
1041       use_hot_team = 0;
1042     }
1043   }
1044 #else
1045   use_hot_team = team == root->r.r_hot_team;
1046 #endif
1047   if (!use_hot_team) {
1048 
1049     /* install the master thread */
1050     team->t.t_threads[0] = master_th;
1051     __kmp_initialize_info(master_th, team, 0, master_gtid);
1052 
1053     /* now, install the worker threads */
1054     for (i = 1; i < team->t.t_nproc; i++) {
1055 
1056       /* fork or reallocate a new thread and install it in team */
1057       kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1058       team->t.t_threads[i] = thr;
1059       KMP_DEBUG_ASSERT(thr);
1060       KMP_DEBUG_ASSERT(thr->th.th_team == team);
1061       /* align team and thread arrived states */
1062       KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1063                     "T#%d(%d:%d) join =%llu, plain=%llu\n",
1064                     __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1065                     __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1066                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1067                     team->t.t_bar[bs_plain_barrier].b_arrived));
1068 #if OMP_40_ENABLED
1069       thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1070       thr->th.th_teams_level = master_th->th.th_teams_level;
1071       thr->th.th_teams_size = master_th->th.th_teams_size;
1072 #endif
1073       { // Initialize threads' barrier data.
1074         int b;
1075         kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1076         for (b = 0; b < bs_last_barrier; ++b) {
1077           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1078           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1079 #if USE_DEBUGGER
1080           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1081 #endif
1082         }
1083       }
1084     }
1085 
1086 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1087     __kmp_partition_places(team);
1088 #endif
1089   }
1090 
1091   KMP_MB();
1092 }
1093 
1094 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1095 // Propagate any changes to the floating point control registers out to the team
1096 // We try to avoid unnecessary writes to the relevant cache line in the team
1097 // structure, so we don't make changes unless they are needed.
1098 inline static void propagateFPControl(kmp_team_t *team) {
1099   if (__kmp_inherit_fp_control) {
1100     kmp_int16 x87_fpu_control_word;
1101     kmp_uint32 mxcsr;
1102 
1103     // Get master values of FPU control flags (both X87 and vector)
1104     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1105     __kmp_store_mxcsr(&mxcsr);
1106     mxcsr &= KMP_X86_MXCSR_MASK;
1107 
1108     // There is no point looking at t_fp_control_saved here.
1109     // If it is TRUE, we still have to update the values if they are different
1110     // from those we now have. If it is FALSE we didn't save anything yet, but
1111     // our objective is the same. We have to ensure that the values in the team
1112     // are the same as those we have.
1113     // So, this code achieves what we need whether or not t_fp_control_saved is
1114     // true. By checking whether the value needs updating we avoid unnecessary
1115     // writes that would put the cache-line into a written state, causing all
1116     // threads in the team to have to read it again.
1117     KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1118     KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1119     // Although we don't use this value, other code in the runtime wants to know
1120     // whether it should restore them. So we must ensure it is correct.
1121     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1122   } else {
1123     // Similarly here. Don't write to this cache-line in the team structure
1124     // unless we have to.
1125     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1126   }
1127 }
1128 
1129 // Do the opposite, setting the hardware registers to the updated values from
1130 // the team.
1131 inline static void updateHWFPControl(kmp_team_t *team) {
1132   if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
    // Only reset the fp control regs if they have been changed in the team
    // during the parallel region that we are exiting.
1135     kmp_int16 x87_fpu_control_word;
1136     kmp_uint32 mxcsr;
1137     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1138     __kmp_store_mxcsr(&mxcsr);
1139     mxcsr &= KMP_X86_MXCSR_MASK;
1140 
1141     if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1142       __kmp_clear_x87_fpu_status_word();
1143       __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1144     }
1145 
1146     if (team->t.t_mxcsr != mxcsr) {
1147       __kmp_load_mxcsr(&team->t.t_mxcsr);
1148     }
1149   }
1150 }
1151 #else
1152 #define propagateFPControl(x) ((void)0)
1153 #define updateHWFPControl(x) ((void)0)
1154 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1155 
1156 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1157                                      int realloc); // forward declaration
1158 
/* Run a parallel region that has been serialized, so it runs only in a team
   consisting of the single master thread. */
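/* The thread's cached serial team is reused when it is free; if it is already
   serving an enclosing serialized region, a fresh serial team is allocated and
   installed instead. Re-entering the same serial team simply increments
   t_serialized and pushes another dispatch buffer. */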
1161 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1162   kmp_info_t *this_thr;
1163   kmp_team_t *serial_team;
1164 
1165   KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1166 
1167   /* Skip all this code for autopar serialized loops since it results in
1168      unacceptable overhead */
1169   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1170     return;
1171 
1172   if (!TCR_4(__kmp_init_parallel))
1173     __kmp_parallel_initialize();
1174 
1175   this_thr = __kmp_threads[global_tid];
1176   serial_team = this_thr->th.th_serial_team;
1177 
1178   /* utilize the serialized team held by this thread */
1179   KMP_DEBUG_ASSERT(serial_team);
1180   KMP_MB();
1181 
1182   if (__kmp_tasking_mode != tskm_immediate_exec) {
1183     KMP_DEBUG_ASSERT(
1184         this_thr->th.th_task_team ==
1185         this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1186     KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1187                      NULL);
1188     KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1189                   "team %p, new task_team = NULL\n",
1190                   global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1191     this_thr->th.th_task_team = NULL;
1192   }
1193 
1194 #if OMP_40_ENABLED
1195   kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1196   if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1197     proc_bind = proc_bind_false;
1198   } else if (proc_bind == proc_bind_default) {
1199     // No proc_bind clause was specified, so use the current value
1200     // of proc-bind-var for this parallel region.
1201     proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1202   }
1203   // Reset for next parallel region
1204   this_thr->th.th_set_proc_bind = proc_bind_default;
1205 #endif /* OMP_40_ENABLED */
1206 
1207   if (this_thr->th.th_team != serial_team) {
1208     // Nested level will be an index in the nested nthreads array
1209     int level = this_thr->th.th_team->t.t_level;
1210 
1211     if (serial_team->t.t_serialized) {
      /* this serial team was already used
         TODO increase performance by making these locks more specific */
1214       kmp_team_t *new_team;
1215 
1216       __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1217 
1218 #if OMPT_SUPPORT
1219       ompt_parallel_id_t ompt_parallel_id = __ompt_parallel_id_new(global_tid);
1220 #endif
1221 
1222       new_team = __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1223 #if OMPT_SUPPORT
1224                                      ompt_parallel_id,
1225 #endif
1226 #if OMP_40_ENABLED
1227                                      proc_bind,
1228 #endif
1229                                      &this_thr->th.th_current_task->td_icvs,
1230                                      0 USE_NESTED_HOT_ARG(NULL));
1231       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1232       KMP_ASSERT(new_team);
1233 
1234       /* setup new serialized team and install it */
1235       new_team->t.t_threads[0] = this_thr;
1236       new_team->t.t_parent = this_thr->th.th_team;
1237       serial_team = new_team;
1238       this_thr->th.th_serial_team = serial_team;
1239 
1240       KF_TRACE(
1241           10,
1242           ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1243            global_tid, serial_team));
1244 
1245       /* TODO the above breaks the requirement that if we run out of resources,
1246          then we can still guarantee that serialized teams are ok, since we may
1247          need to allocate a new one */
1248     } else {
1249       KF_TRACE(
1250           10,
1251           ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1252            global_tid, serial_team));
1253     }
1254 
1255     /* we have to initialize this serial team */
1256     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1257     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1258     KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1259     serial_team->t.t_ident = loc;
1260     serial_team->t.t_serialized = 1;
1261     serial_team->t.t_nproc = 1;
1262     serial_team->t.t_parent = this_thr->th.th_team;
1263     serial_team->t.t_sched = this_thr->th.th_team->t.t_sched;
1264     this_thr->th.th_team = serial_team;
1265     serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1266 
    KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1268                   this_thr->th.th_current_task));
1269     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1270     this_thr->th.th_current_task->td_flags.executing = 0;
1271 
1272     __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1273 
1274     /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1275        implicit task for each serialized task represented by
1276        team->t.t_serialized? */
1277     copy_icvs(&this_thr->th.th_current_task->td_icvs,
1278               &this_thr->th.th_current_task->td_parent->td_icvs);
1279 
1280     // Thread value exists in the nested nthreads array for the next nested
1281     // level
1282     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1283       this_thr->th.th_current_task->td_icvs.nproc =
1284           __kmp_nested_nth.nth[level + 1];
1285     }
1286 
1287 #if OMP_40_ENABLED
1288     if (__kmp_nested_proc_bind.used &&
1289         (level + 1 < __kmp_nested_proc_bind.used)) {
1290       this_thr->th.th_current_task->td_icvs.proc_bind =
1291           __kmp_nested_proc_bind.bind_types[level + 1];
1292     }
1293 #endif /* OMP_40_ENABLED */
1294 
1295 #if USE_DEBUGGER
1296     serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1297 #endif
1298     this_thr->th.th_info.ds.ds_tid = 0;
1299 
1300     /* set thread cache values */
1301     this_thr->th.th_team_nproc = 1;
1302     this_thr->th.th_team_master = this_thr;
1303     this_thr->th.th_team_serialized = 1;
1304 
1305     serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1306     serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1307 
1308     propagateFPControl(serial_team);
1309 
1310     /* check if we need to allocate dispatch buffers stack */
1311     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1312     if (!serial_team->t.t_dispatch->th_disp_buffer) {
1313       serial_team->t.t_dispatch->th_disp_buffer =
1314           (dispatch_private_info_t *)__kmp_allocate(
1315               sizeof(dispatch_private_info_t));
1316     }
1317     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1318 
1319 #if OMPT_SUPPORT
1320     ompt_parallel_id_t ompt_parallel_id = __ompt_parallel_id_new(global_tid);
1321     __ompt_team_assign_id(serial_team, ompt_parallel_id);
1322 #endif
1323 
1324     KMP_MB();
1325 
1326   } else {
1327     /* this serialized team is already being used,
1328      * that's fine, just add another nested level */
1329     KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1330     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1331     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1332     ++serial_team->t.t_serialized;
1333     this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1334 
1335     // Nested level will be an index in the nested nthreads array
1336     int level = this_thr->th.th_team->t.t_level;
1337     // Thread value exists in the nested nthreads array for the next nested
1338     // level
1339     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1340       this_thr->th.th_current_task->td_icvs.nproc =
1341           __kmp_nested_nth.nth[level + 1];
1342     }
1343     serial_team->t.t_level++;
1344     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1345                   "of serial team %p to %d\n",
1346                   global_tid, serial_team, serial_team->t.t_level));
1347 
1348     /* allocate/push dispatch buffers stack */
1349     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1350     {
1351       dispatch_private_info_t *disp_buffer =
1352           (dispatch_private_info_t *)__kmp_allocate(
1353               sizeof(dispatch_private_info_t));
1354       disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1355       serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1356     }
1357     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1358 
1359     KMP_MB();
1360   }
1361 #if OMP_40_ENABLED
1362   KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1363 #endif
1364 
1365   if (__kmp_env_consistency_check)
1366     __kmp_push_parallel(global_tid, NULL);
1367 }
1368 
1369 /* most of the work for a fork */
1370 /* return true if we really went parallel, false if serialized */
1371 int __kmp_fork_call(ident_t *loc, int gtid,
1372                     enum fork_context_e call_context, // Intel, GNU, ...
1373                     kmp_int32 argc,
1374 #if OMPT_SUPPORT
1375                     void *unwrapped_task,
1376 #endif
1377                     microtask_t microtask, launch_t invoker,
1378 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1379 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1380                     va_list *ap
1381 #else
1382                     va_list ap
1383 #endif
1384                     ) {
1385   void **argv;
1386   int i;
1387   int master_tid;
1388   int master_this_cons;
1389   kmp_team_t *team;
1390   kmp_team_t *parent_team;
1391   kmp_info_t *master_th;
1392   kmp_root_t *root;
1393   int nthreads;
1394   int master_active;
1395   int master_set_numthreads;
1396   int level;
1397 #if OMP_40_ENABLED
1398   int active_level;
1399   int teams_level;
1400 #endif
1401 #if KMP_NESTED_HOT_TEAMS
1402   kmp_hot_team_ptr_t **p_hot_teams;
1403 #endif
1404   { // KMP_TIME_BLOCK
1405     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1406     KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1407 
1408     KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1409     if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
      /* Some systems prefer the stack for the root thread(s) to start with
         some gap from the parent stack to prevent false sharing. */
1412       void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1413       /* These 2 lines below are so this does not get optimized out */
1414       if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1415         __kmp_stkpadding += (short)((kmp_int64)dummy);
1416     }
1417 
1418     /* initialize if needed */
1419     KMP_DEBUG_ASSERT(
1420         __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1421     if (!TCR_4(__kmp_init_parallel))
1422       __kmp_parallel_initialize();
1423 
1424     /* setup current data */
1425     master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1426     // shutdown
1427     parent_team = master_th->th.th_team;
1428     master_tid = master_th->th.th_info.ds.ds_tid;
1429     master_this_cons = master_th->th.th_local.this_construct;
1430     root = master_th->th.th_root;
1431     master_active = root->r.r_active;
1432     master_set_numthreads = master_th->th.th_set_nproc;
1433 
1434 #if OMPT_SUPPORT
1435     ompt_parallel_id_t ompt_parallel_id;
1436     ompt_task_id_t ompt_task_id;
1437     ompt_frame_t *ompt_frame;
1438     ompt_task_id_t my_task_id;
1439     ompt_parallel_id_t my_parallel_id;
1440 
1441     if (ompt_enabled) {
1442       ompt_parallel_id = __ompt_parallel_id_new(gtid);
1443       ompt_task_id = __ompt_get_task_id_internal(0);
1444       ompt_frame = __ompt_get_task_frame_internal(0);
1445     }
1446 #endif
1447 
1448     // Nested level will be an index in the nested nthreads array
1449     level = parent_team->t.t_level;
    // used to launch non-serial teams even if nesting is not allowed
1451     active_level = parent_team->t.t_active_level;
1452 #if OMP_40_ENABLED
    // needed to check nesting inside the teams construct
1454     teams_level = master_th->th.th_teams_level;
1455 #endif
1456 #if KMP_NESTED_HOT_TEAMS
1457     p_hot_teams = &master_th->th.th_hot_teams;
1458     if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1459       *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1460           sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1461       (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
      // it is either the actual hot team or not needed (when active_level > 0)
1463       (*p_hot_teams)[0].hot_team_nth = 1;
1464     }
1465 #endif
1466 
1467 #if OMPT_SUPPORT
1468     if (ompt_enabled &&
1469         ompt_callbacks.ompt_callback(ompt_event_parallel_begin)) {
1470       int team_size = master_set_numthreads;
1471 
1472       ompt_callbacks.ompt_callback(ompt_event_parallel_begin)(
1473           ompt_task_id, ompt_frame, ompt_parallel_id, team_size, unwrapped_task,
1474           OMPT_INVOKER(call_context));
1475     }
1476 #endif
1477 
1478     master_th->th.th_ident = loc;
1479 
1480 #if OMP_40_ENABLED
1481     if (master_th->th.th_teams_microtask && ap &&
1482         microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1483       // AC: This is start of parallel that is nested inside teams construct.
      // The team is the actual (hot) team; all workers are waiting at the fork
      // barrier. No lock is needed to do a bit of team initialization and then
      // release the workers.
1486       parent_team->t.t_ident = loc;
1487       __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1488       parent_team->t.t_argc = argc;
1489       argv = (void **)parent_team->t.t_argv;
1490       for (i = argc - 1; i >= 0; --i)
1491 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1492 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1493         *argv++ = va_arg(*ap, void *);
1494 #else
1495         *argv++ = va_arg(ap, void *);
1496 #endif
      // Increment our nesting depth level, but do not increase the serialization
1498       if (parent_team == master_th->th.th_serial_team) {
1499         // AC: we are in serialized parallel
1500         __kmpc_serialized_parallel(loc, gtid);
1501         KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
        // AC: need this so that enquiry functions work
        // correctly; will restore at join time
1504         parent_team->t.t_serialized--;
1505 #if OMPT_SUPPORT
1506         void *dummy;
1507         void **exit_runtime_p;
1508 
1509         ompt_lw_taskteam_t lw_taskteam;
1510 
1511         if (ompt_enabled) {
1512           __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, unwrapped_task,
1513                                   ompt_parallel_id);
1514           lw_taskteam.ompt_task_info.task_id = __ompt_task_id_new(gtid);
1515           exit_runtime_p =
1516               &(lw_taskteam.ompt_task_info.frame.exit_runtime_frame);
1517 
1518           __ompt_lw_taskteam_link(&lw_taskteam, master_th);
1519 
1520 #if OMPT_TRACE
1521           /* OMPT implicit task begin */
1522           my_task_id = lw_taskteam.ompt_task_info.task_id;
1523           my_parallel_id = parent_team->t.ompt_team_info.parallel_id;
1524           if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
1525             ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
1526                 my_parallel_id, my_task_id);
1527           }
1528 #endif
1529 
1530           /* OMPT state */
1531           master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1532         } else {
1533           exit_runtime_p = &dummy;
1534         }
1535 #endif
1536 
1537         {
1538           KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1539           KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1540           __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1541 #if OMPT_SUPPORT
1542                                  ,
1543                                  exit_runtime_p
1544 #endif
1545                                  );
1546         }
1547 
1548 #if OMPT_SUPPORT
1549         *exit_runtime_p = NULL;
1550         if (ompt_enabled) {
1551 #if OMPT_TRACE
1552           lw_taskteam.ompt_task_info.frame.exit_runtime_frame = NULL;
1553 
1554           if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
1555             ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
1556                 ompt_parallel_id, ompt_task_id);
1557           }
1558 
1559           __ompt_lw_taskteam_unlink(master_th);
          // reset/clear the task id only after unlinking the task
1561           lw_taskteam.ompt_task_info.task_id = ompt_task_id_none;
1562 #endif
1563 
1564           if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
1565             ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
1566                 ompt_parallel_id, ompt_task_id, OMPT_INVOKER(call_context));
1567           }
1568           master_th->th.ompt_thread_info.state = ompt_state_overhead;
1569         }
1570 #endif
1571         return TRUE;
1572       }
1573 
1574       parent_team->t.t_pkfn = microtask;
1575 #if OMPT_SUPPORT
1576       parent_team->t.ompt_team_info.microtask = unwrapped_task;
1577 #endif
1578       parent_team->t.t_invoke = invoker;
1579       KMP_TEST_THEN_INC32((kmp_int32 *)&root->r.r_in_parallel);
1580       parent_team->t.t_active_level++;
1581       parent_team->t.t_level++;
1582 
1583       /* Change number of threads in the team if requested */
1584       if (master_set_numthreads) { // The parallel has num_threads clause
1585         if (master_set_numthreads < master_th->th.th_teams_size.nth) {
          // AC: can only reduce the number of threads dynamically, cannot increase
1587           kmp_info_t **other_threads = parent_team->t.t_threads;
1588           parent_team->t.t_nproc = master_set_numthreads;
1589           for (i = 0; i < master_set_numthreads; ++i) {
1590             other_threads[i]->th.th_team_nproc = master_set_numthreads;
1591           }
1592           // Keep extra threads hot in the team for possible next parallels
1593         }
1594         master_th->th.th_set_nproc = 0;
1595       }
1596 
1597 #if USE_DEBUGGER
1598       if (__kmp_debugging) { // Let debugger override number of threads.
1599         int nth = __kmp_omp_num_threads(loc);
1600         if (nth > 0) { // 0 means debugger doesn't want to change num threads
1601           master_set_numthreads = nth;
1602         }
1603       }
1604 #endif
1605 
1606       KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1607                     "master_th=%p, gtid=%d\n",
1608                     root, parent_team, master_th, gtid));
1609       __kmp_internal_fork(loc, gtid, parent_team);
1610       KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1611                     "master_th=%p, gtid=%d\n",
1612                     root, parent_team, master_th, gtid));
1613 
1614       /* Invoke microtask for MASTER thread */
1615       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1616                     parent_team->t.t_id, parent_team->t.t_pkfn));
1617 
1618       {
1619         KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1620         KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1621         if (!parent_team->t.t_invoke(gtid)) {
1622           KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
1623         }
1624       }
1625       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1626                     parent_team->t.t_id, parent_team->t.t_pkfn));
1627       KMP_MB(); /* Flush all pending memory write invalidates.  */
1628 
1629       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1630 
1631       return TRUE;
1632     } // Parallel closely nested in teams construct
1633 #endif /* OMP_40_ENABLED */
1634 
1635 #if KMP_DEBUG
1636     if (__kmp_tasking_mode != tskm_immediate_exec) {
1637       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1638                        parent_team->t.t_task_team[master_th->th.th_task_state]);
1639     }
1640 #endif
1641 
1642     if (parent_team->t.t_active_level >=
1643         master_th->th.th_current_task->td_icvs.max_active_levels) {
1644       nthreads = 1;
1645     } else {
1646 #if OMP_40_ENABLED
1647       int enter_teams = ((ap == NULL && active_level == 0) ||
1648                          (ap && teams_level > 0 && teams_level == level));
1649 #endif
1650       nthreads =
1651           master_set_numthreads
1652               ? master_set_numthreads
1653               : get__nproc_2(
1654                     parent_team,
1655                     master_tid); // TODO: get nproc directly from current task
1656 
      // Check whether we need to take the forkjoin lock (no need for a
      // serialized parallel outside of a teams construct). This code was moved
      // here from __kmp_reserve_threads() to speed up nested serialized
      // parallels.
1660       if (nthreads > 1) {
1661         if ((!get__nested(master_th) && (root->r.r_in_parallel
1662 #if OMP_40_ENABLED
1663                                          && !enter_teams
1664 #endif /* OMP_40_ENABLED */
1665                                          )) ||
1666             (__kmp_library == library_serial)) {
1667           KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1668                         " threads\n",
1669                         gtid, nthreads));
1670           nthreads = 1;
1671         }
1672       }
1673       if (nthreads > 1) {
1674         /* determine how many new threads we can use */
1675         __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1676         nthreads = __kmp_reserve_threads(
1677             root, parent_team, master_tid, nthreads
1678 #if OMP_40_ENABLED
1679             /* AC: If we execute teams from parallel region (on host), then
1680                teams should be created but each can only have 1 thread if
1681                nesting is disabled. If teams called from serial region, then
1682                teams and their threads should be created regardless of the
1683                nesting setting. */
1684             ,
1685             enter_teams
1686 #endif /* OMP_40_ENABLED */
1687             );
1688         if (nthreads == 1) {
          // Free the lock for single-threaded execution here; for
          // multi-threaded execution it will be freed later, after the team
          // of threads has been created and initialized
1692           __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1693         }
1694       }
1695     }
1696     KMP_DEBUG_ASSERT(nthreads > 0);
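    // At this point nthreads == 1 means the region executes serialized below;
    // otherwise the forkjoin lock acquired above is still held and is released
    // only after the new team has been fully set up (see the
    // __kmp_release_bootstrap_lock call further down).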
1697 
1698     // If we temporarily changed the set number of threads then restore it now
1699     master_th->th.th_set_nproc = 0;
1700 
1701     /* create a serialized parallel region? */
1702     if (nthreads == 1) {
1703 /* josh todo: hypothetical question: what do we do for OS X*? */
1704 #if KMP_OS_LINUX &&                                                            \
1705     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1706       void *args[argc];
1707 #else
1708       void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1709 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1710           KMP_ARCH_AARCH64) */
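      // Both branches above create a temporary argument array on the stack: a
      // C99-style VLA on the Linux targets listed, KMP_ALLOCA elsewhere. The
      // array only needs to live until the serialized microtask below returns.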
1711 
1712       KA_TRACE(20,
1713                ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1714 
1715       __kmpc_serialized_parallel(loc, gtid);
1716 
1717       if (call_context == fork_context_intel) {
1718         /* TODO this sucks, use the compiler itself to pass args! :) */
1719         master_th->th.th_serial_team->t.t_ident = loc;
1720 #if OMP_40_ENABLED
1721         if (!ap) {
1722           // revert change made in __kmpc_serialized_parallel()
1723           master_th->th.th_serial_team->t.t_level--;
1724 // Get args from parent team for teams construct
1725 
1726 #if OMPT_SUPPORT
1727           void *dummy;
1728           void **exit_runtime_p;
1729 
1730           ompt_lw_taskteam_t lw_taskteam;
1731 
1732           if (ompt_enabled) {
1733             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1734                                     unwrapped_task, ompt_parallel_id);
1735             lw_taskteam.ompt_task_info.task_id = __ompt_task_id_new(gtid);
1736             exit_runtime_p =
1737                 &(lw_taskteam.ompt_task_info.frame.exit_runtime_frame);
1738 
1739             __ompt_lw_taskteam_link(&lw_taskteam, master_th);
1740 
1741 #if OMPT_TRACE
1742             my_task_id = lw_taskteam.ompt_task_info.task_id;
1743             if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
1744               ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
1745                   ompt_parallel_id, my_task_id);
1746             }
1747 #endif
1748 
1749             /* OMPT state */
1750             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1751           } else {
1752             exit_runtime_p = &dummy;
1753           }
1754 #endif
1755 
1756           {
1757             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1758             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1759             __kmp_invoke_microtask(microtask, gtid, 0, argc,
1760                                    parent_team->t.t_argv
1761 #if OMPT_SUPPORT
1762                                    ,
1763                                    exit_runtime_p
1764 #endif
1765                                    );
1766           }
1767 
1768 #if OMPT_SUPPORT
1769           *exit_runtime_p = NULL;
1770           if (ompt_enabled) {
1771             lw_taskteam.ompt_task_info.frame.exit_runtime_frame = NULL;
1772 
1773 #if OMPT_TRACE
1774             if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
1775               ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
1776                   ompt_parallel_id, ompt_task_id);
1777             }
1778 #endif
1779 
1780             __ompt_lw_taskteam_unlink(master_th);
            // reset/clear the task id only after unlinking the task
1782             lw_taskteam.ompt_task_info.task_id = ompt_task_id_none;
1783 
1784             if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
1785               ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
1786                   ompt_parallel_id, ompt_task_id, OMPT_INVOKER(call_context));
1787             }
1788             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1789           }
1790 #endif
1791         } else if (microtask == (microtask_t)__kmp_teams_master) {
1792           KMP_DEBUG_ASSERT(master_th->th.th_team ==
1793                            master_th->th.th_serial_team);
1794           team = master_th->th.th_team;
1795           // team->t.t_pkfn = microtask;
1796           team->t.t_invoke = invoker;
1797           __kmp_alloc_argv_entries(argc, team, TRUE);
1798           team->t.t_argc = argc;
1799           argv = (void **)team->t.t_argv;
1800           if (ap) {
1801             for (i = argc - 1; i >= 0; --i)
1802 // TODO: revert workaround for Intel(R) 64 tracker #96
1803 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1804               *argv++ = va_arg(*ap, void *);
1805 #else
1806               *argv++ = va_arg(ap, void *);
1807 #endif
1808           } else {
1809             for (i = 0; i < argc; ++i)
1810               // Get args from parent team for teams construct
1811               argv[i] = parent_team->t.t_argv[i];
1812           }
1813           // AC: revert change made in __kmpc_serialized_parallel()
1814           //     because initial code in teams should have level=0
1815           team->t.t_level--;
1816           // AC: call special invoker for outer "parallel" of teams construct
1817           {
1818             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1819             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1820             invoker(gtid);
1821           }
1822         } else {
1823 #endif /* OMP_40_ENABLED */
1824           argv = args;
1825           for (i = argc - 1; i >= 0; --i)
1826 // TODO: revert workaround for Intel(R) 64 tracker #96
1827 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1828             *argv++ = va_arg(*ap, void *);
1829 #else
1830           *argv++ = va_arg(ap, void *);
1831 #endif
1832           KMP_MB();
1833 
1834 #if OMPT_SUPPORT
1835           void *dummy;
1836           void **exit_runtime_p;
1837 
1838           ompt_lw_taskteam_t lw_taskteam;
1839 
1840           if (ompt_enabled) {
1841             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1842                                     unwrapped_task, ompt_parallel_id);
1843             lw_taskteam.ompt_task_info.task_id = __ompt_task_id_new(gtid);
1844             exit_runtime_p =
1845                 &(lw_taskteam.ompt_task_info.frame.exit_runtime_frame);
1846 
1847             __ompt_lw_taskteam_link(&lw_taskteam, master_th);
1848 
1849 #if OMPT_TRACE
1850             /* OMPT implicit task begin */
1851             my_task_id = lw_taskteam.ompt_task_info.task_id;
1852             my_parallel_id = ompt_parallel_id;
1853             if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
1854               ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
1855                   my_parallel_id, my_task_id);
1856             }
1857 #endif
1858 
1859             /* OMPT state */
1860             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1861           } else {
1862             exit_runtime_p = &dummy;
1863           }
1864 #endif
1865 
1866           {
1867             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1868             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1869             __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1870 #if OMPT_SUPPORT
1871                                    ,
1872                                    exit_runtime_p
1873 #endif
1874                                    );
1875           }
1876 
1877 #if OMPT_SUPPORT
1878           *exit_runtime_p = NULL;
1879           if (ompt_enabled) {
1880 #if OMPT_TRACE
1881             lw_taskteam.ompt_task_info.frame.exit_runtime_frame = NULL;
1882 
1883             if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
1884               ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
1885                   my_parallel_id, my_task_id);
1886             }
1887 #endif
1888 
1889             __ompt_lw_taskteam_unlink(master_th);
            // reset/clear the task id only after unlinking the task
1891             lw_taskteam.ompt_task_info.task_id = ompt_task_id_none;
1892 
1893             if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
1894               ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
1895                   ompt_parallel_id, ompt_task_id, OMPT_INVOKER(call_context));
1896             }
1897             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1898           }
1899 #endif
1900 #if OMP_40_ENABLED
1901         }
1902 #endif /* OMP_40_ENABLED */
1903       } else if (call_context == fork_context_gnu) {
1904 #if OMPT_SUPPORT
1905         ompt_lw_taskteam_t *lwt =
1906             (ompt_lw_taskteam_t *)__kmp_allocate(sizeof(ompt_lw_taskteam_t));
1907         __ompt_lw_taskteam_init(lwt, master_th, gtid, unwrapped_task,
1908                                 ompt_parallel_id);
1909 
1910         lwt->ompt_task_info.task_id = __ompt_task_id_new(gtid);
1911         lwt->ompt_task_info.frame.exit_runtime_frame = NULL;
1912         __ompt_lw_taskteam_link(lwt, master_th);
1913 #endif
1914 
1915         // we were called from GNU native code
1916         KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1917         return FALSE;
1918       } else {
1919         KMP_ASSERT2(call_context < fork_context_last,
1920                     "__kmp_fork_call: unknown fork_context parameter");
1921       }
1922 
1923       KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1924       KMP_MB();
1925       return FALSE;
1926     }
1927 
    // GEH: only modify the executing flag in the case when not serialized;
    //      the serialized case is handled in __kmpc_serialized_parallel
1930     KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1931                   "curtask=%p, curtask_max_aclevel=%d\n",
1932                   parent_team->t.t_active_level, master_th,
1933                   master_th->th.th_current_task,
1934                   master_th->th.th_current_task->td_icvs.max_active_levels));
1935     // TODO: GEH - cannot do this assertion because root thread not set up as
1936     // executing
1937     // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1938     master_th->th.th_current_task->td_flags.executing = 0;
1939 
1940 #if OMP_40_ENABLED
1941     if (!master_th->th.th_teams_microtask || level > teams_level)
1942 #endif /* OMP_40_ENABLED */
1943     {
1944       /* Increment our nested depth level */
1945       KMP_TEST_THEN_INC32((kmp_int32 *)&root->r.r_in_parallel);
1946     }
1947 
1948     // See if we need to make a copy of the ICVs.
1949     int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1950     if ((level + 1 < __kmp_nested_nth.used) &&
1951         (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1952       nthreads_icv = __kmp_nested_nth.nth[level + 1];
1953     } else {
1954       nthreads_icv = 0; // don't update
1955     }
1956 
1957 #if OMP_40_ENABLED
1958     // Figure out the proc_bind_policy for the new team.
1959     kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1960     kmp_proc_bind_t proc_bind_icv =
1961         proc_bind_default; // proc_bind_default means don't update
1962     if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1963       proc_bind = proc_bind_false;
1964     } else {
1965       if (proc_bind == proc_bind_default) {
1966         // No proc_bind clause specified; use current proc-bind-var for this
1967         // parallel region
1968         proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1969       }
1970       /* else: The proc_bind policy was specified explicitly on parallel clause.
1971          This overrides proc-bind-var for this parallel region, but does not
1972          change proc-bind-var. */
1973       // Figure the value of proc-bind-var for the child threads.
1974       if ((level + 1 < __kmp_nested_proc_bind.used) &&
1975           (__kmp_nested_proc_bind.bind_types[level + 1] !=
1976            master_th->th.th_current_task->td_icvs.proc_bind)) {
1977         proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1978       }
1979     }
1980 
1981     // Reset for next parallel region
1982     master_th->th.th_set_proc_bind = proc_bind_default;
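    // Illustrative example (assuming OMP_PROC_BIND="spread,close"): with no
    // proc_bind clause this region inherits proc-bind-var (spread), while an
    // explicit proc_bind(close) clause overrides it for this region only;
    // proc_bind_icv carries the children's proc-bind-var taken from
    // __kmp_nested_proc_bind when it differs from the parent's.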
1983 #endif /* OMP_40_ENABLED */
1984 
1985     if ((nthreads_icv > 0)
1986 #if OMP_40_ENABLED
1987         || (proc_bind_icv != proc_bind_default)
1988 #endif /* OMP_40_ENABLED */
1989             ) {
1990       kmp_internal_control_t new_icvs;
1991       copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
1992       new_icvs.next = NULL;
1993       if (nthreads_icv > 0) {
1994         new_icvs.nproc = nthreads_icv;
1995       }
1996 
1997 #if OMP_40_ENABLED
1998       if (proc_bind_icv != proc_bind_default) {
1999         new_icvs.proc_bind = proc_bind_icv;
2000       }
2001 #endif /* OMP_40_ENABLED */
2002 
2003       /* allocate a new parallel team */
2004       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2005       team = __kmp_allocate_team(root, nthreads, nthreads,
2006 #if OMPT_SUPPORT
2007                                  ompt_parallel_id,
2008 #endif
2009 #if OMP_40_ENABLED
2010                                  proc_bind,
2011 #endif
2012                                  &new_icvs, argc USE_NESTED_HOT_ARG(master_th));
2013     } else {
2014       /* allocate a new parallel team */
2015       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2016       team = __kmp_allocate_team(root, nthreads, nthreads,
2017 #if OMPT_SUPPORT
2018                                  ompt_parallel_id,
2019 #endif
2020 #if OMP_40_ENABLED
2021                                  proc_bind,
2022 #endif
2023                                  &master_th->th.th_current_task->td_icvs,
2024                                  argc USE_NESTED_HOT_ARG(master_th));
2025     }
2026     KF_TRACE(
2027         10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2028 
2029     /* setup the new team */
2030     KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2031     KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2032     KMP_CHECK_UPDATE(team->t.t_ident, loc);
2033     KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2034     KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2035 #if OMPT_SUPPORT
2036     KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.microtask, unwrapped_task);
2037 #endif
2038     KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2039 // TODO: parent_team->t.t_level == INT_MAX ???
2040 #if OMP_40_ENABLED
2041     if (!master_th->th.th_teams_microtask || level > teams_level) {
2042 #endif /* OMP_40_ENABLED */
2043       int new_level = parent_team->t.t_level + 1;
2044       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2045       new_level = parent_team->t.t_active_level + 1;
2046       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2047 #if OMP_40_ENABLED
2048     } else {
2049       // AC: Do not increase parallel level at start of the teams construct
2050       int new_level = parent_team->t.t_level;
2051       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2052       new_level = parent_team->t.t_active_level;
2053       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2054     }
2055 #endif /* OMP_40_ENABLED */
2056     kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2057     if (team->t.t_sched.r_sched_type != new_sched.r_sched_type ||
2058         team->t.t_sched.chunk != new_sched.chunk)
2059       team->t.t_sched =
2060           new_sched; // set master's schedule as new run-time schedule
2061 
2062 #if OMP_40_ENABLED
2063     KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2064 #endif
2065 
2066     // Update the floating point rounding in the team if required.
2067     propagateFPControl(team);
2068 
2069     if (__kmp_tasking_mode != tskm_immediate_exec) {
      // Set the master's task team to the team's task team. Unless this is a
      // hot team, it should be NULL.
2072       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2073                        parent_team->t.t_task_team[master_th->th.th_task_state]);
2074       KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
2075                     "%p, new task_team %p / team %p\n",
2076                     __kmp_gtid_from_thread(master_th),
2077                     master_th->th.th_task_team, parent_team,
2078                     team->t.t_task_team[master_th->th.th_task_state], team));
2079 
2080       if (active_level || master_th->th.th_task_team) {
2081         // Take a memo of master's task_state
2082         KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2083         if (master_th->th.th_task_state_top >=
2084             master_th->th.th_task_state_stack_sz) { // increase size
2085           kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2086           kmp_uint8 *old_stack, *new_stack;
2087           kmp_uint32 i;
2088           new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2089           for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2090             new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2091           }
2092           for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2093                ++i) { // zero-init rest of stack
2094             new_stack[i] = 0;
2095           }
2096           old_stack = master_th->th.th_task_state_memo_stack;
2097           master_th->th.th_task_state_memo_stack = new_stack;
2098           master_th->th.th_task_state_stack_sz = new_size;
2099           __kmp_free(old_stack);
2100         }
2101         // Store master's task_state on stack
2102         master_th->th
2103             .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2104             master_th->th.th_task_state;
2105         master_th->th.th_task_state_top++;
2106 #if KMP_NESTED_HOT_TEAMS
2107         if (team == master_th->th.th_hot_teams[active_level].hot_team) {
          // Restore the master's nested state if this is a nested hot team
2109           master_th->th.th_task_state =
2110               master_th->th
2111                   .th_task_state_memo_stack[master_th->th.th_task_state_top];
2112         } else {
2113 #endif
2114           master_th->th.th_task_state = 0;
2115 #if KMP_NESTED_HOT_TEAMS
2116         }
2117 #endif
2118       }
2119 #if !KMP_NESTED_HOT_TEAMS
2120       KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2121                        (team == root->r.r_hot_team));
2122 #endif
2123     }
2124 
2125     KA_TRACE(
2126         20,
2127         ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2128          gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2129          team->t.t_nproc));
2130     KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2131                      (team->t.t_master_tid == 0 &&
2132                       (team->t.t_parent == root->r.r_root_team ||
2133                        team->t.t_parent->t.t_serialized)));
2134     KMP_MB();
2135 
2136     /* now, setup the arguments */
2137     argv = (void **)team->t.t_argv;
2138 #if OMP_40_ENABLED
2139     if (ap) {
2140 #endif /* OMP_40_ENABLED */
2141       for (i = argc - 1; i >= 0; --i) {
2142 // TODO: revert workaround for Intel(R) 64 tracker #96
2143 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
2144         void *new_argv = va_arg(*ap, void *);
2145 #else
2146       void *new_argv = va_arg(ap, void *);
2147 #endif
2148         KMP_CHECK_UPDATE(*argv, new_argv);
2149         argv++;
2150       }
2151 #if OMP_40_ENABLED
2152     } else {
2153       for (i = 0; i < argc; ++i) {
2154         // Get args from parent team for teams construct
2155         KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2156       }
2157     }
2158 #endif /* OMP_40_ENABLED */
2159 
2160     /* now actually fork the threads */
2161     KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2162     if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2163       root->r.r_active = TRUE;
2164 
2165     __kmp_fork_team_threads(root, team, master_th, gtid);
2166     __kmp_setup_icv_copy(team, nthreads,
2167                          &master_th->th.th_current_task->td_icvs, loc);
2168 
2169 #if OMPT_SUPPORT
2170     master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2171 #endif
2172 
2173     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2174 
2175 #if USE_ITT_BUILD
2176     if (team->t.t_active_level == 1 // only report frames at level 1
2177 #if OMP_40_ENABLED
2178         && !master_th->th.th_teams_microtask // not in teams construct
2179 #endif /* OMP_40_ENABLED */
2180         ) {
2181 #if USE_ITT_NOTIFY
2182       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2183           (__kmp_forkjoin_frames_mode == 3 ||
2184            __kmp_forkjoin_frames_mode == 1)) {
2185         kmp_uint64 tmp_time = 0;
2186         if (__itt_get_timestamp_ptr)
2187           tmp_time = __itt_get_timestamp();
2188         // Internal fork - report frame begin
2189         master_th->th.th_frame_time = tmp_time;
2190         if (__kmp_forkjoin_frames_mode == 3)
2191           team->t.t_region_time = tmp_time;
2192       } else
2193 // only one notification scheme (either "submit" or "forking/joined", not both)
2194 #endif /* USE_ITT_NOTIFY */
2195           if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2196               __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2197         // Mark start of "parallel" region for VTune.
2198         __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2199       }
2200     }
2201 #endif /* USE_ITT_BUILD */
2202 
2203     /* now go on and do the work */
2204     KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2205     KMP_MB();
2206     KF_TRACE(10,
2207              ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2208               root, team, master_th, gtid));
2209 
2210 #if USE_ITT_BUILD
2211     if (__itt_stack_caller_create_ptr) {
2212       team->t.t_stack_id =
2213           __kmp_itt_stack_caller_create(); // create new stack stitching id
2214       // before entering fork barrier
2215     }
2216 #endif /* USE_ITT_BUILD */
2217 
2218 #if OMP_40_ENABLED
2219     // AC: skip __kmp_internal_fork at teams construct, let only master
2220     // threads execute
2221     if (ap)
2222 #endif /* OMP_40_ENABLED */
2223     {
2224       __kmp_internal_fork(loc, gtid, team);
2225       KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2226                     "master_th=%p, gtid=%d\n",
2227                     root, team, master_th, gtid));
2228     }
2229 
2230     if (call_context == fork_context_gnu) {
2231       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2232       return TRUE;
2233     }
2234 
2235     /* Invoke microtask for MASTER thread */
2236     KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2237                   team->t.t_id, team->t.t_pkfn));
2238   } // END of timer KMP_fork_call block
2239 
2240   {
2241     KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
2242     KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
2243     if (!team->t.t_invoke(gtid)) {
2244       KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
2245     }
2246   }
2247   KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2248                 team->t.t_id, team->t.t_pkfn));
2249   KMP_MB(); /* Flush all pending memory write invalidates.  */
2250 
2251   KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2252 
2253 #if OMPT_SUPPORT
2254   if (ompt_enabled) {
2255     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2256   }
2257 #endif
2258 
2259   return TRUE;
2260 }
2261 
2262 #if OMPT_SUPPORT
2263 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2264                                             kmp_team_t *team) {
2265   // restore state outside the region
2266   thread->th.ompt_thread_info.state =
2267       ((team->t.t_serialized) ? ompt_state_work_serial
2268                               : ompt_state_work_parallel);
2269 }
2270 
2271 static inline void __kmp_join_ompt(kmp_info_t *thread, kmp_team_t *team,
2272                                    ompt_parallel_id_t parallel_id,
2273                                    fork_context_e fork_context) {
2274   ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
2275   if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
2276     ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
2277         parallel_id, task_info->task_id, OMPT_INVOKER(fork_context));
2278   }
2279 
2280   task_info->frame.reenter_runtime_frame = NULL;
2281   __kmp_join_restore_state(thread, team);
2282 }
2283 #endif
2284 
2285 void __kmp_join_call(ident_t *loc, int gtid
2286 #if OMPT_SUPPORT
2287                      ,
2288                      enum fork_context_e fork_context
2289 #endif
2290 #if OMP_40_ENABLED
2291                      ,
2292                      int exit_teams
2293 #endif /* OMP_40_ENABLED */
2294                      ) {
2295   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2296   kmp_team_t *team;
2297   kmp_team_t *parent_team;
2298   kmp_info_t *master_th;
2299   kmp_root_t *root;
2300   int master_active;
2301   int i;
2302 
2303   KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2304 
2305   /* setup current data */
2306   master_th = __kmp_threads[gtid];
2307   root = master_th->th.th_root;
2308   team = master_th->th.th_team;
2309   parent_team = team->t.t_parent;
2310 
2311   master_th->th.th_ident = loc;
2312 
2313 #if OMPT_SUPPORT
2314   if (ompt_enabled) {
2315     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2316   }
2317 #endif
2318 
2319 #if KMP_DEBUG
2320   if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2321     KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2322                   "th_task_team = %p\n",
2323                   __kmp_gtid_from_thread(master_th), team,
2324                   team->t.t_task_team[master_th->th.th_task_state],
2325                   master_th->th.th_task_team));
2326     KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2327                      team->t.t_task_team[master_th->th.th_task_state]);
2328   }
2329 #endif
2330 
2331   if (team->t.t_serialized) {
2332 #if OMP_40_ENABLED
2333     if (master_th->th.th_teams_microtask) {
2334       // We are in teams construct
2335       int level = team->t.t_level;
2336       int tlevel = master_th->th.th_teams_level;
2337       if (level == tlevel) {
2338         // AC: we haven't incremented it earlier at start of teams construct,
2339         //     so do it here - at the end of teams construct
2340         team->t.t_level++;
2341       } else if (level == tlevel + 1) {
2342         // AC: we are exiting parallel inside teams, need to increment
2343         // serialization in order to restore it in the next call to
2344         // __kmpc_end_serialized_parallel
2345         team->t.t_serialized++;
2346       }
2347     }
2348 #endif /* OMP_40_ENABLED */
2349     __kmpc_end_serialized_parallel(loc, gtid);
2350 
2351 #if OMPT_SUPPORT
2352     if (ompt_enabled) {
2353       __kmp_join_restore_state(master_th, parent_team);
2354     }
2355 #endif
2356 
2357     return;
2358   }
2359 
2360   master_active = team->t.t_master_active;
2361 
2362 #if OMP_40_ENABLED
2363   if (!exit_teams)
2364 #endif /* OMP_40_ENABLED */
2365   {
    // AC: No barrier for internal teams at exit from the teams construct.
    //     But there is a barrier for the external team (league).
2368     __kmp_internal_join(loc, gtid, team);
2369   }
2370 #if OMP_40_ENABLED
2371   else {
    master_th->th.th_task_state =
        0; // AC: no tasking in teams (outside of any parallel)
2374   }
2375 #endif /* OMP_40_ENABLED */
2376 
2377   KMP_MB();
2378 
2379 #if OMPT_SUPPORT
2380   ompt_parallel_id_t parallel_id = team->t.ompt_team_info.parallel_id;
2381 #endif
2382 
2383 #if USE_ITT_BUILD
2384   if (__itt_stack_caller_create_ptr) {
2385     __kmp_itt_stack_caller_destroy(
2386         (__itt_caller)team->t
2387             .t_stack_id); // destroy the stack stitching id after join barrier
2388   }
2389 
2390   // Mark end of "parallel" region for VTune.
2391   if (team->t.t_active_level == 1
2392 #if OMP_40_ENABLED
2393       && !master_th->th.th_teams_microtask /* not in teams construct */
2394 #endif /* OMP_40_ENABLED */
2395       ) {
2396     master_th->th.th_ident = loc;
2397     // only one notification scheme (either "submit" or "forking/joined", not
2398     // both)
2399     if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2400         __kmp_forkjoin_frames_mode == 3)
2401       __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2402                              master_th->th.th_frame_time, 0, loc,
2403                              master_th->th.th_team_nproc, 1);
2404     else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2405              !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2406       __kmp_itt_region_joined(gtid);
2407   } // active_level == 1
2408 #endif /* USE_ITT_BUILD */
2409 
2410 #if OMP_40_ENABLED
2411   if (master_th->th.th_teams_microtask && !exit_teams &&
2412       team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2413       team->t.t_level == master_th->th.th_teams_level + 1) {
    // AC: We need to leave the team structure intact at the end of a parallel
    // inside the teams construct, so that the same (hot) team is reused by the
    // next parallel; only adjust the nesting levels here.
2417 
2418     /* Decrement our nested depth level */
2419     team->t.t_level--;
2420     team->t.t_active_level--;
2421     KMP_TEST_THEN_DEC32((kmp_int32 *)&root->r.r_in_parallel);
2422 
2423     /* Restore number of threads in the team if needed */
2424     if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2425       int old_num = master_th->th.th_team_nproc;
2426       int new_num = master_th->th.th_teams_size.nth;
2427       kmp_info_t **other_threads = team->t.t_threads;
2428       team->t.t_nproc = new_num;
2429       for (i = 0; i < old_num; ++i) {
2430         other_threads[i]->th.th_team_nproc = new_num;
2431       }
      // Adjust the states of the unused threads of the team
2433       for (i = old_num; i < new_num; ++i) {
2434         // Re-initialize thread's barrier data.
2435         int b;
2436         kmp_balign_t *balign = other_threads[i]->th.th_bar;
2437         for (b = 0; b < bs_last_barrier; ++b) {
2438           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2439           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2440 #if USE_DEBUGGER
2441           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2442 #endif
2443         }
2444         if (__kmp_tasking_mode != tskm_immediate_exec) {
2445           // Synchronize thread's task state
2446           other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2447         }
2448       }
2449     }
2450 
2451 #if OMPT_SUPPORT
2452     if (ompt_enabled) {
2453       __kmp_join_ompt(master_th, parent_team, parallel_id, fork_context);
2454     }
2455 #endif
2456 
2457     return;
2458   }
2459 #endif /* OMP_40_ENABLED */
2460 
2461   /* do cleanup and restore the parent team */
2462   master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2463   master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2464 
2465   master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2466 
2467   /* jc: The following lock has instructions with REL and ACQ semantics,
2468      separating the parallel user code called in this parallel region
2469      from the serial user code called after this function returns. */
2470   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2471 
2472 #if OMP_40_ENABLED
2473   if (!master_th->th.th_teams_microtask ||
2474       team->t.t_level > master_th->th.th_teams_level)
2475 #endif /* OMP_40_ENABLED */
2476   {
2477     /* Decrement our nested depth level */
2478     KMP_TEST_THEN_DEC32((kmp_int32 *)&root->r.r_in_parallel);
2479   }
2480   KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2481 
2482 #if OMPT_SUPPORT && OMPT_TRACE
2483   if (ompt_enabled) {
2484     ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
2485     if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
2486       ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
2487           parallel_id, task_info->task_id);
2488     }
2489     task_info->frame.exit_runtime_frame = NULL;
2490     task_info->task_id = 0;
2491   }
2492 #endif
2493 
2494   KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2495                 master_th, team));
2496   __kmp_pop_current_task_from_thread(master_th);
2497 
2498 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
2499   // Restore master thread's partition.
2500   master_th->th.th_first_place = team->t.t_first_place;
2501   master_th->th.th_last_place = team->t.t_last_place;
2502 #endif /* OMP_40_ENABLED */
2503 
2504   updateHWFPControl(team);
2505 
2506   if (root->r.r_active != master_active)
2507     root->r.r_active = master_active;
2508 
2509   __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2510                             master_th)); // this will free worker threads
2511 
  /* This race was fun to find. Make sure the following is inside the critical
     region, otherwise assertions may fail occasionally because the old team
     may be reallocated and the hierarchy appear inconsistent. It is actually
     safe to run and won't cause any bugs, just those assertion failures. It's
     only one deref & assign, so we might as well keep it in the critical
     region. */
2517   master_th->th.th_team = parent_team;
2518   master_th->th.th_team_nproc = parent_team->t.t_nproc;
2519   master_th->th.th_team_master = parent_team->t.t_threads[0];
2520   master_th->th.th_team_serialized = parent_team->t.t_serialized;
2521 
2522   /* restore serialized team, if need be */
2523   if (parent_team->t.t_serialized &&
2524       parent_team != master_th->th.th_serial_team &&
2525       parent_team != root->r.r_root_team) {
2526     __kmp_free_team(root,
2527                     master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2528     master_th->th.th_serial_team = parent_team;
2529   }
2530 
2531   if (__kmp_tasking_mode != tskm_immediate_exec) {
2532     if (master_th->th.th_task_state_top >
2533         0) { // Restore task state from memo stack
2534       KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2535       // Remember master's state if we re-use this nested hot team
2536       master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2537           master_th->th.th_task_state;
2538       --master_th->th.th_task_state_top; // pop
2539       // Now restore state at this level
2540       master_th->th.th_task_state =
2541           master_th->th
2542               .th_task_state_memo_stack[master_th->th.th_task_state_top];
2543     }
2544     // Copy the task team from the parent team to the master thread
2545     master_th->th.th_task_team =
2546         parent_team->t.t_task_team[master_th->th.th_task_state];
2547     KA_TRACE(20,
2548              ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2549               __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2550               parent_team));
2551   }
2552 
2553   // TODO: GEH - cannot do this assertion because root thread not set up as
2554   // executing
2555   // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2556   master_th->th.th_current_task->td_flags.executing = 1;
2557 
2558   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2559 
2560 #if OMPT_SUPPORT
2561   if (ompt_enabled) {
2562     __kmp_join_ompt(master_th, parent_team, parallel_id, fork_context);
2563   }
2564 #endif
2565 
2566   KMP_MB();
2567   KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2568 }
2569 
2570 /* Check whether we should push an internal control record onto the
2571    serial team stack.  If so, do it.  */
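/* Records pushed here are popped, and their ICVs restored, when the matching
   serialized nesting level ends (see __kmpc_end_serialized_parallel). */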
2572 void __kmp_save_internal_controls(kmp_info_t *thread) {
2573 
2574   if (thread->th.th_team != thread->th.th_serial_team) {
2575     return;
2576   }
2577   if (thread->th.th_team->t.t_serialized > 1) {
2578     int push = 0;
2579 
2580     if (thread->th.th_team->t.t_control_stack_top == NULL) {
2581       push = 1;
2582     } else {
2583       if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2584           thread->th.th_team->t.t_serialized) {
2585         push = 1;
2586       }
2587     }
2588     if (push) { /* push a record on the serial team's stack */
2589       kmp_internal_control_t *control =
2590           (kmp_internal_control_t *)__kmp_allocate(
2591               sizeof(kmp_internal_control_t));
2592 
2593       copy_icvs(control, &thread->th.th_current_task->td_icvs);
2594 
2595       control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2596 
2597       control->next = thread->th.th_team->t.t_control_stack_top;
2598       thread->th.th_team->t.t_control_stack_top = control;
2599     }
2600   }
2601 }
2602 
2603 /* Changes set_nproc */
2604 void __kmp_set_num_threads(int new_nth, int gtid) {
2605   kmp_info_t *thread;
2606   kmp_root_t *root;
2607 
2608   KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2609   KMP_DEBUG_ASSERT(__kmp_init_serial);
2610 
2611   if (new_nth < 1)
2612     new_nth = 1;
2613   else if (new_nth > __kmp_max_nth)
2614     new_nth = __kmp_max_nth;
2615 
2616   KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2617   thread = __kmp_threads[gtid];
2618 
2619   __kmp_save_internal_controls(thread);
2620 
2621   set__nproc(thread, new_nth);
2622 
2623   // If this omp_set_num_threads() call will cause the hot team size to be
2624   // reduced (in the absence of a num_threads clause), then reduce it now,
2625   // rather than waiting for the next parallel region.
2626   root = thread->th.th_root;
2627   if (__kmp_init_parallel && (!root->r.r_active) &&
2628       (root->r.r_hot_team->t.t_nproc > new_nth)
2629 #if KMP_NESTED_HOT_TEAMS
2630       && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2631 #endif
2632       ) {
2633     kmp_team_t *hot_team = root->r.r_hot_team;
2634     int f;
2635 
2636     __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2637 
2638     // Release the extra threads we don't need any more.
2639     for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2640       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2641       if (__kmp_tasking_mode != tskm_immediate_exec) {
        // When decreasing the team size, threads no longer in the team should
        // unreference the task team.
2644         hot_team->t.t_threads[f]->th.th_task_team = NULL;
2645       }
2646       __kmp_free_thread(hot_team->t.t_threads[f]);
2647       hot_team->t.t_threads[f] = NULL;
2648     }
2649     hot_team->t.t_nproc = new_nth;
2650 #if KMP_NESTED_HOT_TEAMS
2651     if (thread->th.th_hot_teams) {
2652       KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2653       thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2654     }
2655 #endif
2656 
2657     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2658 
2659     // Update the t_nproc field in the threads that are still active.
2660     for (f = 0; f < new_nth; f++) {
2661       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2662       hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2663     }
    // Special flag to indicate that the team size was changed by an
    // omp_set_num_threads() call
2665     hot_team->t.t_size_changed = -1;
2666   }
2667 }
2668 
2669 /* Changes max_active_levels */
2670 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2671   kmp_info_t *thread;
2672 
2673   KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2674                 "%d = (%d)\n",
2675                 gtid, max_active_levels));
2676   KMP_DEBUG_ASSERT(__kmp_init_serial);
2677 
2678   // validate max_active_levels
2679   if (max_active_levels < 0) {
2680     KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2681     // We ignore this call if the user has specified a negative value.
2682     // The current setting won't be changed. The last valid setting will be
2683     // used. A warning will be issued (if warnings are allowed as controlled by
2684     // the KMP_WARNINGS env var).
2685     KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2686                   "max_active_levels for thread %d = (%d)\n",
2687                   gtid, max_active_levels));
2688     return;
2689   }
2690   if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
    // OK: max_active_levels is within the valid range
    // [0; KMP_MAX_ACTIVE_LEVELS_LIMIT].
    // A zero value is allowed. (implementation-defined behavior)
2694   } else {
2695     KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2696                 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2697     max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
    // The current upper limit is MAX_INT. (implementation-defined behavior)
    // If the input exceeds the upper limit, we correct it to the upper limit.
    // (implementation-defined behavior)
    // In practice, control never reaches this branch while the limit is MAX_INT.
2702   }
2703   KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2704                 "max_active_levels for thread %d = (%d)\n",
2705                 gtid, max_active_levels));
2706 
2707   thread = __kmp_threads[gtid];
2708 
2709   __kmp_save_internal_controls(thread);
2710 
2711   set__max_active_levels(thread, max_active_levels);
2712 }
2713 
2714 /* Gets max_active_levels */
2715 int __kmp_get_max_active_levels(int gtid) {
2716   kmp_info_t *thread;
2717 
2718   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2719   KMP_DEBUG_ASSERT(__kmp_init_serial);
2720 
2721   thread = __kmp_threads[gtid];
2722   KMP_DEBUG_ASSERT(thread->th.th_current_task);
2723   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2724                 "curtask_maxaclevel=%d\n",
2725                 gtid, thread->th.th_current_task,
2726                 thread->th.th_current_task->td_icvs.max_active_levels));
2727   return thread->th.th_current_task->td_icvs.max_active_levels;
2728 }
2729 
2730 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2731 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2732   kmp_info_t *thread;
2733   //    kmp_team_t *team;
2734 
2735   KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2736                 gtid, (int)kind, chunk));
2737   KMP_DEBUG_ASSERT(__kmp_init_serial);
2738 
2739   // Check if the kind parameter is valid, correct if needed.
2740   // Valid parameters should fit in one of two intervals - standard or extended:
2741   //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2742   // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
2743   if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2744       (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2745     // TODO: Hint needs attention in case we change the default schedule.
2746     __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2747               KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2748               __kmp_msg_null);
2749     kind = kmp_sched_default;
2750     chunk = 0; // ignore chunk value in case of bad kind
2751   }
2752 
2753   thread = __kmp_threads[gtid];
2754 
2755   __kmp_save_internal_controls(thread);
2756 
2757   if (kind < kmp_sched_upper_std) {
2758     if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
      // differentiate static chunked vs. unchunked: the chunk should be
      // invalid to indicate an unchunked schedule (which is the default)
2761       thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2762     } else {
2763       thread->th.th_current_task->td_icvs.sched.r_sched_type =
2764           __kmp_sch_map[kind - kmp_sched_lower - 1];
2765     }
2766   } else {
2767     //    __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2768     //    kmp_sched_lower - 2 ];
2769     thread->th.th_current_task->td_icvs.sched.r_sched_type =
2770         __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2771                       kmp_sched_lower - 2];
2772   }
2773   if (kind == kmp_sched_auto || chunk < 1) {
2774     // ignore parameter chunk for schedule auto
2775     thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2776   } else {
2777     thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2778   }
2779 }
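/* Usage sketch (illustrative only, not compiled into the library): the
   standard omp_set_schedule() entry point is expected to land in the routine
   above. Note how the validation treats a chunk value below KMP_DEFAULT_CHUNK
   for a static kind as a request for the unchunked (default) static schedule,
   and how the chunk argument is ignored for the auto kind. */
#if 0
#include <omp.h>

static void demo_set_schedule(void) {
  omp_set_schedule(omp_sched_dynamic, 8); // run-time schedule: dynamic, chunk 8
  omp_set_schedule(omp_sched_static, 0);  // chunk < 1: plain (unchunked) static
  omp_set_schedule(omp_sched_auto, 16);   // chunk is ignored for auto
}
#endif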
2780 
2781 /* Gets def_sched_var ICV values */
2782 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2783   kmp_info_t *thread;
2784   enum sched_type th_type;
2785 
2786   KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2787   KMP_DEBUG_ASSERT(__kmp_init_serial);
2788 
2789   thread = __kmp_threads[gtid];
2790 
2791   th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2792 
2793   switch (th_type) {
2794   case kmp_sch_static:
2795   case kmp_sch_static_greedy:
2796   case kmp_sch_static_balanced:
2797     *kind = kmp_sched_static;
2798     *chunk = 0; // chunk was not set; indicate this fact via a zero value
2799     return;
2800   case kmp_sch_static_chunked:
2801     *kind = kmp_sched_static;
2802     break;
2803   case kmp_sch_dynamic_chunked:
2804     *kind = kmp_sched_dynamic;
2805     break;
2806   case kmp_sch_guided_chunked:
2807   case kmp_sch_guided_iterative_chunked:
2808   case kmp_sch_guided_analytical_chunked:
2809     *kind = kmp_sched_guided;
2810     break;
2811   case kmp_sch_auto:
2812     *kind = kmp_sched_auto;
2813     break;
2814   case kmp_sch_trapezoidal:
2815     *kind = kmp_sched_trapezoidal;
2816     break;
2817 #if KMP_STATIC_STEAL_ENABLED
2818   case kmp_sch_static_steal:
2819     *kind = kmp_sched_static_steal;
2820     break;
2821 #endif
2822   default:
2823     KMP_FATAL(UnknownSchedulingType, th_type);
2824   }
2825 
2826   *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2827 }
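/* Usage sketch (illustrative only, not compiled into the library): the query
   above reports kmp_sched_static with a zero chunk whenever the stored
   schedule is one of the unchunked static variants; through the standard API
   the same convention is typically visible to user code. */
#if 0
#include <omp.h>
#include <cstdio>

static void demo_get_schedule(void) {
  omp_sched_t kind;
  int chunk;
  omp_set_schedule(omp_sched_static, 0); // request an unchunked static schedule
  omp_get_schedule(&kind, &chunk);       // expected: static kind, chunk == 0
  std::printf("kind=%d chunk=%d\n", (int)kind, chunk);
}
#endif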
2828 
2829 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2830 
2831   int ii, dd;
2832   kmp_team_t *team;
2833   kmp_info_t *thr;
2834 
2835   KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2836   KMP_DEBUG_ASSERT(__kmp_init_serial);
2837 
2838   // validate level
2839   if (level == 0)
2840     return 0;
2841   if (level < 0)
2842     return -1;
2843   thr = __kmp_threads[gtid];
2844   team = thr->th.th_team;
2845   ii = team->t.t_level;
2846   if (level > ii)
2847     return -1;
2848 
2849 #if OMP_40_ENABLED
2850   if (thr->th.th_teams_microtask) {
2851     // AC: we are in a teams region where multiple nested teams have the same level
2852     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2853     if (level <=
2854         tlevel) { // otherwise the usual algorithm works (will not touch the teams)
2855       KMP_DEBUG_ASSERT(ii >= tlevel);
2856       // AC: Since we need to step over the teams construct, we artificially
2857       // increase ii
2858       if (ii == tlevel) {
2859         ii += 2; // three teams have same level
2860       } else {
2861         ii++; // two teams have same level
2862       }
2863     }
2864   }
2865 #endif
2866 
2867   if (ii == level)
2868     return __kmp_tid_from_gtid(gtid);
2869 
2870   dd = team->t.t_serialized;
2871   level++;
2872   while (ii > level) {
2873     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2874     }
2875     if ((team->t.t_serialized) && (!dd)) {
2876       team = team->t.t_parent;
2877       continue;
2878     }
2879     if (ii > level) {
2880       team = team->t.t_parent;
2881       dd = team->t.t_serialized;
2882       ii--;
2883     }
2884   }
2885 
2886   return (dd > 1) ? (0) : (team->t.t_master_tid);
2887 }
2888 
2889 int __kmp_get_team_size(int gtid, int level) {
2890 
2891   int ii, dd;
2892   kmp_team_t *team;
2893   kmp_info_t *thr;
2894 
2895   KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2896   KMP_DEBUG_ASSERT(__kmp_init_serial);
2897 
2898   // validate level
2899   if (level == 0)
2900     return 1;
2901   if (level < 0)
2902     return -1;
2903   thr = __kmp_threads[gtid];
2904   team = thr->th.th_team;
2905   ii = team->t.t_level;
2906   if (level > ii)
2907     return -1;
2908 
2909 #if OMP_40_ENABLED
2910   if (thr->th.th_teams_microtask) {
2911     // AC: we are in a teams region where multiple nested teams have the same level
2912     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2913     if (level <=
2914         tlevel) { // otherwise the usual algorithm works (will not touch the teams)
2915       KMP_DEBUG_ASSERT(ii >= tlevel);
2916       // AC: Since we need to step over the teams construct, we artificially
2917       // increase ii
2918       if (ii == tlevel) {
2919         ii += 2; // three teams have same level
2920       } else {
2921         ii++; // two teams have same level
2922       }
2923     }
2924   }
2925 #endif
2926 
2927   while (ii > level) {
2928     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2929     }
2930     if (team->t.t_serialized && (!dd)) {
2931       team = team->t.t_parent;
2932       continue;
2933     }
2934     if (ii > level) {
2935       team = team->t.t_parent;
2936       ii--;
2937     }
2938   }
2939 
2940   return team->t.t_nproc;
2941 }
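/* Usage sketch (illustrative only, not compiled into the library): the two
   walks above are what the standard omp_get_ancestor_thread_num() and
   omp_get_team_size() queries are expected to rely on in this runtime.
   Level 0 denotes the implicit outermost team of a single thread; the
   innermost enclosing parallel region is at omp_get_level(). Nested
   parallelism must be enabled for the inner region to fork its own team. */
#if 0
#include <omp.h>
#include <cstdio>

static void demo_ancestor_queries(void) {
  omp_set_nested(1); // allow the inner parallel region to fork its own team
#pragma omp parallel num_threads(2)
  {
#pragma omp parallel num_threads(3)
    {
      std::printf("level-1 tid %d of %d, level-2 tid %d of %d\n",
                  omp_get_ancestor_thread_num(1), omp_get_team_size(1),
                  omp_get_ancestor_thread_num(2), omp_get_team_size(2));
    }
  }
}
#endif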
2942 
2943 kmp_r_sched_t __kmp_get_schedule_global() {
2944   // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
2945   // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
2946   // independently, so the up-to-date schedule can be obtained here.
2947 
2948   kmp_r_sched_t r_sched;
2949 
2950   // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
2951   // __kmp_guided. __kmp_sched should keep original value, so that user can set
2952   // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
2953   // different roots (even in OMP 2.5)
2954   if (__kmp_sched == kmp_sch_static) {
2955     r_sched.r_sched_type = __kmp_static; // replace STATIC with more detailed
2956     // schedule (balanced or greedy)
2957   } else if (__kmp_sched == kmp_sch_guided_chunked) {
2958     r_sched.r_sched_type = __kmp_guided; // replace GUIDED with more detailed
2959     // schedule (iterative or analytical)
2960   } else {
2961     r_sched.r_sched_type =
2962         __kmp_sched; // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2963   }
2964 
2965   if (__kmp_chunk < KMP_DEFAULT_CHUNK) { // __kmp_chunk may be wrong here (if it
2966     // was never set)
2967     r_sched.chunk = KMP_DEFAULT_CHUNK;
2968   } else {
2969     r_sched.chunk = __kmp_chunk;
2970   }
2971 
2972   return r_sched;
2973 }
2974 
2975 /* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
2976    at least argc number of *t_argv entries for the requested team. */
2977 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
2978 
2979   KMP_DEBUG_ASSERT(team);
2980   if (!realloc || argc > team->t.t_max_argc) {
2981 
2982     KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
2983                    "current entries=%d\n",
2984                    team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
2985     /* if previously allocated heap space for args, free them */
2986     if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
2987       __kmp_free((void *)team->t.t_argv);
2988 
2989     if (argc <= KMP_INLINE_ARGV_ENTRIES) {
2990       /* use unused space in the cache line for arguments */
2991       team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
2992       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
2993                      "argv entries\n",
2994                      team->t.t_id, team->t.t_max_argc));
2995       team->t.t_argv = &team->t.t_inline_argv[0];
2996       if (__kmp_storage_map) {
2997         __kmp_print_storage_map_gtid(
2998             -1, &team->t.t_inline_argv[0],
2999             &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3000             (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3001             team->t.t_id);
3002       }
3003     } else {
3004       /* allocate space for arguments in the heap */
3005       team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3006                                ? KMP_MIN_MALLOC_ARGV_ENTRIES
3007                                : 2 * argc;
3008       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3009                      "argv entries\n",
3010                      team->t.t_id, team->t.t_max_argc));
3011       team->t.t_argv =
3012           (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3013       if (__kmp_storage_map) {
3014         __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3015                                      &team->t.t_argv[team->t.t_max_argc],
3016                                      sizeof(void *) * team->t.t_max_argc,
3017                                      "team_%d.t_argv", team->t.t_id);
3018       }
3019     }
3020   }
3021 }
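/* Sizing policy of the routine above, restated as a stand-alone sketch
   (illustrative only, not compiled into the library; the parameters stand in
   for KMP_INLINE_ARGV_ENTRIES and KMP_MIN_MALLOC_ARGV_ENTRIES): small argument
   lists reuse the inline storage in the team structure, larger ones get a heap
   block of at least min_malloc_entries entries or twice the request. */
#if 0
static int demo_argv_capacity(int argc, int inline_entries,
                              int min_malloc_entries) {
  if (argc <= inline_entries)
    return inline_entries; // fits into the inline (cache-line) storage
  return (argc <= (min_malloc_entries >> 1)) ? min_malloc_entries
                                             : 2 * argc; // heap allocation
}
#endif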
3022 
3023 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3024   int i;
3025   int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3026   team->t.t_threads =
3027       (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3028   team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3029       sizeof(dispatch_shared_info_t) * num_disp_buff);
3030   team->t.t_dispatch =
3031       (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3032   team->t.t_implicit_task_taskdata =
3033       (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3034   team->t.t_max_nproc = max_nth;
3035 
3036   /* setup dispatch buffers */
3037   for (i = 0; i < num_disp_buff; ++i) {
3038     team->t.t_disp_buffer[i].buffer_index = i;
3039 #if OMP_45_ENABLED
3040     team->t.t_disp_buffer[i].doacross_buf_idx = i;
3041 #endif
3042   }
3043 }
3044 
3045 static void __kmp_free_team_arrays(kmp_team_t *team) {
3046   /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3047   int i;
3048   for (i = 0; i < team->t.t_max_nproc; ++i) {
3049     if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3050       __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3051       team->t.t_dispatch[i].th_disp_buffer = NULL;
3052     }
3053   }
3054   __kmp_free(team->t.t_threads);
3055   __kmp_free(team->t.t_disp_buffer);
3056   __kmp_free(team->t.t_dispatch);
3057   __kmp_free(team->t.t_implicit_task_taskdata);
3058   team->t.t_threads = NULL;
3059   team->t.t_disp_buffer = NULL;
3060   team->t.t_dispatch = NULL;
3061   team->t.t_implicit_task_taskdata = 0;
3062 }
3063 
3064 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3065   kmp_info_t **oldThreads = team->t.t_threads;
3066 
3067   __kmp_free(team->t.t_disp_buffer);
3068   __kmp_free(team->t.t_dispatch);
3069   __kmp_free(team->t.t_implicit_task_taskdata);
3070   __kmp_allocate_team_arrays(team, max_nth);
3071 
3072   KMP_MEMCPY(team->t.t_threads, oldThreads,
3073              team->t.t_nproc * sizeof(kmp_info_t *));
3074 
3075   __kmp_free(oldThreads);
3076 }
3077 
3078 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3079 
3080   kmp_r_sched_t r_sched =
3081       __kmp_get_schedule_global(); // get current state of scheduling globals
3082 
3083 #if OMP_40_ENABLED
3084   KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3085 #endif /* OMP_40_ENABLED */
3086 
3087   kmp_internal_control_t g_icvs = {
3088     0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3089     (kmp_int8)__kmp_dflt_nested, // int nested; //internal control
3090     // for nested parallelism (per thread)
3091     (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3092     // adjustment of threads (per thread)
3093     (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3094     // whether blocktime is explicitly set
3095     __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3096 #if KMP_USE_MONITOR
3097     __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3098 // intervals
3099 #endif
3100     __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3101     // next parallel region (per thread)
3102     // (use a max ub on value if __kmp_parallel_initialize not called yet)
3103     __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3104     // for max_active_levels
3105     r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3106 // {sched,chunk} pair
3107 #if OMP_40_ENABLED
3108     __kmp_nested_proc_bind.bind_types[0],
3109     __kmp_default_device,
3110 #endif /* OMP_40_ENABLED */
3111     NULL // struct kmp_internal_control *next;
3112   };
3113 
3114   return g_icvs;
3115 }
3116 
3117 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3118 
3119   kmp_internal_control_t gx_icvs;
3120   gx_icvs.serial_nesting_level =
3121       0; // probably =team->t.t_serial like in save_inter_controls
3122   copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3123   gx_icvs.next = NULL;
3124 
3125   return gx_icvs;
3126 }
3127 
3128 static void __kmp_initialize_root(kmp_root_t *root) {
3129   int f;
3130   kmp_team_t *root_team;
3131   kmp_team_t *hot_team;
3132   int hot_team_max_nth;
3133   kmp_r_sched_t r_sched =
3134       __kmp_get_schedule_global(); // get current state of scheduling globals
3135   kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3136   KMP_DEBUG_ASSERT(root);
3137   KMP_ASSERT(!root->r.r_begin);
3138 
3139   /* setup the root state structure */
3140   __kmp_init_lock(&root->r.r_begin_lock);
3141   root->r.r_begin = FALSE;
3142   root->r.r_active = FALSE;
3143   root->r.r_in_parallel = 0;
3144   root->r.r_blocktime = __kmp_dflt_blocktime;
3145   root->r.r_nested = __kmp_dflt_nested;
3146   root->r.r_cg_nthreads = 1;
3147 
3148   /* setup the root team for this task */
3149   /* allocate the root team structure */
3150   KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3151 
3152   root_team =
3153       __kmp_allocate_team(root,
3154                           1, // new_nproc
3155                           1, // max_nproc
3156 #if OMPT_SUPPORT
3157                           0, // root parallel id
3158 #endif
3159 #if OMP_40_ENABLED
3160                           __kmp_nested_proc_bind.bind_types[0],
3161 #endif
3162                           &r_icvs,
3163                           0 // argc
3164                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3165                           );
3166 #if USE_DEBUGGER
3167   // Non-NULL value should be assigned to make the debugger display the root
3168   // team.
3169   TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3170 #endif
3171 
3172   KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3173 
3174   root->r.r_root_team = root_team;
3175   root_team->t.t_control_stack_top = NULL;
3176 
3177   /* initialize root team */
3178   root_team->t.t_threads[0] = NULL;
3179   root_team->t.t_nproc = 1;
3180   root_team->t.t_serialized = 1;
3181   // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3182   root_team->t.t_sched.r_sched_type = r_sched.r_sched_type;
3183   root_team->t.t_sched.chunk = r_sched.chunk;
3184   KA_TRACE(
3185       20,
3186       ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3187        root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3188 
3189   /* setup the  hot team for this task */
3190   /* allocate the hot team structure */
3191   KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3192 
3193   hot_team =
3194       __kmp_allocate_team(root,
3195                           1, // new_nproc
3196                           __kmp_dflt_team_nth_ub * 2, // max_nproc
3197 #if OMPT_SUPPORT
3198                           0, // root parallel id
3199 #endif
3200 #if OMP_40_ENABLED
3201                           __kmp_nested_proc_bind.bind_types[0],
3202 #endif
3203                           &r_icvs,
3204                           0 // argc
3205                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3206                           );
3207   KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3208 
3209   root->r.r_hot_team = hot_team;
3210   root_team->t.t_control_stack_top = NULL;
3211 
3212   /* first-time initialization */
3213   hot_team->t.t_parent = root_team;
3214 
3215   /* initialize hot team */
3216   hot_team_max_nth = hot_team->t.t_max_nproc;
3217   for (f = 0; f < hot_team_max_nth; ++f) {
3218     hot_team->t.t_threads[f] = NULL;
3219   }
3220   hot_team->t.t_nproc = 1;
3221   // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3222   hot_team->t.t_sched.r_sched_type = r_sched.r_sched_type;
3223   hot_team->t.t_sched.chunk = r_sched.chunk;
3224   hot_team->t.t_size_changed = 0;
3225 }
3226 
3227 #ifdef KMP_DEBUG
3228 
3229 typedef struct kmp_team_list_item {
3230   kmp_team_p const *entry;
3231   struct kmp_team_list_item *next;
3232 } kmp_team_list_item_t;
3233 typedef kmp_team_list_item_t *kmp_team_list_t;
3234 
3235 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3236     kmp_team_list_t list, // List of teams.
3237     kmp_team_p const *team // Team to add.
3238     ) {
3239 
3240   // List must terminate with item where both entry and next are NULL.
3241   // Team is added to the list only once.
3242   // List is sorted in ascending order by team id.
3243   // Team id is *not* a key.
3244 
3245   kmp_team_list_t l;
3246 
3247   KMP_DEBUG_ASSERT(list != NULL);
3248   if (team == NULL) {
3249     return;
3250   }
3251 
3252   __kmp_print_structure_team_accum(list, team->t.t_parent);
3253   __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3254 
3255   // Search list for the team.
3256   l = list;
3257   while (l->next != NULL && l->entry != team) {
3258     l = l->next;
3259   }
3260   if (l->next != NULL) {
3261     return; // Team has been added before, exit.
3262   }
3263 
3264   // Team is not found. Search list again for insertion point.
3265   l = list;
3266   while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3267     l = l->next;
3268   }
3269 
3270   // Insert team.
3271   {
3272     kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3273         sizeof(kmp_team_list_item_t));
3274     *item = *l;
3275     l->entry = team;
3276     l->next = item;
3277   }
3278 }
3279 
3280 static void __kmp_print_structure_team(char const *title,
3281                                        kmp_team_p const *team) {
3283   __kmp_printf("%s", title);
3284   if (team != NULL) {
3285     __kmp_printf("%2x %p\n", team->t.t_id, team);
3286   } else {
3287     __kmp_printf(" - (nil)\n");
3288   }
3289 }
3290 
3291 static void __kmp_print_structure_thread(char const *title,
3292                                          kmp_info_p const *thread) {
3293   __kmp_printf("%s", title);
3294   if (thread != NULL) {
3295     __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3296   } else {
3297     __kmp_printf(" - (nil)\n");
3298   }
3299 }
3300 
3301 void __kmp_print_structure(void) {
3302 
3303   kmp_team_list_t list;
3304 
3305   // Initialize list of teams.
3306   list =
3307       (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3308   list->entry = NULL;
3309   list->next = NULL;
3310 
3311   __kmp_printf("\n------------------------------\nGlobal Thread "
3312                "Table\n------------------------------\n");
3313   {
3314     int gtid;
3315     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3316       __kmp_printf("%2d", gtid);
3317       if (__kmp_threads != NULL) {
3318         __kmp_printf(" %p", __kmp_threads[gtid]);
3319       }
3320       if (__kmp_root != NULL) {
3321         __kmp_printf(" %p", __kmp_root[gtid]);
3322       }
3323       __kmp_printf("\n");
3324     }
3325   }
3326 
3327   // Print out __kmp_threads array.
3328   __kmp_printf("\n------------------------------\nThreads\n--------------------"
3329                "----------\n");
3330   if (__kmp_threads != NULL) {
3331     int gtid;
3332     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3333       kmp_info_t const *thread = __kmp_threads[gtid];
3334       if (thread != NULL) {
3335         __kmp_printf("GTID %2d %p:\n", gtid, thread);
3336         __kmp_printf("    Our Root:        %p\n", thread->th.th_root);
3337         __kmp_print_structure_team("    Our Team:     ", thread->th.th_team);
3338         __kmp_print_structure_team("    Serial Team:  ",
3339                                    thread->th.th_serial_team);
3340         __kmp_printf("    Threads:      %2d\n", thread->th.th_team_nproc);
3341         __kmp_print_structure_thread("    Master:       ",
3342                                      thread->th.th_team_master);
3343         __kmp_printf("    Serialized?:  %2d\n", thread->th.th_team_serialized);
3344         __kmp_printf("    Set NProc:    %2d\n", thread->th.th_set_nproc);
3345 #if OMP_40_ENABLED
3346         __kmp_printf("    Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3347 #endif
3348         __kmp_print_structure_thread("    Next in pool: ",
3349                                      thread->th.th_next_pool);
3350         __kmp_printf("\n");
3351         __kmp_print_structure_team_accum(list, thread->th.th_team);
3352         __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3353       }
3354     }
3355   } else {
3356     __kmp_printf("Threads array is not allocated.\n");
3357   }
3358 
3359   // Print out __kmp_root array.
3360   __kmp_printf("\n------------------------------\nUbers\n----------------------"
3361                "--------\n");
3362   if (__kmp_root != NULL) {
3363     int gtid;
3364     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3365       kmp_root_t const *root = __kmp_root[gtid];
3366       if (root != NULL) {
3367         __kmp_printf("GTID %2d %p:\n", gtid, root);
3368         __kmp_print_structure_team("    Root Team:    ", root->r.r_root_team);
3369         __kmp_print_structure_team("    Hot Team:     ", root->r.r_hot_team);
3370         __kmp_print_structure_thread("    Uber Thread:  ",
3371                                      root->r.r_uber_thread);
3372         __kmp_printf("    Active?:      %2d\n", root->r.r_active);
3373         __kmp_printf("    Nested?:      %2d\n", root->r.r_nested);
3374         __kmp_printf("    In Parallel:  %2d\n", root->r.r_in_parallel);
3375         __kmp_printf("\n");
3376         __kmp_print_structure_team_accum(list, root->r.r_root_team);
3377         __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3378       }
3379     }
3380   } else {
3381     __kmp_printf("Ubers array is not allocated.\n");
3382   }
3383 
3384   __kmp_printf("\n------------------------------\nTeams\n----------------------"
3385                "--------\n");
3386   while (list->next != NULL) {
3387     kmp_team_p const *team = list->entry;
3388     int i;
3389     __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3390     __kmp_print_structure_team("    Parent Team:      ", team->t.t_parent);
3391     __kmp_printf("    Master TID:       %2d\n", team->t.t_master_tid);
3392     __kmp_printf("    Max threads:      %2d\n", team->t.t_max_nproc);
3393     __kmp_printf("    Levels of serial: %2d\n", team->t.t_serialized);
3394     __kmp_printf("    Number threads:   %2d\n", team->t.t_nproc);
3395     for (i = 0; i < team->t.t_nproc; ++i) {
3396       __kmp_printf("    Thread %2d:      ", i);
3397       __kmp_print_structure_thread("", team->t.t_threads[i]);
3398     }
3399     __kmp_print_structure_team("    Next in pool:     ", team->t.t_next_pool);
3400     __kmp_printf("\n");
3401     list = list->next;
3402   }
3403 
3404   // Print out __kmp_thread_pool and __kmp_team_pool.
3405   __kmp_printf("\n------------------------------\nPools\n----------------------"
3406                "--------\n");
3407   __kmp_print_structure_thread("Thread pool:          ",
3408                                CCAST(kmp_info_t *, __kmp_thread_pool));
3409   __kmp_print_structure_team("Team pool:            ",
3410                              CCAST(kmp_team_t *, __kmp_team_pool));
3411   __kmp_printf("\n");
3412 
3413   // Free team list.
3414   while (list != NULL) {
3415     kmp_team_list_item_t *item = list;
3416     list = list->next;
3417     KMP_INTERNAL_FREE(item);
3418   }
3419 }
3420 
3421 #endif
3422 
3423 //---------------------------------------------------------------------------
3424 //  Stuff for per-thread fast random number generator
3425 //  Table of primes
3426 static const unsigned __kmp_primes[] = {
3427     0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3428     0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3429     0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3430     0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3431     0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3432     0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3433     0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3434     0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3435     0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3436     0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3437     0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3438 
3439 //---------------------------------------------------------------------------
3440 //  __kmp_get_random: Get a random number using a linear congruential method.
3441 unsigned short __kmp_get_random(kmp_info_t *thread) {
3442   unsigned x = thread->th.th_x;
3443   unsigned short r = x >> 16;
3444 
3445   thread->th.th_x = x * thread->th.th_a + 1;
3446 
3447   KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3448                 thread->th.th_info.ds.ds_tid, r));
3449 
3450   return r;
3451 }
3452 //--------------------------------------------------------
3453 // __kmp_init_random: Initialize a random number generator
3454 void __kmp_init_random(kmp_info_t *thread) {
3455   unsigned seed = thread->th.th_info.ds.ds_tid;
3456 
3457   thread->th.th_a =
3458       __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3459   thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3460   KA_TRACE(30,
3461            ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3462 }
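/* The generator above is a 32-bit linear congruential recurrence
   x' = a * x + 1 (mod 2^32) with a per-thread prime multiplier 'a' drawn from
   __kmp_primes, returning the high 16 bits of the state (the better-behaved
   bits of a power-of-two-modulus LCG). A stand-alone sketch of one step
   (illustrative only, not compiled into the library): */
#if 0
static unsigned short demo_lcg_step(unsigned &x, unsigned a) {
  unsigned short r = (unsigned short)(x >> 16); // high half of the current state
  x = x * a + 1;                                // advance the recurrence
  return r;
}
#endif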
3463 
3464 #if KMP_OS_WINDOWS
3465 /* reclaim array entries for root threads that are already dead, returns number
3466  * reclaimed */
3467 static int __kmp_reclaim_dead_roots(void) {
3468   int i, r = 0;
3469 
3470   for (i = 0; i < __kmp_threads_capacity; ++i) {
3471     if (KMP_UBER_GTID(i) &&
3472         !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3473         !__kmp_root[i]
3474              ->r.r_active) { // AC: reclaim only roots died in non-active state
3475       r += __kmp_unregister_root_other_thread(i);
3476     }
3477   }
3478   return r;
3479 }
3480 #endif
3481 
3482 /* This function attempts to create free entries in __kmp_threads and
3483    __kmp_root, and returns the number of free entries generated.
3484 
3485    For Windows* OS static library, the first mechanism used is to reclaim array
3486    entries for root threads that are already dead.
3487 
3488    On all platforms, expansion is attempted on the arrays __kmp_threads and
3489    __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3490    capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3491    threadprivate cache array has been created. Synchronization with
3492    __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3493 
3494    After any dead root reclamation, if the clipping value allows array expansion
3495    to result in the generation of a total of nWish free slots, the function does
3496    that expansion. If not, but the clipping value allows array expansion to
3497    result in the generation of a total of nNeed free slots, the function does
3498    that expansion. Otherwise, nothing is done beyond the possible initial root
3499    thread reclamation. However, if nNeed is zero, a best-effort attempt is made
3500    to fulfil nWish as far as possible, i.e. the function will attempt to create
3501    as many free slots as possible up to nWish.
3502 
3503    If any argument is negative, the behavior is undefined. */
3504 static int __kmp_expand_threads(int nWish, int nNeed) {
3505   int added = 0;
3506   int old_tp_cached;
3507   int __kmp_actual_max_nth;
3508 
3509   if (nNeed > nWish) /* normalize the arguments */
3510     nWish = nNeed;
3511 #if KMP_OS_WINDOWS && !defined KMP_DYNAMIC_LIB
3512   /* only for Windows static library */
3513   /* reclaim array entries for root threads that are already dead */
3514   added = __kmp_reclaim_dead_roots();
3515 
3516   if (nNeed) {
3517     nNeed -= added;
3518     if (nNeed < 0)
3519       nNeed = 0;
3520   }
3521   if (nWish) {
3522     nWish -= added;
3523     if (nWish < 0)
3524       nWish = 0;
3525   }
3526 #endif
3527   if (nWish <= 0)
3528     return added;
3529 
3530   while (1) {
3531     int nTarget;
3532     int minimumRequiredCapacity;
3533     int newCapacity;
3534     kmp_info_t **newThreads;
3535     kmp_root_t **newRoot;
3536 
3537     // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3538     // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3539     // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3540     // > __kmp_max_nth in one of two ways:
3541     //
3542     // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
3543     //    may not be reused by another thread, so we may need to increase
3544     //    __kmp_threads_capacity to __kmp_max_nth + 1.
3545     //
3546     // 2) New foreign root(s) are encountered.  We always register new foreign
3547     //    roots. This may cause a smaller # of threads to be allocated at
3548     //    subsequent parallel regions, but the worker threads hang around (and
3549     //    eventually go to sleep) and need slots in the __kmp_threads[] array.
3550     //
3551     // Anyway, that is the reason for moving the check to see if
3552     // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3553     // instead of having it performed here. -BB
3554     old_tp_cached = __kmp_tp_cached;
3555     __kmp_actual_max_nth =
3556         old_tp_cached ? __kmp_tp_capacity : __kmp_sys_max_nth;
3557     KMP_DEBUG_ASSERT(__kmp_actual_max_nth >= __kmp_threads_capacity);
3558 
3559     /* compute expansion headroom to check if we can expand and whether to aim
3560        for nWish or nNeed */
3561     nTarget = nWish;
3562     if (__kmp_actual_max_nth - __kmp_threads_capacity < nTarget) {
3563       /* can't fulfil nWish, so try nNeed */
3564       if (nNeed) {
3565         nTarget = nNeed;
3566         if (__kmp_actual_max_nth - __kmp_threads_capacity < nTarget) {
3567           /* possible expansion too small -- give up */
3568           break;
3569         }
3570       } else {
3571         /* best-effort */
3572         nTarget = __kmp_actual_max_nth - __kmp_threads_capacity;
3573         if (!nTarget) {
3574           /* can't expand at all -- give up */
3575           break;
3576         }
3577       }
3578     }
3579     minimumRequiredCapacity = __kmp_threads_capacity + nTarget;
3580 
3581     newCapacity = __kmp_threads_capacity;
3582     do {
3583       newCapacity = newCapacity <= (__kmp_actual_max_nth >> 1)
3584                         ? (newCapacity << 1)
3585                         : __kmp_actual_max_nth;
3586     } while (newCapacity < minimumRequiredCapacity);
3587     newThreads = (kmp_info_t **)__kmp_allocate(
3588         (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity +
3589         CACHE_LINE);
3590     newRoot = (kmp_root_t **)((char *)newThreads +
3591                               sizeof(kmp_info_t *) * newCapacity);
3592     KMP_MEMCPY(newThreads, __kmp_threads,
3593                __kmp_threads_capacity * sizeof(kmp_info_t *));
3594     KMP_MEMCPY(newRoot, __kmp_root,
3595                __kmp_threads_capacity * sizeof(kmp_root_t *));
3596     memset(newThreads + __kmp_threads_capacity, 0,
3597            (newCapacity - __kmp_threads_capacity) * sizeof(kmp_info_t *));
3598     memset(newRoot + __kmp_threads_capacity, 0,
3599            (newCapacity - __kmp_threads_capacity) * sizeof(kmp_root_t *));
3600 
3601     if (!old_tp_cached && __kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3602       /* __kmp_tp_cached has changed, i.e. __kmpc_threadprivate_cached has
3603          allocated a threadprivate cache while we were allocating the expanded
3604          array, and our new capacity is larger than the threadprivate cache
3605          capacity, so we should deallocate the expanded arrays and try again.
3606          This is the first check of a double-check pair. */
3607       __kmp_free(newThreads);
3608       continue; /* start over and try again */
3609     }
3610     __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3611     if (!old_tp_cached && __kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3612       /* Same check as above, but this time with the lock held so we can be
3613          sure whether we will succeed. */
3614       __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3615       __kmp_free(newThreads);
3616       continue; /* start over and try again */
3617     } else {
3618       /* success */
3619       // __kmp_free( __kmp_threads ); // ATT: It leads to crash. Need to be
3620       // investigated.
3621       *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3622       *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3623       added += newCapacity - __kmp_threads_capacity;
3624       *(volatile int *)&__kmp_threads_capacity = newCapacity;
3625       __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3626       break; /* succeeded, so we can exit the loop */
3627     }
3628   }
3629   return added;
3630 }
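/* Capacity growth used above, restated as a stand-alone sketch (illustrative
   only, not compiled into the library): the capacity is doubled until it
   covers the required minimum, clipping at the effective maximum, which is
   __kmp_tp_capacity once a threadprivate cache exists and __kmp_sys_max_nth
   otherwise. */
#if 0
static int demo_grow_capacity(int current, int required_min, int actual_max) {
  // Precondition (checked by the caller's headroom test): required_min <= actual_max.
  int cap = current;
  do {
    // Double while it still fits under the cap; otherwise clip to the maximum.
    cap = (cap <= (actual_max >> 1)) ? (cap << 1) : actual_max;
  } while (cap < required_min);
  return cap;
}
#endif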
3631 
3632 /* Register the current thread as a root thread and obtain our gtid. We must
3633    have the __kmp_initz_lock held at this point. Argument TRUE only if are the
3634    thread that calls from __kmp_do_serial_initialize() */
3635 int __kmp_register_root(int initial_thread) {
3636   kmp_info_t *root_thread;
3637   kmp_root_t *root;
3638   int gtid;
3639   int capacity;
3640   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3641   KA_TRACE(20, ("__kmp_register_root: entered\n"));
3642   KMP_MB();
3643 
3644   /* 2007-03-02:
3645      If the initial thread has not invoked the OpenMP RTL yet, and this thread
3646      is not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity"
3647      condition does not work as expected -- it may return false (meaning there
3648      is at least one empty slot in the __kmp_threads array), but it is possible
3649      that the only free slot is #0, which is reserved for the initial thread and
3650      so cannot be used for this one. The following code works around this bug.
3651 
3652      However, the right solution seems to be not to reserve slot #0 for the
3653      initial thread, because:
3654      (1) there is no magic in slot #0,
3655      (2) we cannot detect the initial thread reliably (the first thread that
3656         performs serial initialization may not be a real initial thread).
3657   */
3658   capacity = __kmp_threads_capacity;
3659   if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3660     --capacity;
3661   }
3662 
3663   /* see if there are too many threads */
3664   if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1, 1)) {
3665     if (__kmp_tp_cached) {
3666       __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3667                   KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3668                   KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3669     } else {
3670       __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3671                   __kmp_msg_null);
3672     }
3673   }
3674 
3675   /* find an available thread slot */
3676   /* Don't reassign the zero slot since we need that to only be used by initial
3677      thread */
3678   for (gtid = (initial_thread ? 0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL;
3679        gtid++)
3680     ;
3681   KA_TRACE(1,
3682            ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3683   KMP_ASSERT(gtid < __kmp_threads_capacity);
3684 
3685   /* update global accounting */
3686   __kmp_all_nth++;
3687   TCW_4(__kmp_nth, __kmp_nth + 1);
3688 
3689   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3690   // numbers of procs, and method #2 (keyed API call) for higher numbers.
3691   if (__kmp_adjust_gtid_mode) {
3692     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3693       if (TCR_4(__kmp_gtid_mode) != 2) {
3694         TCW_4(__kmp_gtid_mode, 2);
3695       }
3696     } else {
3697       if (TCR_4(__kmp_gtid_mode) != 1) {
3698         TCW_4(__kmp_gtid_mode, 1);
3699       }
3700     }
3701   }
3702 
3703 #ifdef KMP_ADJUST_BLOCKTIME
3704   /* Adjust blocktime to zero if necessary            */
3705   /* Middle initialization might not have occurred yet */
3706   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3707     if (__kmp_nth > __kmp_avail_proc) {
3708       __kmp_zero_bt = TRUE;
3709     }
3710   }
3711 #endif /* KMP_ADJUST_BLOCKTIME */
3712 
3713   /* setup this new hierarchy */
3714   if (!(root = __kmp_root[gtid])) {
3715     root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3716     KMP_DEBUG_ASSERT(!root->r.r_root_team);
3717   }
3718 
3719 #if KMP_STATS_ENABLED
3720   // Initialize stats as soon as possible (right after gtid assignment).
3721   __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3722   KMP_START_EXPLICIT_TIMER(OMP_worker_thread_life);
3723   KMP_SET_THREAD_STATE(SERIAL_REGION);
3724   KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3725 #endif
3726   __kmp_initialize_root(root);
3727 
3728   /* setup new root thread structure */
3729   if (root->r.r_uber_thread) {
3730     root_thread = root->r.r_uber_thread;
3731   } else {
3732     root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3733     if (__kmp_storage_map) {
3734       __kmp_print_thread_storage_map(root_thread, gtid);
3735     }
3736     root_thread->th.th_info.ds.ds_gtid = gtid;
3737     root_thread->th.th_root = root;
3738     if (__kmp_env_consistency_check) {
3739       root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3740     }
3741 #if USE_FAST_MEMORY
3742     __kmp_initialize_fast_memory(root_thread);
3743 #endif /* USE_FAST_MEMORY */
3744 
3745 #if KMP_USE_BGET
3746     KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3747     __kmp_initialize_bget(root_thread);
3748 #endif
3749     __kmp_init_random(root_thread); // Initialize random number generator
3750   }
3751 
3752   /* setup the serial team held in reserve by the root thread */
3753   if (!root_thread->th.th_serial_team) {
3754     kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3755     KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3756     root_thread->th.th_serial_team =
3757         __kmp_allocate_team(root, 1, 1,
3758 #if OMPT_SUPPORT
3759                             0, // root parallel id
3760 #endif
3761 #if OMP_40_ENABLED
3762                             proc_bind_default,
3763 #endif
3764                             &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3765   }
3766   KMP_ASSERT(root_thread->th.th_serial_team);
3767   KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3768                 root_thread->th.th_serial_team));
3769 
3770   /* drop root_thread into place */
3771   TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3772 
3773   root->r.r_root_team->t.t_threads[0] = root_thread;
3774   root->r.r_hot_team->t.t_threads[0] = root_thread;
3775   root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3776   // AC: the team created in reserve, not for execution (it is unused for now).
3777   root_thread->th.th_serial_team->t.t_serialized = 0;
3778   root->r.r_uber_thread = root_thread;
3779 
3780   /* initialize the thread, get it ready to go */
3781   __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3782   TCW_4(__kmp_init_gtid, TRUE);
3783 
3784   /* prepare the master thread for get_gtid() */
3785   __kmp_gtid_set_specific(gtid);
3786 
3787 #if USE_ITT_BUILD
3788   __kmp_itt_thread_name(gtid);
3789 #endif /* USE_ITT_BUILD */
3790 
3791 #ifdef KMP_TDATA_GTID
3792   __kmp_gtid = gtid;
3793 #endif
3794   __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3795   KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3796 
3797   KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3798                 "plain=%u\n",
3799                 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3800                 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3801                 KMP_INIT_BARRIER_STATE));
3802   { // Initialize barrier data.
3803     int b;
3804     for (b = 0; b < bs_last_barrier; ++b) {
3805       root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3806 #if USE_DEBUGGER
3807       root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3808 #endif
3809     }
3810   }
3811   KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3812                    KMP_INIT_BARRIER_STATE);
3813 
3814 #if KMP_AFFINITY_SUPPORTED
3815 #if OMP_40_ENABLED
3816   root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3817   root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3818   root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3819   root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3820 #endif
3821 
3822   if (TCR_4(__kmp_init_middle)) {
3823     __kmp_affinity_set_init_mask(gtid, TRUE);
3824   }
3825 #endif /* KMP_AFFINITY_SUPPORTED */
3826 
3827   __kmp_root_counter++;
3828 
3829   KMP_MB();
3830   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3831 
3832   return gtid;
3833 }
3834 
3835 #if KMP_NESTED_HOT_TEAMS
3836 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3837                                 const int max_level) {
3838   int i, n, nth;
3839   kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3840   if (!hot_teams || !hot_teams[level].hot_team) {
3841     return 0;
3842   }
3843   KMP_DEBUG_ASSERT(level < max_level);
3844   kmp_team_t *team = hot_teams[level].hot_team;
3845   nth = hot_teams[level].hot_team_nth;
3846   n = nth - 1; // master is not freed
3847   if (level < max_level - 1) {
3848     for (i = 0; i < nth; ++i) {
3849       kmp_info_t *th = team->t.t_threads[i];
3850       n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3851       if (i > 0 && th->th.th_hot_teams) {
3852         __kmp_free(th->th.th_hot_teams);
3853         th->th.th_hot_teams = NULL;
3854       }
3855     }
3856   }
3857   __kmp_free_team(root, team, NULL);
3858   return n;
3859 }
3860 #endif
3861 
3862 // Resets a root thread and clears its root and hot teams.
3863 // Returns the number of __kmp_threads entries directly and indirectly freed.
3864 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3865   kmp_team_t *root_team = root->r.r_root_team;
3866   kmp_team_t *hot_team = root->r.r_hot_team;
3867   int n = hot_team->t.t_nproc;
3868   int i;
3869 
3870   KMP_DEBUG_ASSERT(!root->r.r_active);
3871 
3872   root->r.r_root_team = NULL;
3873   root->r.r_hot_team = NULL;
3874   // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3875   // before the call to __kmp_free_team().
3876   __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3877 #if KMP_NESTED_HOT_TEAMS
3878   if (__kmp_hot_teams_max_level >
3879       0) { // need to free nested hot teams and their threads if any
3880     for (i = 0; i < hot_team->t.t_nproc; ++i) {
3881       kmp_info_t *th = hot_team->t.t_threads[i];
3882       if (__kmp_hot_teams_max_level > 1) {
3883         n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3884       }
3885       if (th->th.th_hot_teams) {
3886         __kmp_free(th->th.th_hot_teams);
3887         th->th.th_hot_teams = NULL;
3888       }
3889     }
3890   }
3891 #endif
3892   __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3893 
3894   // Before we can reap the thread, we need to make certain that all other
3895   // threads in the teams that had this root as ancestor have stopped trying to
3896   // steal tasks.
3897   if (__kmp_tasking_mode != tskm_immediate_exec) {
3898     __kmp_wait_to_unref_task_teams();
3899   }
3900 
3901 #if KMP_OS_WINDOWS
3902   /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3903   KA_TRACE(
3904       10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3905            "\n",
3906            (LPVOID) & (root->r.r_uber_thread->th),
3907            root->r.r_uber_thread->th.th_info.ds.ds_thread));
3908   __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3909 #endif /* KMP_OS_WINDOWS */
3910 
3911 #if OMPT_SUPPORT
3912   if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_thread_end)) {
3913     int gtid = __kmp_get_gtid();
3914     __ompt_thread_end(ompt_thread_initial, gtid);
3915   }
3916 #endif
3917 
3918   TCW_4(__kmp_nth,
3919         __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3920   root->r.r_cg_nthreads--;
3921 
3922   __kmp_reap_thread(root->r.r_uber_thread, 1);
3923 
3924   // We cannot put the root thread into __kmp_thread_pool, so we have to reap it
3925   // instead of freeing it.
3926   root->r.r_uber_thread = NULL;
3927   /* mark root as no longer in use */
3928   root->r.r_begin = FALSE;
3929 
3930   return n;
3931 }
3932 
3933 void __kmp_unregister_root_current_thread(int gtid) {
3934   KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
3935   /* this lock should be ok, since unregister_root_current_thread is never
3936      called during an abort, only during a normal close. furthermore, if you
3937      have the forkjoin lock, you should never try to get the initz lock */
3938   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3939   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
3940     KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
3941                   "exiting T#%d\n",
3942                   gtid));
3943     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3944     return;
3945   }
3946   kmp_root_t *root = __kmp_root[gtid];
3947 
3948   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3949   KMP_ASSERT(KMP_UBER_GTID(gtid));
3950   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3951   KMP_ASSERT(root->r.r_active == FALSE);
3952 
3953   KMP_MB();
3954 
3955 #if OMP_45_ENABLED
3956   kmp_info_t *thread = __kmp_threads[gtid];
3957   kmp_team_t *team = thread->th.th_team;
3958   kmp_task_team_t *task_team = thread->th.th_task_team;
3959 
3960   // we need to wait for the proxy tasks before finishing the thread
3961   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
3962 #if OMPT_SUPPORT
3963     // the runtime is shutting down so we won't report any events
3964     thread->th.ompt_thread_info.state = ompt_state_undefined;
3965 #endif
3966     __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
3967   }
3968 #endif
3969 
3970   __kmp_reset_root(gtid, root);
3971 
3972   /* free up this thread slot */
3973   __kmp_gtid_set_specific(KMP_GTID_DNE);
3974 #ifdef KMP_TDATA_GTID
3975   __kmp_gtid = KMP_GTID_DNE;
3976 #endif
3977 
3978   KMP_MB();
3979   KC_TRACE(10,
3980            ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
3981 
3982   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3983 }
3984 
3985 #if KMP_OS_WINDOWS
3986 /* __kmp_forkjoin_lock must be already held
3987    Unregisters a root thread that is not the current thread.  Returns the number
3988    of __kmp_threads entries freed as a result. */
3989 static int __kmp_unregister_root_other_thread(int gtid) {
3990   kmp_root_t *root = __kmp_root[gtid];
3991   int r;
3992 
3993   KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
3994   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3995   KMP_ASSERT(KMP_UBER_GTID(gtid));
3996   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3997   KMP_ASSERT(root->r.r_active == FALSE);
3998 
3999   r = __kmp_reset_root(gtid, root);
4000   KC_TRACE(10,
4001            ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4002   return r;
4003 }
4004 #endif
4005 
4006 #if KMP_DEBUG
4007 void __kmp_task_info() {
4008 
4009   kmp_int32 gtid = __kmp_entry_gtid();
4010   kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4011   kmp_info_t *this_thr = __kmp_threads[gtid];
4012   kmp_team_t *steam = this_thr->th.th_serial_team;
4013   kmp_team_t *team = this_thr->th.th_team;
4014 
4015   __kmp_printf("__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p curtask=%p "
4016                "ptask=%p\n",
4017                gtid, tid, this_thr, team, this_thr->th.th_current_task,
4018                team->t.t_implicit_task_taskdata[tid].td_parent);
4019 }
4020 #endif // KMP_DEBUG
4021 
4022 /* TODO optimize with one big memclr, take out what isn't needed, split
4023    responsibility to workers as much as possible, and delay initialization of
4024    features as much as possible  */
4025 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4026                                   int tid, int gtid) {
4027   /* this_thr->th.th_info.ds.ds_gtid is setup in
4028      kmp_allocate_thread/create_worker.
4029      this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4030   kmp_info_t *master = team->t.t_threads[0];
4031   KMP_DEBUG_ASSERT(this_thr != NULL);
4032   KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4033   KMP_DEBUG_ASSERT(team);
4034   KMP_DEBUG_ASSERT(team->t.t_threads);
4035   KMP_DEBUG_ASSERT(team->t.t_dispatch);
4036   KMP_DEBUG_ASSERT(master);
4037   KMP_DEBUG_ASSERT(master->th.th_root);
4038 
4039   KMP_MB();
4040 
4041   TCW_SYNC_PTR(this_thr->th.th_team, team);
4042 
4043   this_thr->th.th_info.ds.ds_tid = tid;
4044   this_thr->th.th_set_nproc = 0;
4045   if (__kmp_tasking_mode != tskm_immediate_exec)
4046     // When tasking is possible, threads are not safe to reap until they are
4047     // done tasking; this will be set when tasking code is exited in wait
4048     this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4049   else // no tasking --> always safe to reap
4050     this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4051 #if OMP_40_ENABLED
4052   this_thr->th.th_set_proc_bind = proc_bind_default;
4053 #if KMP_AFFINITY_SUPPORTED
4054   this_thr->th.th_new_place = this_thr->th.th_current_place;
4055 #endif
4056 #endif
4057   this_thr->th.th_root = master->th.th_root;
4058 
4059   /* setup the thread's cache of the team structure */
4060   this_thr->th.th_team_nproc = team->t.t_nproc;
4061   this_thr->th.th_team_master = master;
4062   this_thr->th.th_team_serialized = team->t.t_serialized;
4063   TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4064 
4065   KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4066 
4067   KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4068                 tid, gtid, this_thr, this_thr->th.th_current_task));
4069 
4070   __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4071                            team, tid, TRUE);
4072 
4073   KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4074                 tid, gtid, this_thr, this_thr->th.th_current_task));
4075   // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4076   // __kmp_initialize_team()?
4077 
4078   /* TODO no worksharing in speculative threads */
4079   this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4080 
4081   this_thr->th.th_local.this_construct = 0;
4082 
4083   if (!this_thr->th.th_pri_common) {
4084     this_thr->th.th_pri_common =
4085         (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4086     if (__kmp_storage_map) {
4087       __kmp_print_storage_map_gtid(
4088           gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4089           sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4090     }
4091     this_thr->th.th_pri_head = NULL;
4092   }
4093 
4094   /* Initialize dynamic dispatch */
4095   {
4096     volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4097     // Use team max_nproc since this will never change for the team.
4098     size_t disp_size =
4099         sizeof(dispatch_private_info_t) *
4100         (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4101     KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4102                   team->t.t_max_nproc));
4103     KMP_ASSERT(dispatch);
4104     KMP_DEBUG_ASSERT(team->t.t_dispatch);
4105     KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4106 
4107     dispatch->th_disp_index = 0;
4108 #if OMP_45_ENABLED
4109     dispatch->th_doacross_buf_idx = 0;
4110 #endif
4111     if (!dispatch->th_disp_buffer) {
4112       dispatch->th_disp_buffer =
4113           (dispatch_private_info_t *)__kmp_allocate(disp_size);
4114 
4115       if (__kmp_storage_map) {
4116         __kmp_print_storage_map_gtid(
4117             gtid, &dispatch->th_disp_buffer[0],
4118             &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4119                                           ? 1
4120                                           : __kmp_dispatch_num_buffers],
4121             disp_size, "th_%d.th_dispatch.th_disp_buffer "
4122                        "(team_%d.t_dispatch[%d].th_disp_buffer)",
4123             gtid, team->t.t_id, gtid);
4124       }
4125     } else {
4126       memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4127     }
4128 
4129     dispatch->th_dispatch_pr_current = 0;
4130     dispatch->th_dispatch_sh_current = 0;
4131 
4132     dispatch->th_deo_fcn = 0; /* ORDERED     */
4133     dispatch->th_dxo_fcn = 0; /* END ORDERED */
4134   }
4135 
4136   this_thr->th.th_next_pool = NULL;
4137 
4138   if (!this_thr->th.th_task_state_memo_stack) {
4139     size_t i;
4140     this_thr->th.th_task_state_memo_stack =
4141         (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4142     this_thr->th.th_task_state_top = 0;
4143     this_thr->th.th_task_state_stack_sz = 4;
4144     for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4145          ++i) // zero init the stack
4146       this_thr->th.th_task_state_memo_stack[i] = 0;
4147   }
4148 
4149   KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4150   KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4151 
4152   KMP_MB();
4153 }
4154 
4155 /* Allocate a new thread for the requesting team. This is only called from
4156    within a forkjoin critical section. We will first try to get an available
4157    thread from the thread pool. If none is available, we will fork a new one,
4158    assuming we are able to create one. This should be assured, as the caller
4159    should have checked on this first. */
4160 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4161                                   int new_tid) {
4162   kmp_team_t *serial_team;
4163   kmp_info_t *new_thr;
4164   int new_gtid;
4165 
4166   KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4167   KMP_DEBUG_ASSERT(root && team);
4168 #if !KMP_NESTED_HOT_TEAMS
4169   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4170 #endif
4171   KMP_MB();
4172 
4173   /* first, try to get one from the thread pool */
4174   if (__kmp_thread_pool) {
4175 
4176     new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4177     __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4178     if (new_thr == __kmp_thread_pool_insert_pt) {
4179       __kmp_thread_pool_insert_pt = NULL;
4180     }
4181     TCW_4(new_thr->th.th_in_pool, FALSE);
4182     // Don't touch th_active_in_pool or th_active.
4183     // The worker thread adjusts those flags as it sleeps/awakens.
4184     __kmp_thread_pool_nth--;
4185 
4186     KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4187                   __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4188     KMP_ASSERT(!new_thr->th.th_team);
4189     KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4190     KMP_DEBUG_ASSERT(__kmp_thread_pool_nth >= 0);
4191 
4192     /* setup the thread structure */
4193     __kmp_initialize_info(new_thr, team, new_tid,
4194                           new_thr->th.th_info.ds.ds_gtid);
4195     KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4196 
4197     TCW_4(__kmp_nth, __kmp_nth + 1);
4198     root->r.r_cg_nthreads++;
4199 
4200     new_thr->th.th_task_state = 0;
4201     new_thr->th.th_task_state_top = 0;
4202     new_thr->th.th_task_state_stack_sz = 4;
4203 
4204 #ifdef KMP_ADJUST_BLOCKTIME
4205     /* Adjust blocktime back to zero if necessary */
4206     /* Middle initialization might not have occurred yet */
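    /* Descriptive note: once the runtime oversubscribes the machine
       (__kmp_nth > __kmp_avail_proc), __kmp_zero_bt forces blocktime to zero so
       idle workers give up the processor right away instead of spin-waiting. */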
4207     if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4208       if (__kmp_nth > __kmp_avail_proc) {
4209         __kmp_zero_bt = TRUE;
4210       }
4211     }
4212 #endif /* KMP_ADJUST_BLOCKTIME */
4213 
4214 #if KMP_DEBUG
4215     // If the thread entered the pool via __kmp_free_thread, wait_flag should
4216     // not be KMP_BARRIER_PARENT_FLAG.
4217     int b;
4218     kmp_balign_t *balign = new_thr->th.th_bar;
4219     for (b = 0; b < bs_last_barrier; ++b)
4220       KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4221 #endif
4222 
4223     KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4224                   __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4225 
4226     KMP_MB();
4227     return new_thr;
4228   }
4229 
4230   /* no, we'll fork a new one */
4231   KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4232   KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4233 
4234 #if KMP_USE_MONITOR
4235   // If this is the first worker thread the RTL is creating, then also
4236   // launch the monitor thread.  We try to do this as early as possible.
4237   if (!TCR_4(__kmp_init_monitor)) {
4238     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4239     if (!TCR_4(__kmp_init_monitor)) {
4240       KF_TRACE(10, ("before __kmp_create_monitor\n"));
4241       TCW_4(__kmp_init_monitor, 1);
4242       __kmp_create_monitor(&__kmp_monitor);
4243       KF_TRACE(10, ("after __kmp_create_monitor\n"));
4244 #if KMP_OS_WINDOWS
4245       // AC: wait until monitor has started. This is a fix for CQ232808.
4246       // The reason is that if the library is loaded/unloaded in a loop with
4247       // small (parallel) work in between, then there is a high probability that
4248       // the monitor thread starts after the library shutdown. At shutdown it is
4249       // too late to cope with the problem, because when the master is in
4250       // DllMain (process detach) the monitor has no chance to start (it is
4251       // blocked), and the master has no means to inform the monitor that the
4252       // library has gone, because all the memory which the monitor can access
4253       // is going to be released/reset.
4254       while (TCR_4(__kmp_init_monitor) < 2) {
4255         KMP_YIELD(TRUE);
4256       }
4257       KF_TRACE(10, ("after monitor thread has started\n"));
4258 #endif
4259     }
4260     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4261   }
4262 #endif
4263 
4264   KMP_MB();
4265   for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) {
4266     KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4267   }
4268 
4269   /* allocate space for it. */
4270   new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4271 
4272   TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4273 
4274   if (__kmp_storage_map) {
4275     __kmp_print_thread_storage_map(new_thr, new_gtid);
4276   }
4277 
4278   // add the reserve serialized team, initialized from the team's master thread
4279   {
4280     kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4281     KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4282     new_thr->th.th_serial_team = serial_team =
4283         (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4284 #if OMPT_SUPPORT
4285                                           0, // root parallel id
4286 #endif
4287 #if OMP_40_ENABLED
4288                                           proc_bind_default,
4289 #endif
4290                                           &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
4291   }
4292   KMP_ASSERT(serial_team);
4293   serial_team->t.t_serialized = 0; // AC: the team is created in reserve, not
4294   // for execution (it is unused for now).
4295   serial_team->t.t_threads[0] = new_thr;
4296   KF_TRACE(10,
4297            ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4298             new_thr));
4299 
4300   /* setup the thread structures */
4301   __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4302 
4303 #if USE_FAST_MEMORY
4304   __kmp_initialize_fast_memory(new_thr);
4305 #endif /* USE_FAST_MEMORY */
4306 
4307 #if KMP_USE_BGET
4308   KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4309   __kmp_initialize_bget(new_thr);
4310 #endif
4311 
4312   __kmp_init_random(new_thr); // Initialize random number generator
4313 
4314   /* Initialize these only once when thread is grabbed for a team allocation */
4315   KA_TRACE(20,
4316            ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4317             __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4318 
4319   int b;
4320   kmp_balign_t *balign = new_thr->th.th_bar;
4321   for (b = 0; b < bs_last_barrier; ++b) {
4322     balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4323     balign[b].bb.team = NULL;
4324     balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4325     balign[b].bb.use_oncore_barrier = 0;
4326   }
4327 
4328   new_thr->th.th_spin_here = FALSE;
4329   new_thr->th.th_next_waiting = 0;
4330 
4331 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4332   new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4333   new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4334   new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4335   new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4336 #endif
4337 
4338   TCW_4(new_thr->th.th_in_pool, FALSE);
4339   new_thr->th.th_active_in_pool = FALSE;
4340   TCW_4(new_thr->th.th_active, TRUE);
4341 
4342   /* adjust the global counters */
4343   __kmp_all_nth++;
4344   __kmp_nth++;
4345 
4346   root->r.r_cg_nthreads++;
4347 
4348   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4349   // numbers of procs, and method #2 (keyed API call) for higher numbers.
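  // Descriptive note: method #1 locates the gtid by searching the registered
  // thread stacks, so its cost grows with the thread count; method #2 uses the
  // keyed TLS API, whose fixed per-lookup cost wins once __kmp_all_nth reaches
  // __kmp_tls_gtid_min (the check below).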
4350   if (__kmp_adjust_gtid_mode) {
4351     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4352       if (TCR_4(__kmp_gtid_mode) != 2) {
4353         TCW_4(__kmp_gtid_mode, 2);
4354       }
4355     } else {
4356       if (TCR_4(__kmp_gtid_mode) != 1) {
4357         TCW_4(__kmp_gtid_mode, 1);
4358       }
4359     }
4360   }
4361 
4362 #ifdef KMP_ADJUST_BLOCKTIME
4363   /* Adjust blocktime back to zero if necessary       */
4364   /* Middle initialization might not have occurred yet */
4365   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4366     if (__kmp_nth > __kmp_avail_proc) {
4367       __kmp_zero_bt = TRUE;
4368     }
4369   }
4370 #endif /* KMP_ADJUST_BLOCKTIME */
4371 
4372   /* actually fork it and create the new worker thread */
4373   KF_TRACE(
4374       10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4375   __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4376   KF_TRACE(10,
4377            ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4378 
4379   KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4380                 new_gtid));
4381   KMP_MB();
4382   return new_thr;
4383 }
4384 
4385 /* Reinitialize team for reuse.
4386    The hot team code calls this routine at every fork barrier, so EPCC barrier
4387    tests are extremely sensitive to changes in it, especially writes to the team
4388    struct, which cause a cache invalidation in all threads.
4389    IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4390 static void __kmp_reinitialize_team(kmp_team_t *team,
4391                                     kmp_internal_control_t *new_icvs,
4392                                     ident_t *loc) {
4393   KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4394                 team->t.t_threads[0], team));
4395   KMP_DEBUG_ASSERT(team && new_icvs);
4396   KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4397   KMP_CHECK_UPDATE(team->t.t_ident, loc);
4398 
4399   KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4400   // Copy ICVs to the master thread's implicit taskdata
4401   __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4402   copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4403 
4404   KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4405                 team->t.t_threads[0], team));
4406 }
4407 
4408 /* Initialize the team data structure.
4409    This assumes the t_threads and t_max_nproc are already set.
4410    Also, we don't touch the arguments */
4411 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4412                                   kmp_internal_control_t *new_icvs,
4413                                   ident_t *loc) {
4414   KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4415 
4416   /* verify */
4417   KMP_DEBUG_ASSERT(team);
4418   KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4419   KMP_DEBUG_ASSERT(team->t.t_threads);
4420   KMP_MB();
4421 
4422   team->t.t_master_tid = 0; /* not needed */
4423   /* team->t.t_master_bar;        not needed */
4424   team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4425   team->t.t_nproc = new_nproc;
4426 
4427   /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
4428   team->t.t_next_pool = NULL;
4429   /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4430    * up hot team */
4431 
4432   TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4433   team->t.t_invoke = NULL; /* not needed */
4434 
4435   // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
4436   team->t.t_sched = new_icvs->sched;
4437 
4438 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4439   team->t.t_fp_control_saved = FALSE; /* not needed */
4440   team->t.t_x87_fpu_control_word = 0; /* not needed */
4441   team->t.t_mxcsr = 0; /* not needed */
4442 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4443 
4444   team->t.t_construct = 0;
4445 
4446   team->t.t_ordered.dt.t_value = 0;
4447   team->t.t_master_active = FALSE;
4448 
4449   memset(&team->t.t_taskq, '\0', sizeof(kmp_taskq_t));
4450 
4451 #ifdef KMP_DEBUG
4452   team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4453 #endif
4454   team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4455 
4456   team->t.t_control_stack_top = NULL;
4457 
4458   __kmp_reinitialize_team(team, new_icvs, loc);
4459 
4460   KMP_MB();
4461   KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4462 }
4463 
4464 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4465 /* Sets full mask for thread and returns old mask, no changes to structures. */
4466 static void
4467 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4468   if (KMP_AFFINITY_CAPABLE()) {
4469     int status;
4470     if (old_mask != NULL) {
4471       status = __kmp_get_system_affinity(old_mask, TRUE);
4472       int error = errno;
4473       if (status != 0) {
4474         __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4475                     __kmp_msg_null);
4476       }
4477     }
4478     __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4479   }
4480 }
4481 #endif
4482 
4483 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4484 
4485 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4486 // It calculates the worker + master thread's partition based upon the parent
4487 // thread's partition, and binds each worker to a place in its partition.
4488 // The master thread's partition should already include its current binding.
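//
// Illustrative sketch (hypothetical numbers, not derived from the code below):
// with a master partition of places [2,5], the master bound to place 2, and a
// 4-thread team,
//   - proc_bind_master keeps every worker on the master's own place;
//   - proc_bind_close  assigns workers the places after the master's, so the
//     team occupies places 2,3,4,5;
//   - proc_bind_spread splits [2,5] into per-thread sub-partitions, here one
//     place per thread.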
4489 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4490   // Copy the master thread's place partition to the team struct
4491   kmp_info_t *master_th = team->t.t_threads[0];
4492   KMP_DEBUG_ASSERT(master_th != NULL);
4493   kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4494   int first_place = master_th->th.th_first_place;
4495   int last_place = master_th->th.th_last_place;
4496   int masters_place = master_th->th.th_current_place;
4497   team->t.t_first_place = first_place;
4498   team->t.t_last_place = last_place;
4499 
4500   KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4501                 "bound to place %d partition = [%d,%d]\n",
4502                 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4503                 team->t.t_id, masters_place, first_place, last_place));
4504 
4505   switch (proc_bind) {
4506 
4507   case proc_bind_default:
4508     // serial teams might have the proc_bind policy set to proc_bind_default. It
4509     // doesn't matter, as we don't rebind master thread for any proc_bind policy
4510     KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4511     break;
4512 
4513   case proc_bind_master: {
4514     int f;
4515     int n_th = team->t.t_nproc;
4516     for (f = 1; f < n_th; f++) {
4517       kmp_info_t *th = team->t.t_threads[f];
4518       KMP_DEBUG_ASSERT(th != NULL);
4519       th->th.th_first_place = first_place;
4520       th->th.th_last_place = last_place;
4521       th->th.th_new_place = masters_place;
4522 
4523       KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
4524                      "partition = [%d,%d]\n",
4525                      __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4526                      f, masters_place, first_place, last_place));
4527     }
4528   } break;
4529 
4530   case proc_bind_close: {
4531     int f;
4532     int n_th = team->t.t_nproc;
4533     int n_places;
4534     if (first_place <= last_place) {
4535       n_places = last_place - first_place + 1;
4536     } else {
4537       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4538     }
4539     if (n_th <= n_places) {
4540       int place = masters_place;
4541       for (f = 1; f < n_th; f++) {
4542         kmp_info_t *th = team->t.t_threads[f];
4543         KMP_DEBUG_ASSERT(th != NULL);
4544 
4545         if (place == last_place) {
4546           place = first_place;
4547         } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4548           place = 0;
4549         } else {
4550           place++;
4551         }
4552         th->th.th_first_place = first_place;
4553         th->th.th_last_place = last_place;
4554         th->th.th_new_place = place;
4555 
4556         KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4557                        "partition = [%d,%d]\n",
4558                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4559                        team->t.t_id, f, place, first_place, last_place));
4560       }
4561     } else {
4562       int S, rem, gap, s_count;
4563       S = n_th / n_places;
4564       s_count = 0;
4565       rem = n_th - (S * n_places);
4566       gap = rem > 0 ? n_places / rem : n_places;
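      // Worked example (illustrative): distributing n_th = 10 threads over
      // n_places = 4 places gives S = 2, rem = 2, gap = 2, so consecutive places
      // starting at the master's place receive 3, 2, 3, 2 threads and the walk
      // ends back at the master's place (see the assertion after the loop).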
4567       int place = masters_place;
4568       int gap_ct = gap;
4569       for (f = 0; f < n_th; f++) {
4570         kmp_info_t *th = team->t.t_threads[f];
4571         KMP_DEBUG_ASSERT(th != NULL);
4572 
4573         th->th.th_first_place = first_place;
4574         th->th.th_last_place = last_place;
4575         th->th.th_new_place = place;
4576         s_count++;
4577 
4578         if ((s_count == S) && rem && (gap_ct == gap)) {
4579           // do nothing, add an extra thread to place on next iteration
4580         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4581           // we added an extra thread to this place; move to next place
4582           if (place == last_place) {
4583             place = first_place;
4584           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4585             place = 0;
4586           } else {
4587             place++;
4588           }
4589           s_count = 0;
4590           gap_ct = 1;
4591           rem--;
4592         } else if (s_count == S) { // place full; don't add extra
4593           if (place == last_place) {
4594             place = first_place;
4595           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4596             place = 0;
4597           } else {
4598             place++;
4599           }
4600           gap_ct++;
4601           s_count = 0;
4602         }
4603 
4604         KA_TRACE(100,
4605                  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4606                   "partition = [%d,%d]\n",
4607                   __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4608                   th->th.th_new_place, first_place, last_place));
4609       }
4610       KMP_DEBUG_ASSERT(place == masters_place);
4611     }
4612   } break;
4613 
4614   case proc_bind_spread: {
4615     int f;
4616     int n_th = team->t.t_nproc;
4617     int n_places;
4618     int thidx;
4619     if (first_place <= last_place) {
4620       n_places = last_place - first_place + 1;
4621     } else {
4622       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4623     }
4624     if (n_th <= n_places) {
4625       int place = -1;
4626 
4627       if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4628         int S = n_places / n_th;
4629         int s_count, rem, gap, gap_ct;
4630 
4631         place = masters_place;
4632         rem = n_places - n_th * S;
4633         gap = rem ? n_th / rem : 1;
4634         gap_ct = gap;
4635         thidx = n_th;
4636         if (update_master_only == 1)
4637           thidx = 1;
4638         for (f = 0; f < thidx; f++) {
4639           kmp_info_t *th = team->t.t_threads[f];
4640           KMP_DEBUG_ASSERT(th != NULL);
4641 
4642           th->th.th_first_place = place;
4643           th->th.th_new_place = place;
4644           s_count = 1;
4645           while (s_count < S) {
4646             if (place == last_place) {
4647               place = first_place;
4648             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4649               place = 0;
4650             } else {
4651               place++;
4652             }
4653             s_count++;
4654           }
4655           if (rem && (gap_ct == gap)) {
4656             if (place == last_place) {
4657               place = first_place;
4658             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4659               place = 0;
4660             } else {
4661               place++;
4662             }
4663             rem--;
4664             gap_ct = 0;
4665           }
4666           th->th.th_last_place = place;
4667           gap_ct++;
4668 
4669           if (place == last_place) {
4670             place = first_place;
4671           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4672             place = 0;
4673           } else {
4674             place++;
4675           }
4676 
4677           KA_TRACE(100,
4678                    ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4679                     "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4680                     __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4681                     f, th->th.th_new_place, th->th.th_first_place,
4682                     th->th.th_last_place, __kmp_affinity_num_masks));
4683         }
4684       } else {
4685         /* Having a uniform space of available computation places, we can
4686            create T partitions of round(P/T) size and put threads into the
4687            first place of each partition. */
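        /* Worked example (illustrative): n_places = 8, n_th = 4, master at place
           0 gives spacing = 9/4 = 2.25 and partitions [0,1], [2,3], [4,5], [6,7],
           with each thread bound to the first place of its partition. */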
4688         double current = static_cast<double>(masters_place);
4689         double spacing =
4690             (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
4691         int first, last;
4692         kmp_info_t *th;
4693 
4694         thidx = n_th + 1;
4695         if (update_master_only == 1)
4696           thidx = 1;
4697         for (f = 0; f < thidx; f++) {
4698           first = static_cast<int>(current);
4699           last = static_cast<int>(current + spacing) - 1;
4700           KMP_DEBUG_ASSERT(last >= first);
4701           if (first >= n_places) {
4702             if (masters_place) {
4703               first -= n_places;
4704               last -= n_places;
4705               if (first == (masters_place + 1)) {
4706                 KMP_DEBUG_ASSERT(f == n_th);
4707                 first--;
4708               }
4709               if (last == masters_place) {
4710                 KMP_DEBUG_ASSERT(f == (n_th - 1));
4711                 last--;
4712               }
4713             } else {
4714               KMP_DEBUG_ASSERT(f == n_th);
4715               first = 0;
4716               last = 0;
4717             }
4718           }
4719           if (last >= n_places) {
4720             last = (n_places - 1);
4721           }
4722           place = first;
4723           current += spacing;
4724           if (f < n_th) {
4725             KMP_DEBUG_ASSERT(0 <= first);
4726             KMP_DEBUG_ASSERT(n_places > first);
4727             KMP_DEBUG_ASSERT(0 <= last);
4728             KMP_DEBUG_ASSERT(n_places > last);
4729             KMP_DEBUG_ASSERT(last_place >= first_place);
4730             th = team->t.t_threads[f];
4731             KMP_DEBUG_ASSERT(th);
4732             th->th.th_first_place = first;
4733             th->th.th_new_place = place;
4734             th->th.th_last_place = last;
4735 
4736             KA_TRACE(100,
4737                      ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4738                       "partition = [%d,%d], spacing = %.4f\n",
4739                       __kmp_gtid_from_thread(team->t.t_threads[f]),
4740                       team->t.t_id, f, th->th.th_new_place,
4741                       th->th.th_first_place, th->th.th_last_place, spacing));
4742           }
4743         }
4744       }
4745       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4746     } else {
4747       int S, rem, gap, s_count;
4748       S = n_th / n_places;
4749       s_count = 0;
4750       rem = n_th - (S * n_places);
4751       gap = rem > 0 ? n_places / rem : n_places;
4752       int place = masters_place;
4753       int gap_ct = gap;
4754       thidx = n_th;
4755       if (update_master_only == 1)
4756         thidx = 1;
4757       for (f = 0; f < thidx; f++) {
4758         kmp_info_t *th = team->t.t_threads[f];
4759         KMP_DEBUG_ASSERT(th != NULL);
4760 
4761         th->th.th_first_place = place;
4762         th->th.th_last_place = place;
4763         th->th.th_new_place = place;
4764         s_count++;
4765 
4766         if ((s_count == S) && rem && (gap_ct == gap)) {
4767           // do nothing, add an extra thread to place on next iteration
4768         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4769           // we added an extra thread to this place; move on to next place
4770           if (place == last_place) {
4771             place = first_place;
4772           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4773             place = 0;
4774           } else {
4775             place++;
4776           }
4777           s_count = 0;
4778           gap_ct = 1;
4779           rem--;
4780         } else if (s_count == S) { // place is full; don't add extra thread
4781           if (place == last_place) {
4782             place = first_place;
4783           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4784             place = 0;
4785           } else {
4786             place++;
4787           }
4788           gap_ct++;
4789           s_count = 0;
4790         }
4791 
4792         KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4793                        "partition = [%d,%d]\n",
4794                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4795                        team->t.t_id, f, th->th.th_new_place,
4796                        th->th.th_first_place, th->th.th_last_place));
4797       }
4798       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4799     }
4800   } break;
4801 
4802   default:
4803     break;
4804   }
4805 
4806   KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4807 }
4808 
4809 #endif /* OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED */
4810 
4811 /* allocate a new team data structure to use.  take one off of the free pool if
4812    available */
4813 kmp_team_t *
4814 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4815 #if OMPT_SUPPORT
4816                     ompt_parallel_id_t ompt_parallel_id,
4817 #endif
4818 #if OMP_40_ENABLED
4819                     kmp_proc_bind_t new_proc_bind,
4820 #endif
4821                     kmp_internal_control_t *new_icvs,
4822                     int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4823   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4824   int f;
4825   kmp_team_t *team;
4826   int use_hot_team = !root->r.r_active;
4827   int level = 0;
4828 
4829   KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4830   KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4831   KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4832   KMP_MB();
4833 
4834 #if KMP_NESTED_HOT_TEAMS
4835   kmp_hot_team_ptr_t *hot_teams;
4836   if (master) {
4837     team = master->th.th_team;
4838     level = team->t.t_active_level;
4839     if (master->th.th_teams_microtask) { // in teams construct?
4840       if (master->th.th_teams_size.nteams > 1 &&
4841           ( // #teams > 1
4842               team->t.t_pkfn ==
4843                   (microtask_t)__kmp_teams_master || // inner fork of the teams
4844               master->th.th_teams_level <
4845                   team->t.t_level)) { // or nested parallel inside the teams
4846         ++level; // do not increment if #teams==1, or for the outer fork of
4847         // the teams; increment otherwise
4848       }
4849     }
4850     hot_teams = master->th.th_hot_teams;
4851     if (level < __kmp_hot_teams_max_level && hot_teams &&
4852         hot_teams[level]
4853             .hot_team) { // hot team has already been allocated for given level
4854       use_hot_team = 1;
4855     } else {
4856       use_hot_team = 0;
4857     }
4858   }
4859 #endif
4860   // Optimization to use a "hot" team
4861   if (use_hot_team && new_nproc > 1) {
4862     KMP_DEBUG_ASSERT(new_nproc == max_nproc);
4863 #if KMP_NESTED_HOT_TEAMS
4864     team = hot_teams[level].hot_team;
4865 #else
4866     team = root->r.r_hot_team;
4867 #endif
4868 #if KMP_DEBUG
4869     if (__kmp_tasking_mode != tskm_immediate_exec) {
4870       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
4871                     "task_team[1] = %p before reinit\n",
4872                     team->t.t_task_team[0], team->t.t_task_team[1]));
4873     }
4874 #endif
4875 
4876     // Has the number of threads changed?
4877     /* Let's assume the most common case is that the number of threads is
4878        unchanged, and put that case first. */
4879     if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
4880       KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
4881       // This case can mean that omp_set_num_threads() was called and the hot
4882       // team size was already reduced, so we check the special flag
4883       if (team->t.t_size_changed == -1) {
4884         team->t.t_size_changed = 1;
4885       } else {
4886         KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
4887       }
4888 
4889       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4890       kmp_r_sched_t new_sched = new_icvs->sched;
4891       if (team->t.t_sched.r_sched_type != new_sched.r_sched_type ||
4892           team->t.t_sched.chunk != new_sched.chunk)
4893         team->t.t_sched =
4894             new_sched; // set master's schedule as new run-time schedule
4895 
4896       __kmp_reinitialize_team(team, new_icvs,
4897                               root->r.r_uber_thread->th.th_ident);
4898 
4899       KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
4900                     team->t.t_threads[0], team));
4901       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
4902 
4903 #if OMP_40_ENABLED
4904 #if KMP_AFFINITY_SUPPORTED
4905       if ((team->t.t_size_changed == 0) &&
4906           (team->t.t_proc_bind == new_proc_bind)) {
4907         if (new_proc_bind == proc_bind_spread) {
4908           __kmp_partition_places(
4909               team, 1); // add flag to update only master for spread
4910         }
4911         KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
4912                        "proc_bind = %d, partition = [%d,%d]\n",
4913                        team->t.t_id, new_proc_bind, team->t.t_first_place,
4914                        team->t.t_last_place));
4915       } else {
4916         KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4917         __kmp_partition_places(team);
4918       }
4919 #else
4920       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4921 #endif /* KMP_AFFINITY_SUPPORTED */
4922 #endif /* OMP_40_ENABLED */
4923     } else if (team->t.t_nproc > new_nproc) {
4924       KA_TRACE(20,
4925                ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
4926                 new_nproc));
4927 
4928       team->t.t_size_changed = 1;
4929 #if KMP_NESTED_HOT_TEAMS
4930       if (__kmp_hot_teams_mode == 0) {
4931         // AC: the saved number of threads should correspond to the team's value
4932         // in this mode; it can be bigger in mode 1, when the hot team has threads in reserve
4933         KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
4934         hot_teams[level].hot_team_nth = new_nproc;
4935 #endif // KMP_NESTED_HOT_TEAMS
4936         /* release the extra threads we don't need any more */
4937         for (f = new_nproc; f < team->t.t_nproc; f++) {
4938           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
4939           if (__kmp_tasking_mode != tskm_immediate_exec) {
4940             // When decreasing team size, threads no longer in the team should
4941             // unref task team.
4942             team->t.t_threads[f]->th.th_task_team = NULL;
4943           }
4944           __kmp_free_thread(team->t.t_threads[f]);
4945           team->t.t_threads[f] = NULL;
4946         }
4947 #if KMP_NESTED_HOT_TEAMS
4948       } // (__kmp_hot_teams_mode == 0)
4949       else {
4950         // When keeping extra threads in team, switch threads to wait on own
4951         // b_go flag
4952         for (f = new_nproc; f < team->t.t_nproc; ++f) {
4953           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
4954           kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
4955           for (int b = 0; b < bs_last_barrier; ++b) {
4956             if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
4957               balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
4958             }
4959             KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
4960           }
4961         }
4962       }
4963 #endif // KMP_NESTED_HOT_TEAMS
4964       team->t.t_nproc = new_nproc;
4965       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4966       if (team->t.t_sched.r_sched_type != new_icvs->sched.r_sched_type ||
4967           team->t.t_sched.chunk != new_icvs->sched.chunk)
4968         team->t.t_sched = new_icvs->sched;
4969       __kmp_reinitialize_team(team, new_icvs,
4970                               root->r.r_uber_thread->th.th_ident);
4971 
4972       /* update the remaining threads */
4973       for (f = 0; f < new_nproc; ++f) {
4974         team->t.t_threads[f]->th.th_team_nproc = new_nproc;
4975       }
4976       // restore the current task state of the master thread: should be the
4977       // implicit task
4978       KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
4979                     team->t.t_threads[0], team));
4980 
4981       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
4982 
4983 #ifdef KMP_DEBUG
4984       for (f = 0; f < team->t.t_nproc; f++) {
4985         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
4986                          team->t.t_threads[f]->th.th_team_nproc ==
4987                              team->t.t_nproc);
4988       }
4989 #endif
4990 
4991 #if OMP_40_ENABLED
4992       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4993 #if KMP_AFFINITY_SUPPORTED
4994       __kmp_partition_places(team);
4995 #endif
4996 #endif
4997     } else { // team->t.t_nproc < new_nproc
4998 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4999       kmp_affin_mask_t *old_mask;
5000       if (KMP_AFFINITY_CAPABLE()) {
5001         KMP_CPU_ALLOC(old_mask);
5002       }
5003 #endif
5004 
5005       KA_TRACE(20,
5006                ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5007                 new_nproc));
5008 
5009       team->t.t_size_changed = 1;
5010 
5011 #if KMP_NESTED_HOT_TEAMS
5012       int avail_threads = hot_teams[level].hot_team_nth;
5013       if (new_nproc < avail_threads)
5014         avail_threads = new_nproc;
5015       kmp_info_t **other_threads = team->t.t_threads;
5016       for (f = team->t.t_nproc; f < avail_threads; ++f) {
5017         // Adjust barrier data of reserved threads (if any) of the team
5018         // Other data will be set in __kmp_initialize_info() below.
5019         int b;
5020         kmp_balign_t *balign = other_threads[f]->th.th_bar;
5021         for (b = 0; b < bs_last_barrier; ++b) {
5022           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5023           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5024 #if USE_DEBUGGER
5025           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5026 #endif
5027         }
5028       }
5029       if (hot_teams[level].hot_team_nth >= new_nproc) {
5030         // we have all needed threads in reserve, no need to allocate any
5031         // this is only possible in mode 1; we cannot have reserved threads in mode 0
5032         KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5033         team->t.t_nproc = new_nproc; // just get reserved threads involved
5034       } else {
5035         // we may have some threads in reserve, but not enough
5036         team->t.t_nproc =
5037             hot_teams[level]
5038                 .hot_team_nth; // get reserved threads involved if any
5039         hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5040 #endif // KMP_NESTED_HOT_TEAMS
5041         if (team->t.t_max_nproc < new_nproc) {
5042           /* reallocate larger arrays */
5043           __kmp_reallocate_team_arrays(team, new_nproc);
5044           __kmp_reinitialize_team(team, new_icvs, NULL);
5045         }
5046 
5047 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5048         /* Temporarily set the full mask for the master thread before creating
5049            the workers. The reason is that workers inherit the affinity from the
5050            master, so if a lot of workers are created on a single core quickly,
5051            they don't get a chance to set their own affinity for a long time. */
5052         __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5053 #endif
5054 
5055         /* allocate new threads for the hot team */
5056         for (f = team->t.t_nproc; f < new_nproc; f++) {
5057           kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5058           KMP_DEBUG_ASSERT(new_worker);
5059           team->t.t_threads[f] = new_worker;
5060 
5061           KA_TRACE(20,
5062                    ("__kmp_allocate_team: team %d init T#%d arrived: "
5063                     "join=%llu, plain=%llu\n",
5064                     team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5065                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5066                     team->t.t_bar[bs_plain_barrier].b_arrived));
5067 
5068           { // Initialize barrier data for new threads.
5069             int b;
5070             kmp_balign_t *balign = new_worker->th.th_bar;
5071             for (b = 0; b < bs_last_barrier; ++b) {
5072               balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5073               KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5074                                KMP_BARRIER_PARENT_FLAG);
5075 #if USE_DEBUGGER
5076               balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5077 #endif
5078             }
5079           }
5080         }
5081 
5082 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5083         if (KMP_AFFINITY_CAPABLE()) {
5084           /* Restore initial master thread's affinity mask */
5085           __kmp_set_system_affinity(old_mask, TRUE);
5086           KMP_CPU_FREE(old_mask);
5087         }
5088 #endif
5089 #if KMP_NESTED_HOT_TEAMS
5090       } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5091 #endif // KMP_NESTED_HOT_TEAMS
5092       /* make sure everyone is synchronized */
5093       int old_nproc = team->t.t_nproc; // save the old value and use it to update
5094       // only the new threads below
5095       __kmp_initialize_team(team, new_nproc, new_icvs,
5096                             root->r.r_uber_thread->th.th_ident);
5097 
5098       /* reinitialize the threads */
5099       KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5100       for (f = 0; f < team->t.t_nproc; ++f)
5101         __kmp_initialize_info(team->t.t_threads[f], team, f,
5102                               __kmp_gtid_from_tid(f, team));
5103       if (level) { // set th_task_state for new threads in nested hot team
5104         // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5105         // only need to set the th_task_state for the new threads. th_task_state
5106         // for master thread will not be accurate until after this in
5107         // __kmp_fork_call(), so we look to the master's memo_stack to get the
5108         // correct value.
5109         for (f = old_nproc; f < team->t.t_nproc; ++f)
5110           team->t.t_threads[f]->th.th_task_state =
5111               team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5112       } else { // set th_task_state for new threads in non-nested hot team
5113         int old_state =
5114             team->t.t_threads[0]->th.th_task_state; // copy master's state
5115         for (f = old_nproc; f < team->t.t_nproc; ++f)
5116           team->t.t_threads[f]->th.th_task_state = old_state;
5117       }
5118 
5119 #ifdef KMP_DEBUG
5120       for (f = 0; f < team->t.t_nproc; ++f) {
5121         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5122                          team->t.t_threads[f]->th.th_team_nproc ==
5123                              team->t.t_nproc);
5124       }
5125 #endif
5126 
5127 #if OMP_40_ENABLED
5128       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5129 #if KMP_AFFINITY_SUPPORTED
5130       __kmp_partition_places(team);
5131 #endif
5132 #endif
5133     } // Check changes in number of threads
5134 
5135 #if OMP_40_ENABLED
5136     kmp_info_t *master = team->t.t_threads[0];
5137     if (master->th.th_teams_microtask) {
5138       for (f = 1; f < new_nproc; ++f) {
5139         // propagate teams construct specific info to workers
5140         kmp_info_t *thr = team->t.t_threads[f];
5141         thr->th.th_teams_microtask = master->th.th_teams_microtask;
5142         thr->th.th_teams_level = master->th.th_teams_level;
5143         thr->th.th_teams_size = master->th.th_teams_size;
5144       }
5145     }
5146 #endif /* OMP_40_ENABLED */
5147 #if KMP_NESTED_HOT_TEAMS
5148     if (level) {
5149       // Sync barrier state for nested hot teams, not needed for outermost hot
5150       // team.
5151       for (f = 1; f < new_nproc; ++f) {
5152         kmp_info_t *thr = team->t.t_threads[f];
5153         int b;
5154         kmp_balign_t *balign = thr->th.th_bar;
5155         for (b = 0; b < bs_last_barrier; ++b) {
5156           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5157           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5158 #if USE_DEBUGGER
5159           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5160 #endif
5161         }
5162       }
5163     }
5164 #endif // KMP_NESTED_HOT_TEAMS
5165 
5166     /* reallocate space for arguments if necessary */
5167     __kmp_alloc_argv_entries(argc, team, TRUE);
5168     KMP_CHECK_UPDATE(team->t.t_argc, argc);
5169     // The hot team re-uses the previous task team,
5170     // if untouched during the previous release->gather phase.
5171 
5172     KF_TRACE(10, (" hot_team = %p\n", team));
5173 
5174 #if KMP_DEBUG
5175     if (__kmp_tasking_mode != tskm_immediate_exec) {
5176       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5177                     "task_team[1] = %p after reinit\n",
5178                     team->t.t_task_team[0], team->t.t_task_team[1]));
5179     }
5180 #endif
5181 
5182 #if OMPT_SUPPORT
5183     __ompt_team_assign_id(team, ompt_parallel_id);
5184 #endif
5185 
5186     KMP_MB();
5187 
5188     return team;
5189   }
5190 
5191   /* next, let's try to take one from the team pool */
5192   KMP_MB();
5193   for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5194     /* TODO: consider resizing undersized teams instead of reaping them, now
5195        that we have a resizing mechanism */
5196     if (team->t.t_max_nproc >= max_nproc) {
5197       /* take this team from the team pool */
5198       __kmp_team_pool = team->t.t_next_pool;
5199 
5200       /* setup the team for fresh use */
5201       __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5202 
5203       KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5204                     "task_team[1] %p to NULL\n",
5205                     &team->t.t_task_team[0], &team->t.t_task_team[1]));
5206       team->t.t_task_team[0] = NULL;
5207       team->t.t_task_team[1] = NULL;
5208 
5209       /* reallocate space for arguments if necessary */
5210       __kmp_alloc_argv_entries(argc, team, TRUE);
5211       KMP_CHECK_UPDATE(team->t.t_argc, argc);
5212 
5213       KA_TRACE(
5214           20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5215                team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5216       { // Initialize barrier data.
5217         int b;
5218         for (b = 0; b < bs_last_barrier; ++b) {
5219           team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5220 #if USE_DEBUGGER
5221           team->t.t_bar[b].b_master_arrived = 0;
5222           team->t.t_bar[b].b_team_arrived = 0;
5223 #endif
5224         }
5225       }
5226 
5227 #if OMP_40_ENABLED
5228       team->t.t_proc_bind = new_proc_bind;
5229 #endif
5230 
5231       KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5232                     team->t.t_id));
5233 
5234 #if OMPT_SUPPORT
5235       __ompt_team_assign_id(team, ompt_parallel_id);
5236 #endif
5237 
5238       KMP_MB();
5239 
5240       return team;
5241     }
5242 
5243     /* reap team if it is too small, then loop back and check the next one */
5244     // not sure if this is wise, but it will be redone during the hot-teams
5245     // rewrite.
5246     /* TODO: Use technique to find the right size hot-team, don't reap them */
5247     team = __kmp_reap_team(team);
5248     __kmp_team_pool = team;
5249   }
5250 
5251   /* nothing available in the pool, no matter, make a new team! */
5252   KMP_MB();
5253   team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5254 
5255   /* and set it up */
5256   team->t.t_max_nproc = max_nproc;
5257   /* NOTE well, for some reason allocating one big buffer and dividing it up
5258      seems to really hurt performance a lot on the P4, so let's not use this */
5259   __kmp_allocate_team_arrays(team, max_nproc);
5260 
5261   KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5262   __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5263 
5264   KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5265                 "%p to NULL\n",
5266                 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5267   team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5268   // memory, no need to duplicate
5269   team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5270   // memory, no need to duplicate
5271 
5272   if (__kmp_storage_map) {
5273     __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5274   }
5275 
5276   /* allocate space for arguments */
5277   __kmp_alloc_argv_entries(argc, team, FALSE);
5278   team->t.t_argc = argc;
5279 
5280   KA_TRACE(20,
5281            ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5282             team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5283   { // Initialize barrier data.
5284     int b;
5285     for (b = 0; b < bs_last_barrier; ++b) {
5286       team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5287 #if USE_DEBUGGER
5288       team->t.t_bar[b].b_master_arrived = 0;
5289       team->t.t_bar[b].b_team_arrived = 0;
5290 #endif
5291     }
5292   }
5293 
5294 #if OMP_40_ENABLED
5295   team->t.t_proc_bind = new_proc_bind;
5296 #endif
5297 
5298 #if OMPT_SUPPORT
5299   __ompt_team_assign_id(team, ompt_parallel_id);
5300   team->t.ompt_serialized_team_info = NULL;
5301 #endif
5302 
5303   KMP_MB();
5304 
5305   KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5306                 team->t.t_id));
5307 
5308   return team;
5309 }
5310 
5311 /* TODO implement hot-teams at all levels */
5312 /* TODO implement lazy thread release on demand (disband request) */
5313 
5314 /* free the team.  return it to the team pool.  release all the threads
5315  * associated with it */
5316 void __kmp_free_team(kmp_root_t *root,
5317                      kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5318   int f;
5319   KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5320                 team->t.t_id));
5321 
5322   /* verify state */
5323   KMP_DEBUG_ASSERT(root);
5324   KMP_DEBUG_ASSERT(team);
5325   KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5326   KMP_DEBUG_ASSERT(team->t.t_threads);
5327 
5328   int use_hot_team = team == root->r.r_hot_team;
5329 #if KMP_NESTED_HOT_TEAMS
5330   int level;
5331   kmp_hot_team_ptr_t *hot_teams;
5332   if (master) {
5333     level = team->t.t_active_level - 1;
5334     if (master->th.th_teams_microtask) { // in teams construct?
5335       if (master->th.th_teams_size.nteams > 1) {
5336         ++level; // level was not increased in teams construct for
5337         // team_of_masters
5338       }
5339       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5340           master->th.th_teams_level == team->t.t_level) {
5341         ++level; // level was not increased in teams construct for
5342         // team_of_workers before the parallel
5343       } // team->t.t_level will be increased inside parallel
5344     }
5345     hot_teams = master->th.th_hot_teams;
5346     if (level < __kmp_hot_teams_max_level) {
5347       KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5348       use_hot_team = 1;
5349     }
5350   }
5351 #endif // KMP_NESTED_HOT_TEAMS
5352 
5353   /* team is done working */
5354   TCW_SYNC_PTR(team->t.t_pkfn,
5355                NULL); // Important for Debugging Support Library.
5356   team->t.t_copyin_counter = 0; // init counter for possible reuse
5357   // Do not reset pointer to parent team to NULL for hot teams.
5358 
5359   /* if this is a non-hot team, release our threads */
5360   if (!use_hot_team) {
5361     if (__kmp_tasking_mode != tskm_immediate_exec) {
5362       // Wait for threads to reach reapable state
5363       for (f = 1; f < team->t.t_nproc; ++f) {
5364         KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5365         kmp_info_t *th = team->t.t_threads[f];
5366         volatile kmp_uint32 *state = &th->th.th_reap_state;
5367         while (*state != KMP_SAFE_TO_REAP) {
5368 #if KMP_OS_WINDOWS
5369           // On Windows a thread can be killed at any time, check this
5370           DWORD ecode;
5371           if (!__kmp_is_thread_alive(th, &ecode)) {
5372             *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5373             break;
5374           }
5375 #endif
5376           // first check if thread is sleeping
5377           kmp_flag_64 fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5378           if (fl.is_sleeping())
5379             fl.resume(__kmp_gtid_from_thread(th));
5380           KMP_CPU_PAUSE();
5381         }
5382       }
5383 
5384       // Delete task teams
5385       int tt_idx;
5386       for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5387         kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5388         if (task_team != NULL) {
5389           for (f = 0; f < team->t.t_nproc;
5390                ++f) { // Have all threads unref task teams
5391             team->t.t_threads[f]->th.th_task_team = NULL;
5392           }
5393           KA_TRACE(
5394               20,
5395               ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5396                __kmp_get_gtid(), task_team, team->t.t_id));
5397 #if KMP_NESTED_HOT_TEAMS
5398           __kmp_free_task_team(master, task_team);
5399 #endif
5400           team->t.t_task_team[tt_idx] = NULL;
5401         }
5402       }
5403     }
5404 
5405     // Reset pointer to parent team only for non-hot teams.
5406     team->t.t_parent = NULL;
5407     team->t.t_level = 0;
5408     team->t.t_active_level = 0;
5409 
5410     /* free the worker threads */
5411     for (f = 1; f < team->t.t_nproc; ++f) {
5412       KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5413       __kmp_free_thread(team->t.t_threads[f]);
5414       team->t.t_threads[f] = NULL;
5415     }
5416 
5417     /* put the team back in the team pool */
5418     /* TODO limit size of team pool, call reap_team if pool too large */
5419     team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5420     __kmp_team_pool = (volatile kmp_team_t *)team;
5421   }
5422 
5423   KMP_MB();
5424 }
5425 
5426 /* reap the team.  destroy it, reclaim all its resources and free its memory */
5427 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5428   kmp_team_t *next_pool = team->t.t_next_pool;
5429 
5430   KMP_DEBUG_ASSERT(team);
5431   KMP_DEBUG_ASSERT(team->t.t_dispatch);
5432   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5433   KMP_DEBUG_ASSERT(team->t.t_threads);
5434   KMP_DEBUG_ASSERT(team->t.t_argv);
5435 
5436   /* TODO clean the threads that are a part of this? */
5437 
5438   /* free stuff */
5439   __kmp_free_team_arrays(team);
5440   if (team->t.t_argv != &team->t.t_inline_argv[0])
5441     __kmp_free((void *)team->t.t_argv);
5442   __kmp_free(team);
5443 
5444   KMP_MB();
5445   return next_pool;
5446 }
5447 
5448 // Free the thread.  Don't reap it, just place it on the pool of available
5449 // threads.
5450 //
5451 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5452 // binding for the affinity mechanism to be useful.
5453 //
5454 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5455 // However, we want to avoid a potential performance problem by always
5456 // scanning through the list to find the correct point at which to insert
5457 // the thread (potential N**2 behavior).  To do this we keep track of the
5458 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5459 // With single-level parallelism, threads will always be added to the tail
5460 // of the list, kept track of by __kmp_thread_pool_insert_pt.  With nested
5461 // parallelism, all bets are off and we may need to scan through the entire
5462 // free list.
5463 //
5464 // This change also has a potentially large performance benefit, for some
5465 // applications.  Previously, as threads were freed from the hot team, they
5466 // would be placed back on the free list in inverse order.  If the hot team
5467 // grew back to its original size, then the freed threads would be placed
5468 // back on the hot team in reverse order.  This could cause bad cache
5469 // locality problems in programs where the size of the hot team regularly
5470 // grew and shrank.
5471 //
5472 // Now, for single-level parallelism, the OMP tid is always == gtid.
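//
// Illustrative sketch (hypothetical gtids): if the sorted pool currently holds
// gtids 2 -> 3 -> 7 and thread 5 is freed, the scan below starts at
// __kmp_thread_pool_insert_pt (or at the pool head if the remembered point is
// already past gtid 5), links 5 in between 3 and 7, and records 5 as the new
// insert point so a later free of a higher gtid does not rescan from the head.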
5473 void __kmp_free_thread(kmp_info_t *this_th) {
5474   int gtid;
5475   kmp_info_t **scan;
5476   kmp_root_t *root = this_th->th.th_root;
5477 
5478   KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5479                 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5480 
5481   KMP_DEBUG_ASSERT(this_th);
5482 
5483   // When moving the thread to the pool, switch it to wait on its own b_go flag
5484   // and an uninitialized (NULL) team.
5485   int b;
5486   kmp_balign_t *balign = this_th->th.th_bar;
5487   for (b = 0; b < bs_last_barrier; ++b) {
5488     if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5489       balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5490     balign[b].bb.team = NULL;
5491     balign[b].bb.leaf_kids = 0;
5492   }
5493   this_th->th.th_task_state = 0;
5494 
5495   /* put thread back on the free pool */
5496   TCW_PTR(this_th->th.th_team, NULL);
5497   TCW_PTR(this_th->th.th_root, NULL);
5498   TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5499 
5500   // If the __kmp_thread_pool_insert_pt is already past the new insert
5501   // point, then we need to re-scan the entire list.
5502   gtid = this_th->th.th_info.ds.ds_gtid;
5503   if (__kmp_thread_pool_insert_pt != NULL) {
5504     KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5505     if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5506       __kmp_thread_pool_insert_pt = NULL;
5507     }
5508   }
5509 
5510   // Scan down the list to find the place to insert the thread.
5511   // scan is the address of a link in the list, possibly the address of
5512   // __kmp_thread_pool itself.
5513   //
5514   // In the absence of nested parallelism, the for loop will have 0 iterations.
5515   if (__kmp_thread_pool_insert_pt != NULL) {
5516     scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5517   } else {
5518     scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5519   }
5520   for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5521        scan = &((*scan)->th.th_next_pool))
5522     ;
5523 
5524   // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5525   // to its address.
5526   TCW_PTR(this_th->th.th_next_pool, *scan);
5527   __kmp_thread_pool_insert_pt = *scan = this_th;
5528   KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5529                    (this_th->th.th_info.ds.ds_gtid <
5530                     this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5531   TCW_4(this_th->th.th_in_pool, TRUE);
5532   __kmp_thread_pool_nth++;
5533 
5534   TCW_4(__kmp_nth, __kmp_nth - 1);
5535   root->r.r_cg_nthreads--;
5536 
5537 #ifdef KMP_ADJUST_BLOCKTIME
5538   /* Adjust blocktime back to user setting or default if necessary */
5539   /* Middle initialization might never have occurred                */
5540   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5541     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5542     if (__kmp_nth <= __kmp_avail_proc) {
5543       __kmp_zero_bt = FALSE;
5544     }
5545   }
5546 #endif /* KMP_ADJUST_BLOCKTIME */
5547 
5548   KMP_MB();
5549 }
5550 
5551 /* ------------------------------------------------------------------------ */
5552 
5553 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5554   int gtid = this_thr->th.th_info.ds.ds_gtid;
5555   /*    void                 *stack_data;*/
5556   kmp_team_t *(*volatile pteam);
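  // Descriptive note: pteam is a volatile pointer to this thread's th_team field;
  // it is read with TCR_SYNC_PTR below because the master publishes the team
  // asynchronously while this worker waits at the fork barrier.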
5557 
5558   KMP_MB();
5559   KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5560 
5561   if (__kmp_env_consistency_check) {
5562     this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5563   }
5564 
5565 #if OMPT_SUPPORT
5566   if (ompt_enabled) {
5567     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5568     this_thr->th.ompt_thread_info.wait_id = 0;
5569     this_thr->th.ompt_thread_info.idle_frame = __builtin_frame_address(0);
5570     if (ompt_callbacks.ompt_callback(ompt_event_thread_begin)) {
5571       __ompt_thread_begin(ompt_thread_worker, gtid);
5572     }
5573   }
5574 #endif
5575 
5576   /* This is the place where threads wait for work */
5577   while (!TCR_4(__kmp_global.g.g_done)) {
5578     KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5579     KMP_MB();
5580 
5581     /* wait for work to do */
5582     KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5583 
5584 #if OMPT_SUPPORT
5585     if (ompt_enabled) {
5586       this_thr->th.ompt_thread_info.state = ompt_state_idle;
5587     }
5588 #endif
5589 
5590     /* No tid yet since not part of a team */
5591     __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5592 
5593 #if OMPT_SUPPORT
5594     if (ompt_enabled) {
5595       this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5596     }
5597 #endif
5598 
5599     pteam = (kmp_team_t * (*))(&this_thr->th.th_team);
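    // pteam is the address of this thread's th_team slot; the master thread
    // writes the new team pointer into that slot when it hands this worker to
    // a team, so the TCR_SYNC_PTR(*pteam) check below sees whether a team has
    // been assigned.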
5600 
5601     /* have we been allocated? */
5602     if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5603 #if OMPT_SUPPORT
5604       ompt_task_info_t *task_info;
5605       ompt_parallel_id_t my_parallel_id;
5606       if (ompt_enabled) {
5607         task_info = __ompt_get_taskinfo(0);
5608         my_parallel_id = (*pteam)->t.ompt_team_info.parallel_id;
5609       }
5610 #endif
5611       /* we were just woken up, so run our new task */
5612       if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5613         int rc;
5614         KA_TRACE(20,
5615                  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5616                   gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5617                   (*pteam)->t.t_pkfn));
5618 
5619         updateHWFPControl(*pteam);
5620 
5621 #if OMPT_SUPPORT
5622         if (ompt_enabled) {
5623           this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5624           // Initialize OMPT task id for implicit task.
5625           int tid = __kmp_tid_from_gtid(gtid);
5626           task_info->task_id = __ompt_task_id_new(tid);
5627         }
5628 #endif
5629 
5630         {
5631           KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
5632           KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
5633           rc = (*pteam)->t.t_invoke(gtid);
5634         }
5635         KMP_ASSERT(rc);
5636 
5637 #if OMPT_SUPPORT
5638         if (ompt_enabled) {
5639           /* no frame set while outside task */
5640           task_info->frame.exit_runtime_frame = NULL;
5641 
5642           this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5643         }
5644 #endif
5645         KMP_MB();
5646         KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5647                       gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5648                       (*pteam)->t.t_pkfn));
5649       }
5650       /* join barrier after parallel region */
5651       __kmp_join_barrier(gtid);
5652 #if OMPT_SUPPORT && OMPT_TRACE
5653       if (ompt_enabled) {
5654         if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
5655           // don't access *pteam here: it may have already been freed
5656           // by the master thread behind the barrier (possible race)
5657           ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
5658               my_parallel_id, task_info->task_id);
5659         }
5660         task_info->frame.exit_runtime_frame = NULL;
5661         task_info->task_id = 0;
5662       }
5663 #endif
5664     }
5665   }
5666   TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5667 
5668 #if OMPT_SUPPORT
5669   if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_thread_end)) {
5670     __ompt_thread_end(ompt_thread_worker, gtid);
5671   }
5672 #endif
5673 
5674   this_thr->th.th_task_team = NULL;
5675   /* run the destructors for the threadprivate data for this thread */
5676   __kmp_common_destroy_gtid(gtid);
5677 
5678   KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5679   KMP_MB();
5680   return this_thr;
5681 }
5682 
5683 /* ------------------------------------------------------------------------ */
5684 
5685 void __kmp_internal_end_dest(void *specific_gtid) {
5686 #if KMP_COMPILER_ICC
5687 #pragma warning(push)
5688 #pragma warning(disable : 810) // conversion from "void *" to "int" may lose
5689 // significant bits
5690 #endif
5691   // Make sure no significant bits are lost
5692   int gtid = (kmp_intptr_t)specific_gtid - 1;
5693 #if KMP_COMPILER_ICC
5694 #pragma warning(pop)
5695 #endif
5696 
5697   KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
  /* NOTE: the gtid is stored as gtid+1 in the thread-local-storage
   * this is because 0 is reserved for the nothing-stored case */
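  /* For example, a stored value of 1 decodes to gtid 0 in the subtraction
     above, while a stored value of 0 means that no gtid was ever associated
     with this thread. */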
5700 
5701   /* josh: One reason for setting the gtid specific data even when it is being
5702      destroyed by pthread is to allow gtid lookup through thread specific data
5703      (__kmp_gtid_get_specific).  Some of the code, especially stat code,
5704      that gets executed in the call to __kmp_internal_end_thread, actually
5705      gets the gtid through the thread specific data.  Setting it here seems
5706      rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread
5707      to run smoothly.
5708      todo: get rid of this after we remove the dependence on
5709      __kmp_gtid_get_specific  */
5710   if (gtid >= 0 && KMP_UBER_GTID(gtid))
5711     __kmp_gtid_set_specific(gtid);
5712 #ifdef KMP_TDATA_GTID
5713   __kmp_gtid = gtid;
5714 #endif
5715   __kmp_internal_end_thread(gtid);
5716 }
5717 
5718 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5719 
// 2009-09-08 (lev): It looks like the destructor does not work. In simple test
// cases destructors work perfectly, but in the real libomp.so I have no
// evidence it is ever called. However, the -fini linker option in makefile.mk
// works fine.
5723 
5724 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5725   __kmp_internal_end_atexit();
5726 }
5727 
5728 void __kmp_internal_end_fini(void) { __kmp_internal_end_atexit(); }
5729 
5730 #endif
5731 
5732 /* [Windows] josh: when the atexit handler is called, there may still be more
5733    than one thread alive */
5734 void __kmp_internal_end_atexit(void) {
5735   KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5736   /* [Windows]
5737      josh: ideally, we want to completely shutdown the library in this atexit
5738      handler, but stat code that depends on thread specific data for gtid fails
5739      because that data becomes unavailable at some point during the shutdown, so
5740      we call __kmp_internal_end_thread instead. We should eventually remove the
5741      dependency on __kmp_get_specific_gtid in the stat code and use
5742      __kmp_internal_end_library to cleanly shutdown the library.
5743 
5744      // TODO: Can some of this comment about GVS be removed?
5745      I suspect that the offending stat code is executed when the calling thread
5746      tries to clean up a dead root thread's data structures, resulting in GVS
5747      code trying to close the GVS structures for that thread, but since the stat
5748      code uses __kmp_get_specific_gtid to get the gtid with the assumption that
     the calling thread is cleaning up itself instead of another thread, it
     gets confused. This happens because allowing a thread to unregister and
     clean up another thread is a recent modification for addressing an issue.
5752      Based on the current design (20050722), a thread may end up
5753      trying to unregister another thread only if thread death does not trigger
5754      the calling of __kmp_internal_end_thread.  For Linux* OS, there is the
5755      thread specific data destructor function to detect thread death. For
5756      Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
     is nothing.  Thus, the workaround is applicable only to the Windows static
     library. */
5759   __kmp_internal_end_library(-1);
5760 #if KMP_OS_WINDOWS
5761   __kmp_close_console();
5762 #endif
5763 }
5764 
5765 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5766   // It is assumed __kmp_forkjoin_lock is acquired.
5767 
5768   int gtid;
5769 
5770   KMP_DEBUG_ASSERT(thread != NULL);
5771 
5772   gtid = thread->th.th_info.ds.ds_gtid;
5773 
5774   if (!is_root) {
5775 
5776     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5777       /* Assume the threads are at the fork barrier here */
5778       KA_TRACE(
5779           20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5780                gtid));
5781       /* Need release fence here to prevent seg faults for tree forkjoin barrier
5782        * (GEH) */
5783       ANNOTATE_HAPPENS_BEFORE(thread);
5784       kmp_flag_64 flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
5785       __kmp_release_64(&flag);
5786     }
5787 
5788     // Terminate OS thread.
5789     __kmp_reap_worker(thread);
5790 
5791     // The thread was killed asynchronously.  If it was actively
5792     // spinning in the thread pool, decrement the global count.
5793     //
5794     // There is a small timing hole here - if the worker thread was just waking
    // up after sleeping in the pool, had reset its th_active_in_pool flag but
5796     // not decremented the global counter __kmp_thread_pool_active_nth yet, then
5797     // the global counter might not get updated.
5798     //
5799     // Currently, this can only happen as the library is unloaded,
5800     // so there are no harmful side effects.
5801     if (thread->th.th_active_in_pool) {
5802       thread->th.th_active_in_pool = FALSE;
5803       KMP_TEST_THEN_DEC32(&__kmp_thread_pool_active_nth);
5804       KMP_DEBUG_ASSERT(TCR_4(__kmp_thread_pool_active_nth) >= 0);
5805     }
5806 
5807     // Decrement # of [worker] threads in the pool.
5808     KMP_DEBUG_ASSERT(__kmp_thread_pool_nth > 0);
5809     --__kmp_thread_pool_nth;
5810   }
5811 
5812   __kmp_free_implicit_task(thread);
5813 
5814 // Free the fast memory for tasking
5815 #if USE_FAST_MEMORY
5816   __kmp_free_fast_memory(thread);
5817 #endif /* USE_FAST_MEMORY */
5818 
5819   __kmp_suspend_uninitialize_thread(thread);
5820 
5821   KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
5822   TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5823 
5824   --__kmp_all_nth;
5825 // __kmp_nth was decremented when thread is added to the pool.
5826 
5827 #ifdef KMP_ADJUST_BLOCKTIME
5828   /* Adjust blocktime back to user setting or default if necessary */
5829   /* Middle initialization might never have occurred                */
5830   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5831     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5832     if (__kmp_nth <= __kmp_avail_proc) {
5833       __kmp_zero_bt = FALSE;
5834     }
5835   }
5836 #endif /* KMP_ADJUST_BLOCKTIME */
5837 
5838   /* free the memory being used */
5839   if (__kmp_env_consistency_check) {
5840     if (thread->th.th_cons) {
5841       __kmp_free_cons_stack(thread->th.th_cons);
5842       thread->th.th_cons = NULL;
5843     }
5844   }
5845 
5846   if (thread->th.th_pri_common != NULL) {
5847     __kmp_free(thread->th.th_pri_common);
5848     thread->th.th_pri_common = NULL;
5849   }
5850 
5851   if (thread->th.th_task_state_memo_stack != NULL) {
5852     __kmp_free(thread->th.th_task_state_memo_stack);
5853     thread->th.th_task_state_memo_stack = NULL;
5854   }
5855 
5856 #if KMP_USE_BGET
5857   if (thread->th.th_local.bget_data != NULL) {
5858     __kmp_finalize_bget(thread);
5859   }
5860 #endif
5861 
5862 #if KMP_AFFINITY_SUPPORTED
5863   if (thread->th.th_affin_mask != NULL) {
5864     KMP_CPU_FREE(thread->th.th_affin_mask);
5865     thread->th.th_affin_mask = NULL;
5866   }
5867 #endif /* KMP_AFFINITY_SUPPORTED */
5868 
5869   __kmp_reap_team(thread->th.th_serial_team);
5870   thread->th.th_serial_team = NULL;
5871   __kmp_free(thread);
5872 
5873   KMP_MB();
5874 
5875 } // __kmp_reap_thread
5876 
5877 static void __kmp_internal_end(void) {
5878   int i;
5879 
5880   /* First, unregister the library */
5881   __kmp_unregister_library();
5882 
5883 #if KMP_OS_WINDOWS
5884   /* In Win static library, we can't tell when a root actually dies, so we
5885      reclaim the data structures for any root threads that have died but not
5886      unregistered themselves, in order to shut down cleanly.
5887      In Win dynamic library we also can't tell when a thread dies.  */
5888   __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
5889 // dead roots
5890 #endif
5891 
5892   for (i = 0; i < __kmp_threads_capacity; i++)
5893     if (__kmp_root[i])
5894       if (__kmp_root[i]->r.r_active)
5895         break;
5896   KMP_MB(); /* Flush all pending memory write invalidates.  */
5897   TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
5898 
5899   if (i < __kmp_threads_capacity) {
5900 #if KMP_USE_MONITOR
5901     // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
5902     KMP_MB(); /* Flush all pending memory write invalidates.  */
5903 
5904     // Need to check that monitor was initialized before reaping it. If we are
    // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
5906     // __kmp_monitor will appear to contain valid data, but it is only valid in
5907     // the parent process, not the child.
5908     // New behavior (201008): instead of keying off of the flag
5909     // __kmp_init_parallel, the monitor thread creation is keyed off
5910     // of the new flag __kmp_init_monitor.
5911     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
5912     if (TCR_4(__kmp_init_monitor)) {
5913       __kmp_reap_monitor(&__kmp_monitor);
5914       TCW_4(__kmp_init_monitor, 0);
5915     }
5916     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
5917     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
5918 #endif // KMP_USE_MONITOR
5919   } else {
5920 /* TODO move this to cleanup code */
5921 #ifdef KMP_DEBUG
5922     /* make sure that everything has properly ended */
5923     for (i = 0; i < __kmp_threads_capacity; i++) {
5924       if (__kmp_root[i]) {
5925         //                    KMP_ASSERT( ! KMP_UBER_GTID( i ) );         // AC:
5926         //                    there can be uber threads alive here
5927         KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
5928       }
5929     }
5930 #endif
5931 
5932     KMP_MB();
5933 
5934     // Reap the worker threads.
5935     // This is valid for now, but be careful if threads are reaped sooner.
    while (__kmp_thread_pool != NULL) { // Loop over all threads in the pool.
5937       // Get the next thread from the pool.
5938       kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
5939       __kmp_thread_pool = thread->th.th_next_pool;
5940       // Reap it.
5941       KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
5942       thread->th.th_next_pool = NULL;
5943       thread->th.th_in_pool = FALSE;
5944       __kmp_reap_thread(thread, 0);
5945     }
5946     __kmp_thread_pool_insert_pt = NULL;
5947 
5948     // Reap teams.
    while (__kmp_team_pool != NULL) { // Loop over all teams in the pool.
5950       // Get the next team from the pool.
5951       kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
5952       __kmp_team_pool = team->t.t_next_pool;
5953       // Reap it.
5954       team->t.t_next_pool = NULL;
5955       __kmp_reap_team(team);
5956     }
5957 
5958     __kmp_reap_task_teams();
5959 
5960     for (i = 0; i < __kmp_threads_capacity; ++i) {
5961       // TBD: Add some checking...
5962       // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
5963     }
5964 
5965     /* Make sure all threadprivate destructors get run by joining with all
5966        worker threads before resetting this flag */
5967     TCW_SYNC_4(__kmp_init_common, FALSE);
5968 
5969     KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
5970     KMP_MB();
5971 
5972 #if KMP_USE_MONITOR
5973     // See note above: One of the possible fixes for CQ138434 / CQ140126
5974     //
5975     // FIXME: push both code fragments down and CSE them?
5976     // push them into __kmp_cleanup() ?
5977     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
5978     if (TCR_4(__kmp_init_monitor)) {
5979       __kmp_reap_monitor(&__kmp_monitor);
5980       TCW_4(__kmp_init_monitor, 0);
5981     }
5982     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
5983     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
5984 #endif
5985   } /* else !__kmp_global.t_active */
5986   TCW_4(__kmp_init_gtid, FALSE);
5987   KMP_MB(); /* Flush all pending memory write invalidates.  */
5988 
5989   __kmp_cleanup();
5990 #if OMPT_SUPPORT
5991   ompt_fini();
5992 #endif
5993 }
5994 
5995 void __kmp_internal_end_library(int gtid_req) {
5996   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
5997   /* this shouldn't be a race condition because __kmp_internal_end() is the
5998      only place to clear __kmp_serial_init */
5999   /* we'll check this later too, after we get the lock */
6000   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
  // redundant, because the next check will work in any case.
6002   if (__kmp_global.g.g_abort) {
6003     KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6004     /* TODO abort? */
6005     return;
6006   }
6007   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6008     KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6009     return;
6010   }
6011 
6012   KMP_MB(); /* Flush all pending memory write invalidates.  */
6013 
6014   /* find out who we are and what we should do */
6015   {
6016     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6017     KA_TRACE(
6018         10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req));
6019     if (gtid == KMP_GTID_SHUTDOWN) {
6020       KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6021                     "already shutdown\n"));
6022       return;
6023     } else if (gtid == KMP_GTID_MONITOR) {
6024       KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6025                     "registered, or system shutdown\n"));
6026       return;
6027     } else if (gtid == KMP_GTID_DNE) {
6028       KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6029                     "shutdown\n"));
      /* we don't know who we are, but we may still shut down the library */
6031     } else if (KMP_UBER_GTID(gtid)) {
6032       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6033       if (__kmp_root[gtid]->r.r_active) {
6034         __kmp_global.g.g_abort = -1;
6035         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6036         KA_TRACE(10,
6037                  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6038                   gtid));
6039         return;
6040       } else {
6041         KA_TRACE(
6042             10,
6043             ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6044         __kmp_unregister_root_current_thread(gtid);
6045       }
6046     } else {
6047 /* worker threads may call this function through the atexit handler, if they
6048  * call exit() */
6049 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6050    TODO: do a thorough shutdown instead */
6051 #ifdef DUMP_DEBUG_ON_EXIT
6052       if (__kmp_debug_buf)
6053         __kmp_dump_debug_buffer();
6054 #endif
6055       return;
6056     }
6057   }
6058   /* synchronize the termination process */
6059   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6060 
6061   /* have we already finished */
6062   if (__kmp_global.g.g_abort) {
6063     KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6064     /* TODO abort? */
6065     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6066     return;
6067   }
6068   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6069     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6070     return;
6071   }
6072 
6073   /* We need this lock to enforce mutex between this reading of
6074      __kmp_threads_capacity and the writing by __kmp_register_root.
6075      Alternatively, we can use a counter of roots that is atomically updated by
6076      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6077      __kmp_internal_end_*.  */
6078   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6079 
6080   /* now we can safely conduct the actual termination */
6081   __kmp_internal_end();
6082 
6083   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6084   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6085 
6086   KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6087 
6088 #ifdef DUMP_DEBUG_ON_EXIT
6089   if (__kmp_debug_buf)
6090     __kmp_dump_debug_buffer();
6091 #endif
6092 
6093 #if KMP_OS_WINDOWS
6094   __kmp_close_console();
6095 #endif
6096 
6097   __kmp_fini_allocator();
6098 
6099 } // __kmp_internal_end_library
6100 
6101 void __kmp_internal_end_thread(int gtid_req) {
6102   int i;
6103 
6104   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6105   /* this shouldn't be a race condition because __kmp_internal_end() is the
6106    * only place to clear __kmp_serial_init */
6107   /* we'll check this later too, after we get the lock */
6108   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6109   // redundant, because the next check will work in any case.
6110   if (__kmp_global.g.g_abort) {
6111     KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6112     /* TODO abort? */
6113     return;
6114   }
6115   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6116     KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6117     return;
6118   }
6119 
6120   KMP_MB(); /* Flush all pending memory write invalidates.  */
6121 
6122   /* find out who we are and what we should do */
6123   {
6124     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6125     KA_TRACE(10,
6126              ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req));
6127     if (gtid == KMP_GTID_SHUTDOWN) {
6128       KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6129                     "already shutdown\n"));
6130       return;
6131     } else if (gtid == KMP_GTID_MONITOR) {
6132       KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6133                     "registered, or system shutdown\n"));
6134       return;
6135     } else if (gtid == KMP_GTID_DNE) {
6136       KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6137                     "shutdown\n"));
6138       return;
6139       /* we don't know who we are */
6140     } else if (KMP_UBER_GTID(gtid)) {
6141       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6142       if (__kmp_root[gtid]->r.r_active) {
6143         __kmp_global.g.g_abort = -1;
6144         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6145         KA_TRACE(10,
6146                  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6147                   gtid));
6148         return;
6149       } else {
6150         KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6151                       gtid));
6152         __kmp_unregister_root_current_thread(gtid);
6153       }
6154     } else {
6155       /* just a worker thread, let's leave */
6156       KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6157 
6158       if (gtid >= 0) {
6159         __kmp_threads[gtid]->th.th_task_team = NULL;
6160       }
6161 
6162       KA_TRACE(10,
6163                ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6164                 gtid));
6165       return;
6166     }
6167   }
6168 #if defined KMP_DYNAMIC_LIB
  // AC: let's not shut down the Linux* OS dynamic library at the exit of an
  // uber thread, because it is better to shut down later in the library
  // destructor. The reason for this change is a performance problem that
  // occurs when a non-OpenMP thread repeatedly forks and joins many OpenMP
  // threads in a loop. We can save a lot of time by keeping worker threads
  // alive until program shutdown.
  // OM: Removed the Linux* OS restriction to fix the crash on OS X*
  // (DPD200239966) and Windows (DPD200287443) that occurs when using critical
  // sections from foreign threads.
6177   KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6178   return;
6179 #endif
6180   /* synchronize the termination process */
6181   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6182 
6183   /* have we already finished */
6184   if (__kmp_global.g.g_abort) {
6185     KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6186     /* TODO abort? */
6187     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6188     return;
6189   }
6190   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6191     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6192     return;
6193   }
6194 
6195   /* We need this lock to enforce mutex between this reading of
6196      __kmp_threads_capacity and the writing by __kmp_register_root.
6197      Alternatively, we can use a counter of roots that is atomically updated by
6198      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6199      __kmp_internal_end_*.  */
6200 
6201   /* should we finish the run-time?  are all siblings done? */
6202   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6203 
6204   for (i = 0; i < __kmp_threads_capacity; ++i) {
6205     if (KMP_UBER_GTID(i)) {
6206       KA_TRACE(
6207           10,
6208           ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6209       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6210       __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6211       return;
6212     }
6213   }
6214 
6215   /* now we can safely conduct the actual termination */
6216 
6217   __kmp_internal_end();
6218 
6219   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6220   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6221 
6222   KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6223 
6224 #ifdef DUMP_DEBUG_ON_EXIT
6225   if (__kmp_debug_buf)
6226     __kmp_dump_debug_buffer();
6227 #endif
6228 } // __kmp_internal_end_thread
6229 
6230 // -----------------------------------------------------------------------------
6231 // Library registration stuff.
6232 
6233 static long __kmp_registration_flag = 0;
6234 // Random value used to indicate library initialization.
6235 static char *__kmp_registration_str = NULL;
6236 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6237 
6238 static inline char *__kmp_reg_status_name() {
6239   /* On RHEL 3u5 if linked statically, getpid() returns different values in
6240      each thread. If registration and unregistration go in different threads
     (omp_misc_other_root_exit.cpp test case), the name of the
     registered_lib_env env var cannot be found, because the name will contain
     a different pid. */
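  /* For example, a process with pid 12345 (illustrative) gets the variable
     name "__KMP_REGISTERED_LIB_12345". */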
6243   return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
} // __kmp_reg_status_name
6245 
6246 void __kmp_register_library_startup(void) {
6247 
6248   char *name = __kmp_reg_status_name(); // Name of the environment variable.
6249   int done = 0;
6250   union {
6251     double dtime;
6252     long ltime;
6253   } time;
6254 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6255   __kmp_initialize_system_tick();
6256 #endif
6257   __kmp_read_system_time(&time.dtime);
6258   __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6259   __kmp_registration_str =
6260       __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6261                        __kmp_registration_flag, KMP_LIBRARY_FILE);
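  // The stored value has the form "<flag address>-<flag value>-<file>", e.g.
  // (with illustrative numbers) "0x7f1234567890-cafe1234-libomp.so": the
  // address of __kmp_registration_flag, the flag itself (the 0xCAFE0000
  // signature combined with the low 16 bits of the time read above), and the
  // library file name.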
6262 
6263   KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6264                 __kmp_registration_str));
6265 
6266   while (!done) {
6267 
6268     char *value = NULL; // Actual value of the environment variable.
6269 
    // Set the environment variable, but do not overwrite it if it already
    // exists.
    __kmp_env_set(name, __kmp_registration_str, 0);
    // Check that the variable was actually written.
6273     value = __kmp_env_get(name);
6274     if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6275 
6276       done = 1; // Ok, environment variable set successfully, exit the loop.
6277 
6278     } else {
6279 
      // Oops. Write failed. Another copy of the OpenMP RTL is in memory.
      // Check whether it is alive or dead.
6282       int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6283       char *tail = value;
6284       char *flag_addr_str = NULL;
6285       char *flag_val_str = NULL;
6286       char const *file_name = NULL;
6287       __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6288       __kmp_str_split(tail, '-', &flag_val_str, &tail);
6289       file_name = tail;
6290       if (tail != NULL) {
6291         long *flag_addr = 0;
6292         long flag_val = 0;
6293         KMP_SSCANF(flag_addr_str, "%p", &flag_addr);
6294         KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6295         if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6296           // First, check whether environment-encoded address is mapped into
6297           // addr space.
6298           // If so, dereference it to see if it still has the right value.
6299           if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6300             neighbor = 1;
6301           } else {
6302             // If not, then we know the other copy of the library is no longer
6303             // running.
6304             neighbor = 2;
6305           }
6306         }
6307       }
6308       switch (neighbor) {
6309       case 0: // Cannot parse environment variable -- neighbor status unknown.
        // Assume it is the incompatible format of a future version of the
        // library, and assume the other library is alive.
        // WARN( ... ); // TODO: Issue a warning.
        file_name = "unknown library";
      // Attention! Falling through to the next case. That's intentional.
6315       case 1: { // Neighbor is alive.
        // Check whether it is allowed.
6317         char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6318         if (!__kmp_str_match_true(duplicate_ok)) {
6319           // That's not allowed. Issue fatal error.
6320           __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6321                       KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6322         }
6323         KMP_INTERNAL_FREE(duplicate_ok);
6324         __kmp_duplicate_library_ok = 1;
6325         done = 1; // Exit the loop.
6326       } break;
6327       case 2: { // Neighbor is dead.
6328         // Clear the variable and try to register library again.
6329         __kmp_env_unset(name);
6330       } break;
6331       default: { KMP_DEBUG_ASSERT(0); } break;
6332       }
6333     }
6334     KMP_INTERNAL_FREE((void *)value);
6335   }
6336   KMP_INTERNAL_FREE((void *)name);
6337 
6338 } // func __kmp_register_library_startup
6339 
6340 void __kmp_unregister_library(void) {
6341 
6342   char *name = __kmp_reg_status_name();
6343   char *value = __kmp_env_get(name);
6344 
6345   KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6346   KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6347   if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6348     // Ok, this is our variable. Delete it.
6349     __kmp_env_unset(name);
6350   }
6351 
6352   KMP_INTERNAL_FREE(__kmp_registration_str);
6353   KMP_INTERNAL_FREE(value);
6354   KMP_INTERNAL_FREE(name);
6355 
6356   __kmp_registration_flag = 0;
6357   __kmp_registration_str = NULL;
6358 
6359 } // __kmp_unregister_library
6360 
6361 // End of Library registration stuff.
6362 // -----------------------------------------------------------------------------
6363 
6364 #if KMP_MIC_SUPPORTED
6365 
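// Note on the decoding below, based on the CPUID leaf 1 EAX layout (bits 4-7
// model, bits 8-11 family, bits 16-19 extended model): masking with 0xff0
// keeps the family/model fields, so 0xB10 identifies KNC (mic2), while the
// wider 0xf0ff0 mask also keeps the extended model, so 0x50670 identifies
// mic3.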
6366 static void __kmp_check_mic_type() {
6367   kmp_cpuid_t cpuid_state = {0};
6368   kmp_cpuid_t *cs_p = &cpuid_state;
6369   __kmp_x86_cpuid(1, 0, cs_p);
6370   // We don't support mic1 at the moment
6371   if ((cs_p->eax & 0xff0) == 0xB10) {
6372     __kmp_mic_type = mic2;
6373   } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6374     __kmp_mic_type = mic3;
6375   } else {
6376     __kmp_mic_type = non_mic;
6377   }
6378 }
6379 
6380 #endif /* KMP_MIC_SUPPORTED */
6381 
6382 static void __kmp_do_serial_initialize(void) {
6383   int i, gtid;
6384   int size;
6385 
6386   KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6387 
6388   KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6389   KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6390   KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6391   KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6392   KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6393 
6394 #if OMPT_SUPPORT
6395   ompt_pre_init();
6396 #endif
6397 
6398   __kmp_validate_locks();
6399 
6400   /* Initialize internal memory allocator */
6401   __kmp_init_allocator();
6402 
6403   /* Register the library startup via an environment variable and check to see
6404      whether another copy of the library is already registered. */
6405 
6406   __kmp_register_library_startup();
6407 
6408   /* TODO reinitialization of library */
6409   if (TCR_4(__kmp_global.g.g_done)) {
6410     KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6411   }
6412 
6413   __kmp_global.g.g_abort = 0;
6414   TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6415 
6416 /* initialize the locks */
6417 #if KMP_USE_ADAPTIVE_LOCKS
6418 #if KMP_DEBUG_ADAPTIVE_LOCKS
6419   __kmp_init_speculative_stats();
6420 #endif
6421 #endif
6422 #if KMP_STATS_ENABLED
6423   __kmp_stats_init();
6424 #endif
6425   __kmp_init_lock(&__kmp_global_lock);
6426   __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6427   __kmp_init_lock(&__kmp_debug_lock);
6428   __kmp_init_atomic_lock(&__kmp_atomic_lock);
6429   __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6430   __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6431   __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6432   __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6433   __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6434   __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6435   __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6436   __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6437   __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6438   __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6439   __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6440   __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6441   __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6442   __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6443 #if KMP_USE_MONITOR
6444   __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6445 #endif
6446   __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6447 
6448   /* conduct initialization and initial setup of configuration */
6449 
6450   __kmp_runtime_initialize();
6451 
6452 #if KMP_MIC_SUPPORTED
6453   __kmp_check_mic_type();
6454 #endif
6455 
6456 // Some global variable initialization moved here from kmp_env_initialize()
6457 #ifdef KMP_DEBUG
6458   kmp_diag = 0;
6459 #endif
6460   __kmp_abort_delay = 0;
6461 
6462   // From __kmp_init_dflt_team_nth()
6463   /* assume the entire machine will be used */
6464   __kmp_dflt_team_nth_ub = __kmp_xproc;
6465   if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6466     __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6467   }
6468   if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6469     __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6470   }
6471   __kmp_max_nth = __kmp_sys_max_nth;
6472   __kmp_cg_max_nth = __kmp_sys_max_nth;
6473   __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6474   if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6475     __kmp_teams_max_nth = __kmp_sys_max_nth;
6476   }
6477 
6478   // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6479   // part
6480   __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6481 #if KMP_USE_MONITOR
6482   __kmp_monitor_wakeups =
6483       KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6484   __kmp_bt_intervals =
6485       KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6486 #endif
6487   // From "KMP_LIBRARY" part of __kmp_env_initialize()
6488   __kmp_library = library_throughput;
6489   // From KMP_SCHEDULE initialization
6490   __kmp_static = kmp_sch_static_balanced;
// AC: do not use analytical here, because it is non-monotonic
6492 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6493 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6494 // need to repeat assignment
6495 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6496 // bit control and barrier method control parts
6497 #if KMP_FAST_REDUCTION_BARRIER
6498 #define kmp_reduction_barrier_gather_bb ((int)1)
6499 #define kmp_reduction_barrier_release_bb ((int)1)
6500 #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6501 #define kmp_reduction_barrier_release_pat bp_hyper_bar
6502 #endif // KMP_FAST_REDUCTION_BARRIER
6503   for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6504     __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6505     __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6506     __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6507     __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6508 #if KMP_FAST_REDUCTION_BARRIER
6509     if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
6510       // lin_64 ): hyper,1
6511       __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6512       __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6513       __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6514       __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6515     }
6516 #endif // KMP_FAST_REDUCTION_BARRIER
6517   }
6518 #if KMP_FAST_REDUCTION_BARRIER
6519 #undef kmp_reduction_barrier_release_pat
6520 #undef kmp_reduction_barrier_gather_pat
6521 #undef kmp_reduction_barrier_release_bb
6522 #undef kmp_reduction_barrier_gather_bb
6523 #endif // KMP_FAST_REDUCTION_BARRIER
6524 #if KMP_MIC_SUPPORTED
6525   if (__kmp_mic_type == mic2) { // KNC
6526     // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6527     __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6528     __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6529         1; // forkjoin release
6530     __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6531     __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6532   }
6533 #if KMP_FAST_REDUCTION_BARRIER
6534   if (__kmp_mic_type == mic2) { // KNC
6535     __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6536     __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6537   }
6538 #endif // KMP_FAST_REDUCTION_BARRIER
6539 #endif // KMP_MIC_SUPPORTED
6540 
6541 // From KMP_CHECKS initialization
6542 #ifdef KMP_DEBUG
6543   __kmp_env_checks = TRUE; /* development versions have the extra checks */
6544 #else
6545   __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6546 #endif
6547 
6548   // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6549   __kmp_foreign_tp = TRUE;
6550 
6551   __kmp_global.g.g_dynamic = FALSE;
6552   __kmp_global.g.g_dynamic_mode = dynamic_default;
6553 
6554   __kmp_env_initialize(NULL);
6555 
6556 // Print all messages in message catalog for testing purposes.
6557 #ifdef KMP_DEBUG
6558   char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6559   if (__kmp_str_match_true(val)) {
6560     kmp_str_buf_t buffer;
6561     __kmp_str_buf_init(&buffer);
6562     __kmp_i18n_dump_catalog(&buffer);
6563     __kmp_printf("%s", buffer.str);
6564     __kmp_str_buf_free(&buffer);
6565   }
6566   __kmp_env_free(&val);
6567 #endif
6568 
6569   __kmp_threads_capacity =
6570       __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6571   // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6572   __kmp_tp_capacity = __kmp_default_tp_capacity(
6573       __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6574 
6575   // If the library is shut down properly, both pools must be NULL. Just in
6576   // case, set them to NULL -- some memory may leak, but subsequent code will
6577   // work even if pools are not freed.
6578   KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6579   KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6580   KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6581   __kmp_thread_pool = NULL;
6582   __kmp_thread_pool_insert_pt = NULL;
6583   __kmp_team_pool = NULL;
6584 
6585   /* Allocate all of the variable sized records */
6586   /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6587    * expandable */
6588   /* Since allocation is cache-aligned, just add extra padding at the end */
6589   size =
6590       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6591       CACHE_LINE;
6592   __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6593   __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6594                                sizeof(kmp_info_t *) * __kmp_threads_capacity);
6595 
6596   /* init thread counts */
6597   KMP_DEBUG_ASSERT(__kmp_all_nth ==
6598                    0); // Asserts fail if the library is reinitializing and
6599   KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
6600   __kmp_all_nth = 0;
6601   __kmp_nth = 0;
6602 
6603   /* setup the uber master thread and hierarchy */
6604   gtid = __kmp_register_root(TRUE);
6605   KA_TRACE(10, ("__kmp_do_serial_initialize  T#%d\n", gtid));
6606   KMP_ASSERT(KMP_UBER_GTID(gtid));
6607   KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6608 
6609   KMP_MB(); /* Flush all pending memory write invalidates.  */
6610 
6611   __kmp_common_initialize();
6612 
6613 #if KMP_OS_UNIX
6614   /* invoke the child fork handler */
6615   __kmp_register_atfork();
6616 #endif
6617 
6618 #if !defined KMP_DYNAMIC_LIB
6619   {
6620     /* Invoke the exit handler when the program finishes, only for static
6621        library. For dynamic library, we already have _fini and DllMain. */
6622     int rc = atexit(__kmp_internal_end_atexit);
6623     if (rc != 0) {
6624       __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6625                   __kmp_msg_null);
6626     }
6627   }
6628 #endif
6629 
6630 #if KMP_HANDLE_SIGNALS
6631 #if KMP_OS_UNIX
6632   /* NOTE: make sure that this is called before the user installs their own
6633      signal handlers so that the user handlers are called first. this way they
6634      can return false, not call our handler, avoid terminating the library, and
6635      continue execution where they left off. */
6636   __kmp_install_signals(FALSE);
6637 #endif /* KMP_OS_UNIX */
6638 #if KMP_OS_WINDOWS
6639   __kmp_install_signals(TRUE);
6640 #endif /* KMP_OS_WINDOWS */
6641 #endif
6642 
6643   /* we have finished the serial initialization */
6644   __kmp_init_counter++;
6645 
6646   __kmp_init_serial = TRUE;
6647 
6648   if (__kmp_settings) {
6649     __kmp_env_print();
6650   }
6651 
6652 #if OMP_40_ENABLED
6653   if (__kmp_display_env || __kmp_display_env_verbose) {
6654     __kmp_env_print_2();
6655   }
6656 #endif // OMP_40_ENABLED
6657 
6658 #if OMPT_SUPPORT
6659   ompt_post_init();
6660 #endif
6661 
6662   KMP_MB();
6663 
6664   KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
6665 }
6666 
6667 void __kmp_serial_initialize(void) {
6668   if (__kmp_init_serial) {
6669     return;
6670   }
6671   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6672   if (__kmp_init_serial) {
6673     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6674     return;
6675   }
6676   __kmp_do_serial_initialize();
6677   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6678 }
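// __kmp_serial_initialize (and the other *_initialize wrappers below) uses a
// double-checked pattern: test the init flag, acquire __kmp_initz_lock, test
// the flag again under the lock, and only then run the heavyweight
// initializer, so concurrent first calls initialize the library exactly once.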
6679 
6680 static void __kmp_do_middle_initialize(void) {
6681   int i, j;
6682   int prev_dflt_team_nth;
6683 
6684   if (!__kmp_init_serial) {
6685     __kmp_do_serial_initialize();
6686   }
6687 
6688   KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
6689 
6690   // Save the previous value for the __kmp_dflt_team_nth so that
6691   // we can avoid some reinitialization if it hasn't changed.
6692   prev_dflt_team_nth = __kmp_dflt_team_nth;
6693 
6694 #if KMP_AFFINITY_SUPPORTED
6695   // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6696   // number of cores on the machine.
6697   __kmp_affinity_initialize();
6698 
6699   // Run through the __kmp_threads array and set the affinity mask
6700   // for each root thread that is currently registered with the RTL.
6701   for (i = 0; i < __kmp_threads_capacity; i++) {
6702     if (TCR_PTR(__kmp_threads[i]) != NULL) {
6703       __kmp_affinity_set_init_mask(i, TRUE);
6704     }
6705   }
6706 #endif /* KMP_AFFINITY_SUPPORTED */
6707 
6708   KMP_ASSERT(__kmp_xproc > 0);
6709   if (__kmp_avail_proc == 0) {
6710     __kmp_avail_proc = __kmp_xproc;
6711   }
6712 
6713   // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
6714   // correct them now
6715   j = 0;
6716   while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
6717     __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
6718         __kmp_avail_proc;
6719     j++;
6720   }
6721 
6722   if (__kmp_dflt_team_nth == 0) {
6723 #ifdef KMP_DFLT_NTH_CORES
6724     // Default #threads = #cores
6725     __kmp_dflt_team_nth = __kmp_ncores;
6726     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6727                   "__kmp_ncores (%d)\n",
6728                   __kmp_dflt_team_nth));
6729 #else
6730     // Default #threads = #available OS procs
6731     __kmp_dflt_team_nth = __kmp_avail_proc;
6732     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6733                   "__kmp_avail_proc(%d)\n",
6734                   __kmp_dflt_team_nth));
6735 #endif /* KMP_DFLT_NTH_CORES */
6736   }
6737 
6738   if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
6739     __kmp_dflt_team_nth = KMP_MIN_NTH;
6740   }
6741   if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
6742     __kmp_dflt_team_nth = __kmp_sys_max_nth;
6743   }
6744 
6745   // There's no harm in continuing if the following check fails,
6746   // but it indicates an error in the previous logic.
6747   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
6748 
6749   if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
6750     // Run through the __kmp_threads array and set the num threads icv for each
6751     // root thread that is currently registered with the RTL (which has not
6752     // already explicitly set its nthreads-var with a call to
6753     // omp_set_num_threads()).
6754     for (i = 0; i < __kmp_threads_capacity; i++) {
6755       kmp_info_t *thread = __kmp_threads[i];
6756       if (thread == NULL)
6757         continue;
6758       if (thread->th.th_current_task->td_icvs.nproc != 0)
6759         continue;
6760 
6761       set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
6762     }
6763   }
6764   KA_TRACE(
6765       20,
6766       ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
6767        __kmp_dflt_team_nth));
6768 
6769 #ifdef KMP_ADJUST_BLOCKTIME
6770   /* Adjust blocktime to zero if necessary  now that __kmp_avail_proc is set */
6771   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6772     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6773     if (__kmp_nth > __kmp_avail_proc) {
6774       __kmp_zero_bt = TRUE;
6775     }
6776   }
6777 #endif /* KMP_ADJUST_BLOCKTIME */
6778 
6779   /* we have finished middle initialization */
6780   TCW_SYNC_4(__kmp_init_middle, TRUE);
6781 
6782   KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
6783 }
6784 
6785 void __kmp_middle_initialize(void) {
6786   if (__kmp_init_middle) {
6787     return;
6788   }
6789   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6790   if (__kmp_init_middle) {
6791     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6792     return;
6793   }
6794   __kmp_do_middle_initialize();
6795   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6796 }
6797 
6798 void __kmp_parallel_initialize(void) {
6799   int gtid = __kmp_entry_gtid(); // this might be a new root
6800 
6801   /* synchronize parallel initialization (for sibling) */
6802   if (TCR_4(__kmp_init_parallel))
6803     return;
6804   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6805   if (TCR_4(__kmp_init_parallel)) {
6806     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6807     return;
6808   }
6809 
6810   /* TODO reinitialization after we have already shut down */
6811   if (TCR_4(__kmp_global.g.g_done)) {
6812     KA_TRACE(
6813         10,
6814         ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
6815     __kmp_infinite_loop();
6816   }
6817 
6818   /* jc: The lock __kmp_initz_lock is already held, so calling
6819      __kmp_serial_initialize would cause a deadlock.  So we call
6820      __kmp_do_serial_initialize directly. */
6821   if (!__kmp_init_middle) {
6822     __kmp_do_middle_initialize();
6823   }
6824 
6825   /* begin initialization */
6826   KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
6827   KMP_ASSERT(KMP_UBER_GTID(gtid));
6828 
6829 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6830   // Save the FP control regs.
6831   // Worker threads will set theirs to these values at thread startup.
6832   __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
6833   __kmp_store_mxcsr(&__kmp_init_mxcsr);
6834   __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
6835 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
6836 
6837 #if KMP_OS_UNIX
6838 #if KMP_HANDLE_SIGNALS
6839   /*  must be after __kmp_serial_initialize  */
6840   __kmp_install_signals(TRUE);
6841 #endif
6842 #endif
6843 
6844   __kmp_suspend_initialize();
6845 
6846 #if defined(USE_LOAD_BALANCE)
6847   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6848     __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
6849   }
6850 #else
6851   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6852     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
6853   }
6854 #endif
6855 
6856   if (__kmp_version) {
6857     __kmp_print_version_2();
6858   }
6859 
6860   /* we have finished parallel initialization */
6861   TCW_SYNC_4(__kmp_init_parallel, TRUE);
6862 
6863   KMP_MB();
6864   KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
6865 
6866   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6867 }
6868 
6869 /* ------------------------------------------------------------------------ */
6870 
6871 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
6872                                    kmp_team_t *team) {
6873   kmp_disp_t *dispatch;
6874 
6875   KMP_MB();
6876 
6877   /* none of the threads have encountered any constructs, yet. */
6878   this_thr->th.th_local.this_construct = 0;
6879 #if KMP_CACHE_MANAGE
6880   KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
6881 #endif /* KMP_CACHE_MANAGE */
6882   dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
6883   KMP_DEBUG_ASSERT(dispatch);
6884   KMP_DEBUG_ASSERT(team->t.t_dispatch);
6885   // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
6886   // this_thr->th.th_info.ds.ds_tid ] );
6887 
6888   dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
6889 #if OMP_45_ENABLED
6890   dispatch->th_doacross_buf_idx =
6891       0; /* reset the doacross dispatch buffer counter */
6892 #endif
6893   if (__kmp_env_consistency_check)
6894     __kmp_push_parallel(gtid, team->t.t_ident);
6895 
6896   KMP_MB(); /* Flush all pending memory write invalidates.  */
6897 }
6898 
6899 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
6900                                   kmp_team_t *team) {
6901   if (__kmp_env_consistency_check)
6902     __kmp_pop_parallel(gtid, team->t.t_ident);
6903 
6904   __kmp_finish_implicit_task(this_thr);
6905 }
6906 
6907 int __kmp_invoke_task_func(int gtid) {
6908   int rc;
6909   int tid = __kmp_tid_from_gtid(gtid);
6910   kmp_info_t *this_thr = __kmp_threads[gtid];
6911   kmp_team_t *team = this_thr->th.th_team;
6912 
6913   __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
6914 #if USE_ITT_BUILD
6915   if (__itt_stack_caller_create_ptr) {
6916     __kmp_itt_stack_callee_enter(
6917         (__itt_caller)
6918             team->t.t_stack_id); // inform ittnotify about entering user's code
6919   }
6920 #endif /* USE_ITT_BUILD */
6921 #if INCLUDE_SSC_MARKS
6922   SSC_MARK_INVOKING();
6923 #endif
6924 
6925 #if OMPT_SUPPORT
6926   void *dummy;
6927   void **exit_runtime_p;
6928   ompt_task_id_t my_task_id;
6929   ompt_parallel_id_t my_parallel_id;
6930 
6931   if (ompt_enabled) {
6932     exit_runtime_p = &(team->t.t_implicit_task_taskdata[tid]
6933                            .ompt_task_info.frame.exit_runtime_frame);
6934   } else {
6935     exit_runtime_p = &dummy;
6936   }
6937 
6938 #if OMPT_TRACE
6939   my_task_id = team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id;
6940   my_parallel_id = team->t.ompt_team_info.parallel_id;
6941   if (ompt_enabled &&
6942       ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
6943     ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(my_parallel_id,
6944                                                                  my_task_id);
6945   }
6946 #endif
6947 #endif
6948 
6949   {
6950     KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
6951     KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
6952     rc =
6953         __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
6954                                tid, (int)team->t.t_argc, (void **)team->t.t_argv
6955 #if OMPT_SUPPORT
6956                                ,
6957                                exit_runtime_p
6958 #endif
6959                                );
6960 #if OMPT_SUPPORT
6961     *exit_runtime_p = NULL;
6962 #endif
6963   }
6964 
6965 #if USE_ITT_BUILD
6966   if (__itt_stack_caller_create_ptr) {
6967     __kmp_itt_stack_callee_leave(
6968         (__itt_caller)
6969             team->t.t_stack_id); // inform ittnotify about leaving user's code
6970   }
6971 #endif /* USE_ITT_BUILD */
6972   __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
6973 
6974   return rc;
6975 }
6976 
6977 #if OMP_40_ENABLED
6978 void __kmp_teams_master(int gtid) {
6979   // This routine is called by all master threads in teams construct
6980   kmp_info_t *thr = __kmp_threads[gtid];
6981   kmp_team_t *team = thr->th.th_team;
6982   ident_t *loc = team->t.t_ident;
6983   thr->th.th_set_nproc = thr->th.th_teams_size.nth;
6984   KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
6985   KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
6986   KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
6987                 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
// Launch the league of teams now, but do not let the workers execute
// (they wait on the fork barrier until the next parallel region)
6990 #if INCLUDE_SSC_MARKS
6991   SSC_MARK_FORKING();
6992 #endif
6993   __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
6994 #if OMPT_SUPPORT
6995                   (void *)thr->th.th_teams_microtask, // "unwrapped" task
6996 #endif
6997                   (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
6998                   VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
6999 #if INCLUDE_SSC_MARKS
7000   SSC_MARK_JOINING();
7001 #endif
7002 
7003   // AC: last parameter "1" eliminates join barrier which won't work because
7004   // worker threads are in a fork barrier waiting for more parallel regions
7005   __kmp_join_call(loc, gtid
7006 #if OMPT_SUPPORT
7007                   ,
7008                   fork_context_intel
7009 #endif
7010                   ,
7011                   1);
7012 }
7013 
7014 int __kmp_invoke_teams_master(int gtid) {
7015   kmp_info_t *this_thr = __kmp_threads[gtid];
7016   kmp_team_t *team = this_thr->th.th_team;
7017 #if KMP_DEBUG
7018   if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7019     KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7020                      (void *)__kmp_teams_master);
7021 #endif
7022   __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7023   __kmp_teams_master(gtid);
7024   __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7025   return 1;
7026 }
7027 #endif /* OMP_40_ENABLED */
7028 
/* This sets the requested number of threads for the next parallel region
   encountered by this team. Since this should be enclosed in the fork/join
   critical section, it should avoid race conditions with asymmetrical nested
   parallelism. */
7033 
7034 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7035   kmp_info_t *thr = __kmp_threads[gtid];
7036 
7037   if (num_threads > 0)
7038     thr->th.th_set_nproc = num_threads;
7039 }
7040 
7041 #if OMP_40_ENABLED
7042 
/* This sets the requested number of teams for the teams region and/or
   the number of threads for the next parallel region encountered. */
7045 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7046                           int num_threads) {
7047   kmp_info_t *thr = __kmp_threads[gtid];
7048   KMP_DEBUG_ASSERT(num_teams >= 0);
7049   KMP_DEBUG_ASSERT(num_threads >= 0);
7050 
7051   if (num_teams == 0)
7052     num_teams = 1; // default number of teams is 1.
  if (num_teams > __kmp_teams_max_nth) { // too many teams requested?
7054     if (!__kmp_reserve_warn) {
7055       __kmp_reserve_warn = 1;
7056       __kmp_msg(kmp_ms_warning,
7057                 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7058                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7059     }
7060     num_teams = __kmp_teams_max_nth;
7061   }
7062   // Set number of teams (number of threads in the outer "parallel" of the
7063   // teams)
7064   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7065 
7066   // Remember the number of threads for inner parallel regions
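  // For the default case below (num_threads == 0), an illustrative example:
  // with 16 available procs and num_teams == 4, each team defaults to
  // 16 / 4 = 4 threads, clipped further if num_teams * num_threads would
  // exceed __kmp_teams_max_nth.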
7067   if (num_threads == 0) {
7068     if (!TCR_4(__kmp_init_middle))
7069       __kmp_middle_initialize(); // get __kmp_avail_proc calculated
7070     num_threads = __kmp_avail_proc / num_teams;
7071     if (num_teams * num_threads > __kmp_teams_max_nth) {
7072       // adjust num_threads w/o warning as it is not user setting
7073       num_threads = __kmp_teams_max_nth / num_teams;
7074     }
7075   } else {
7076     if (num_teams * num_threads > __kmp_teams_max_nth) {
7077       int new_threads = __kmp_teams_max_nth / num_teams;
      // The user asked for more threads than KMP_TEAMS_THREAD_LIMIT allows.
      if (!__kmp_reserve_warn) {
        __kmp_reserve_warn = 1;
7080         __kmp_msg(kmp_ms_warning,
7081                   KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7082                   KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7083       }
7084       num_threads = new_threads;
7085     }
7086   }
7087   thr->th.th_teams_size.nth = num_threads;
7088 }
7089 
7090 // Set the proc_bind var to use in the following parallel region.
7091 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7092   kmp_info_t *thr = __kmp_threads[gtid];
7093   thr->th.th_set_proc_bind = proc_bind;
7094 }
7095 
7096 #endif /* OMP_40_ENABLED */
7097 
7098 /* Launch the worker threads into the microtask. */
7099 
7100 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7101   kmp_info_t *this_thr = __kmp_threads[gtid];
7102 
7103 #ifdef KMP_DEBUG
7104   int f;
7105 #endif /* KMP_DEBUG */
7106 
7107   KMP_DEBUG_ASSERT(team);
7108   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7109   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7110   KMP_MB(); /* Flush all pending memory write invalidates.  */
7111 
7112   team->t.t_construct = 0; /* no single directives seen yet */
7113   team->t.t_ordered.dt.t_value =
7114       0; /* thread 0 enters the ordered section first */
7115 
7116   /* Reset the identifiers on the dispatch buffer */
7117   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7118   if (team->t.t_max_nproc > 1) {
7119     int i;
7120     for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7121       team->t.t_disp_buffer[i].buffer_index = i;
7122 #if OMP_45_ENABLED
7123       team->t.t_disp_buffer[i].doacross_buf_idx = i;
7124 #endif
7125     }
7126   } else {
7127     team->t.t_disp_buffer[0].buffer_index = 0;
7128 #if OMP_45_ENABLED
7129     team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7130 #endif
7131   }
7132 
7133   KMP_MB(); /* Flush all pending memory write invalidates.  */
7134   KMP_ASSERT(this_thr->th.th_team == team);
7135 
7136 #ifdef KMP_DEBUG
7137   for (f = 0; f < team->t.t_nproc; f++) {
7138     KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7139                      team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7140   }
7141 #endif /* KMP_DEBUG */
7142 
7143   /* release the worker threads so they may begin working */
7144   __kmp_fork_barrier(gtid, 0);
7145 }
7146 
7147 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7148   kmp_info_t *this_thr = __kmp_threads[gtid];
7149 
7150   KMP_DEBUG_ASSERT(team);
7151   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7152   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7153   KMP_MB(); /* Flush all pending memory write invalidates.  */
7154 
7155 /* Join barrier after fork */
7156 
7157 #ifdef KMP_DEBUG
7158   if (__kmp_threads[gtid] &&
7159       __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7160     __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7161                  __kmp_threads[gtid]);
7162     __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7163                  "team->t.t_nproc=%d\n",
7164                  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7165                  team->t.t_nproc);
7166     __kmp_print_structure();
7167   }
7168   KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7169                    __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7170 #endif /* KMP_DEBUG */
7171 
7172   __kmp_join_barrier(gtid); /* wait for everyone */
7173 
7174   KMP_MB(); /* Flush all pending memory write invalidates.  */
7175   KMP_ASSERT(this_thr->th.th_team == team);
7176 }
7177 
7178 /* ------------------------------------------------------------------------ */
7179 
7180 #ifdef USE_LOAD_BALANCE
7181 
// Return the number of worker threads actively spinning in the hot team, if
// we are at the outermost level of parallelism. Otherwise, return 0.
7184 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7185   int i;
7186   int retval;
7187   kmp_team_t *hot_team;
7188 
7189   if (root->r.r_active) {
7190     return 0;
7191   }
7192   hot_team = root->r.r_hot_team;
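  // An "infinite" blocktime (KMP_MAX_BLOCKTIME) means the workers never drop
  // out of their spin loops, so every worker in the hot team counts as active.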
7193   if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7194     return hot_team->t.t_nproc - 1; // Don't count master thread
7195   }
7196 
7197   // Skip the master thread - it is accounted for elsewhere.
7198   retval = 0;
7199   for (i = 1; i < hot_team->t.t_nproc; i++) {
7200     if (hot_team->t.t_threads[i]->th.th_active) {
7201       retval++;
7202     }
7203   }
7204   return retval;
7205 }
7206 
7207 // Perform an automatic adjustment to the number of
7208 // threads used by the next parallel region.
7209 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7210   int retval;
7211   int pool_active;
7212   int hot_team_active;
7213   int team_curr_active;
7214   int system_active;
7215 
7216   KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7217                 set_nproc));
7218   KMP_DEBUG_ASSERT(root);
7219   KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7220                        ->th.th_current_task->td_icvs.dynamic == TRUE);
7221   KMP_DEBUG_ASSERT(set_nproc > 1);
7222 
7223   if (set_nproc == 1) {
7224     KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7225     return 1;
7226   }
7227 
  // Threads that are active in the thread pool, active in the hot team for
  // this particular root (if we are at the outermost parallel level), and the
  // currently executing thread (which will become the master) are available
  // to add to the new team, but they currently contribute to the system load
  // and must be accounted for.
7233   pool_active = TCR_4(__kmp_thread_pool_active_nth);
7234   hot_team_active = __kmp_active_hot_team_nproc(root);
7235   team_curr_active = pool_active + hot_team_active + 1;
7236 
7237   // Check the system load.
7238   system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7239   KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7240                 "hot team active = %d\n",
7241                 system_active, pool_active, hot_team_active));
7242 
7243   if (system_active < 0) {
7244     // There was an error reading the necessary info from /proc, so use the
7245     // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7246     // = dynamic_thread_limit, we shouldn't wind up getting back here.
7247     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7248     KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7249 
7250     // Make this call behave like the thread limit algorithm.
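    // Free procs = available procs minus all registered threads, plus the
    // threads this root can reuse (just itself if it is active, otherwise its
    // whole hot team).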
7251     retval = __kmp_avail_proc - __kmp_nth +
7252              (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7253     if (retval > set_nproc) {
7254       retval = set_nproc;
7255     }
7256     if (retval < KMP_MIN_NTH) {
7257       retval = KMP_MIN_NTH;
7258     }
7259 
7260     KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7261                   retval));
7262     return retval;
7263   }
7264 
  // There is a slight delay in the load balance algorithm in detecting new
  // running procs. The real system load at this instant should be at least as
  // large as the number of active OMP threads that are available to add to
  // the team.
7268   if (system_active < team_curr_active) {
7269     system_active = team_curr_active;
7270   }
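  // Illustrative example: with 8 available procs, a measured system load of
  // 10, and team_curr_active == 3 (pool + hot team + this thread), the
  // formula below yields 8 - 10 + 3 = 1 thread before the clamps are applied.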
7271   retval = __kmp_avail_proc - system_active + team_curr_active;
7272   if (retval > set_nproc) {
7273     retval = set_nproc;
7274   }
7275   if (retval < KMP_MIN_NTH) {
7276     retval = KMP_MIN_NTH;
7277   }
7278 
7279   KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7280   return retval;
7281 } // __kmp_load_balance_nproc()
7282 
7283 #endif /* USE_LOAD_BALANCE */
7284 
7285 /* ------------------------------------------------------------------------ */
7286 
7287 /* NOTE: this is called with the __kmp_init_lock held */
7288 void __kmp_cleanup(void) {
7289   int f;
7290 
7291   KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7292 
7293   if (TCR_4(__kmp_init_parallel)) {
7294 #if KMP_HANDLE_SIGNALS
7295     __kmp_remove_signals();
7296 #endif
7297     TCW_4(__kmp_init_parallel, FALSE);
7298   }
7299 
7300   if (TCR_4(__kmp_init_middle)) {
7301 #if KMP_AFFINITY_SUPPORTED
7302     __kmp_affinity_uninitialize();
7303 #endif /* KMP_AFFINITY_SUPPORTED */
7304     __kmp_cleanup_hierarchy();
7305     TCW_4(__kmp_init_middle, FALSE);
7306   }
7307 
7308   KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7309 
7310   if (__kmp_init_serial) {
7311     __kmp_runtime_destroy();
7312     __kmp_init_serial = FALSE;
7313   }
7314 
7315   for (f = 0; f < __kmp_threads_capacity; f++) {
7316     if (__kmp_root[f] != NULL) {
7317       __kmp_free(__kmp_root[f]);
7318       __kmp_root[f] = NULL;
7319     }
7320   }
7321   __kmp_free(__kmp_threads);
  // __kmp_threads and __kmp_root were allocated at once, as a single block,
  // so there is no need to free __kmp_root separately.
7324   __kmp_threads = NULL;
7325   __kmp_root = NULL;
7326   __kmp_threads_capacity = 0;
7327 
7328 #if KMP_USE_DYNAMIC_LOCK
7329   __kmp_cleanup_indirect_user_locks();
7330 #else
7331   __kmp_cleanup_user_locks();
7332 #endif
7333 
7334 #if KMP_AFFINITY_SUPPORTED
7335   KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7336   __kmp_cpuinfo_file = NULL;
7337 #endif /* KMP_AFFINITY_SUPPORTED */
7338 
7339 #if KMP_USE_ADAPTIVE_LOCKS
7340 #if KMP_DEBUG_ADAPTIVE_LOCKS
7341   __kmp_print_speculative_stats();
7342 #endif
7343 #endif
7344   KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7345   __kmp_nested_nth.nth = NULL;
7346   __kmp_nested_nth.size = 0;
7347   __kmp_nested_nth.used = 0;
7348   KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7349   __kmp_nested_proc_bind.bind_types = NULL;
7350   __kmp_nested_proc_bind.size = 0;
7351   __kmp_nested_proc_bind.used = 0;
7352 
7353   __kmp_i18n_catclose();
7354 
7355 #if KMP_STATS_ENABLED
7356   __kmp_stats_fini();
7357 #endif
7358 
7359   KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7360 }
7361 
7362 /* ------------------------------------------------------------------------ */
7363 
7364 int __kmp_ignore_mppbeg(void) {
7365   char *env;
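  // Setting KMP_IGNORE_MPPBEG to a "false"-style value (e.g. "0", "off",
  // "false") makes __kmpc_begin() do real work; any other value, or leaving
  // it unset, keeps the default behavior below.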
7366 
7367   if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7368     if (__kmp_str_match_false(env))
7369       return FALSE;
7370   }
  // By default __kmpc_begin() is a no-op.
7372   return TRUE;
7373 }
7374 
7375 int __kmp_ignore_mppend(void) {
7376   char *env;
7377 
7378   if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7379     if (__kmp_str_match_false(env))
7380       return FALSE;
7381   }
  // By default __kmpc_end() is a no-op.
7383   return TRUE;
7384 }
7385 
7386 void __kmp_internal_begin(void) {
7387   int gtid;
7388   kmp_root_t *root;
7389 
  /* This is a very important step, as it registers new sibling threads and
     assigns these new uber threads a new gtid */
7392   gtid = __kmp_entry_gtid();
7393   root = __kmp_threads[gtid]->th.th_root;
7394   KMP_ASSERT(KMP_UBER_GTID(gtid));
7395 
7396   if (root->r.r_begin)
7397     return;
7398   __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
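  // Re-check under the lock in case another thread completed the begin
  // sequence while we were waiting (double-checked locking).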
7399   if (root->r.r_begin) {
7400     __kmp_release_lock(&root->r.r_begin_lock, gtid);
7401     return;
7402   }
7403 
7404   root->r.r_begin = TRUE;
7405 
7406   __kmp_release_lock(&root->r.r_begin_lock, gtid);
7407 }
7408 
7409 /* ------------------------------------------------------------------------ */
7410 
7411 void __kmp_user_set_library(enum library_type arg) {
7412   int gtid;
7413   kmp_root_t *root;
7414   kmp_info_t *thread;
7415 
7416   /* first, make sure we are initialized so we can get our gtid */
7417 
7418   gtid = __kmp_entry_gtid();
7419   thread = __kmp_threads[gtid];
7420 
7421   root = thread->th.th_root;
7422 
7423   KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7424                 library_serial));
7425   if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7426                                   thread */
7427     KMP_WARNING(SetLibraryIncorrectCall);
7428     return;
7429   }
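  // Note: library_turnaround and library_throughput request the same default
  // team size here; the behavioral difference between them is applied by
  // __kmp_aux_set_library() at the end of this routine.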
7430 
7431   switch (arg) {
7432   case library_serial:
7433     thread->th.th_set_nproc = 0;
7434     set__nproc(thread, 1);
7435     break;
7436   case library_turnaround:
7437     thread->th.th_set_nproc = 0;
7438     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7439                                            : __kmp_dflt_team_nth_ub);
7440     break;
7441   case library_throughput:
7442     thread->th.th_set_nproc = 0;
7443     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7444                                            : __kmp_dflt_team_nth_ub);
7445     break;
7446   default:
7447     KMP_FATAL(UnknownLibraryType, arg);
7448   }
7449 
7450   __kmp_aux_set_library(arg);
7451 }
7452 
7453 void __kmp_aux_set_stacksize(size_t arg) {
7454   if (!__kmp_init_serial)
7455     __kmp_serial_initialize();
7456 
7457 #if KMP_OS_DARWIN
7458   if (arg & (0x1000 - 1)) {
7459     arg &= ~(0x1000 - 1);
7460     if (arg + 0x1000) /* check for overflow if we round up */
7461       arg += 0x1000;
7462   }
7463 #endif
7464   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7465 
7466   /* only change the default stacksize before the first parallel region */
7467   if (!TCR_4(__kmp_init_parallel)) {
7468     size_t value = arg; /* argument is in bytes */
7469 
7470     if (value < __kmp_sys_min_stksize)
7471       value = __kmp_sys_min_stksize;
7472     else if (value > KMP_MAX_STKSIZE)
7473       value = KMP_MAX_STKSIZE;
7474 
7475     __kmp_stksize = value;
7476 
7477     __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
7478   }
7479 
7480   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7481 }
7482 
7483 /* set the behaviour of the runtime library */
7484 /* TODO this can cause some odd behaviour with sibling parallelism... */
7485 void __kmp_aux_set_library(enum library_type arg) {
7486   __kmp_library = arg;
7487 
7488   switch (__kmp_library) {
7489   case library_serial: {
7490     KMP_INFORM(LibraryIsSerial);
7491     (void)__kmp_change_library(TRUE);
7492   } break;
7493   case library_turnaround:
7494     (void)__kmp_change_library(TRUE);
7495     break;
7496   case library_throughput:
7497     (void)__kmp_change_library(FALSE);
7498     break;
7499   default:
7500     KMP_FATAL(UnknownLibraryType, arg);
7501   }
7502 }
7503 
7504 /* ------------------------------------------------------------------------ */
7505 
7506 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
7507   int blocktime = arg; /* argument is in milliseconds */
7508 #if KMP_USE_MONITOR
7509   int bt_intervals;
7510 #endif
7511   int bt_set;
7512 
7513   __kmp_save_internal_controls(thread);
7514 
7515   /* Normalize and set blocktime for the teams */
7516   if (blocktime < KMP_MIN_BLOCKTIME)
7517     blocktime = KMP_MIN_BLOCKTIME;
7518   else if (blocktime > KMP_MAX_BLOCKTIME)
7519     blocktime = KMP_MAX_BLOCKTIME;
7520 
7521   set__blocktime_team(thread->th.th_team, tid, blocktime);
7522   set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
7523 
7524 #if KMP_USE_MONITOR
7525   /* Calculate and set blocktime intervals for the teams */
7526   bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
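  // bt_intervals expresses the blocktime as a count of monitor wakeup periods
  // (illustrative: a 200 ms blocktime with a 10 ms wakeup period gives
  // roughly 20 intervals).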
7527 
7528   set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
7529   set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
7530 #endif
7531 
  /* Record that the blocktime has been explicitly set (bt_set = TRUE) */
7533   bt_set = TRUE;
7534 
7535   set__bt_set_team(thread->th.th_team, tid, bt_set);
7536   set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
7537 #if KMP_USE_MONITOR
7538   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
7539                 "bt_intervals=%d, monitor_updates=%d\n",
7540                 __kmp_gtid_from_tid(tid, thread->th.th_team),
7541                 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
7542                 __kmp_monitor_wakeups));
7543 #else
7544   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
7545                 __kmp_gtid_from_tid(tid, thread->th.th_team),
7546                 thread->th.th_team->t.t_id, tid, blocktime));
7547 #endif
7548 }
7549 
7550 void __kmp_aux_set_defaults(char const *str, int len) {
7551   if (!__kmp_init_serial) {
7552     __kmp_serial_initialize();
7553   }
7554   __kmp_env_initialize(str);
7555 
7556   if (__kmp_settings
7557 #if OMP_40_ENABLED
7558       || __kmp_display_env || __kmp_display_env_verbose
7559 #endif // OMP_40_ENABLED
7560       ) {
7561     __kmp_env_print();
7562   }
7563 } // __kmp_aux_set_defaults
7564 
7565 /* ------------------------------------------------------------------------ */
7566 /* internal fast reduction routines */
7567 
7568 PACKED_REDUCTION_METHOD_T
7569 __kmp_determine_reduction_method(
7570     ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
7571     void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
7572     kmp_critical_name *lck) {
7573 
  // Default reduction method: critical construct (lck != NULL, like in current
  // PAROPT).
  // If (reduce_data != NULL && reduce_func != NULL): the tree-reduction method
  // can be selected by the RTL.
  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
  // can be selected by the RTL.
  // Finally, it is up to the OpenMP RTL to decide which method to select among
  // those generated by PAROPT.
7582 
7583   PACKED_REDUCTION_METHOD_T retval;
7584 
7585   int team_size;
7586 
7587   KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
7588   KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
7589 
7590 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
7591   ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
7592 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
7593 
7594   retval = critical_reduce_block;
7595 
  // another way of getting the team size (with one dynamic dereference) is slower
7597   team_size = __kmp_get_team_num_threads(global_tid);
7598   if (team_size == 1) {
7599 
7600     retval = empty_reduce_block;
7601 
7602   } else {
7603 
7604     int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
7605     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
7606 
7607 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64
7608 
7609 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_WINDOWS ||       \
7610     KMP_OS_DARWIN
7611 
7612     int teamsize_cutoff = 4;
7613 
7614 #if KMP_MIC_SUPPORTED
7615     if (__kmp_mic_type != non_mic) {
7616       teamsize_cutoff = 8;
7617     }
7618 #endif
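    // teamsize_cutoff chooses between the atomic method (small teams) and the
    // tree method with a reduction barrier (larger teams), provided the
    // compiler generated the corresponding code.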
7619     if (tree_available) {
7620       if (team_size <= teamsize_cutoff) {
7621         if (atomic_available) {
7622           retval = atomic_reduce_block;
7623         }
7624       } else {
7625         retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
7626       }
7627     } else if (atomic_available) {
7628       retval = atomic_reduce_block;
7629     }
7630 #else
7631 #error "Unknown or unsupported OS"
7632 #endif // KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_WINDOWS ||
7633 // KMP_OS_DARWIN
7634 
7635 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
7636 
7637 #if KMP_OS_LINUX || KMP_OS_WINDOWS
7638 
7639     // basic tuning
7640 
7641     if (atomic_available) {
7642       if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
7643         retval = atomic_reduce_block;
7644       }
7645     } // otherwise: use critical section
7646 
7647 #elif KMP_OS_DARWIN
7648 
7649     if (atomic_available && (num_vars <= 3)) {
7650       retval = atomic_reduce_block;
7651     } else if (tree_available) {
7652       if ((reduce_size > (9 * sizeof(kmp_real64))) &&
7653           (reduce_size < (2000 * sizeof(kmp_real64)))) {
7654         retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
7655       }
7656     } // otherwise: use critical section
7657 
7658 #else
7659 #error "Unknown or unsupported OS"
7660 #endif
7661 
7662 #else
7663 #error "Unknown or unsupported architecture"
7664 #endif
7665   }
7666 
7667   // KMP_FORCE_REDUCTION
7668 
7669   // If the team is serialized (team_size == 1), ignore the forced reduction
7670   // method and stay with the unsynchronized method (empty_reduce_block)
7671   if (__kmp_force_reduction_method != reduction_method_not_defined &&
7672       team_size != 1) {
7673 
7674     PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
7675 
7676     int atomic_available, tree_available;
7677 
7678     switch ((forced_retval = __kmp_force_reduction_method)) {
7679     case critical_reduce_block:
7680       KMP_ASSERT(lck); // lck should be != 0
7681       break;
7682 
7683     case atomic_reduce_block:
7684       atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
7685       if (!atomic_available) {
7686         KMP_WARNING(RedMethodNotSupported, "atomic");
7687         forced_retval = critical_reduce_block;
7688       }
7689       break;
7690 
7691     case tree_reduce_block:
7692       tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
7693       if (!tree_available) {
7694         KMP_WARNING(RedMethodNotSupported, "tree");
7695         forced_retval = critical_reduce_block;
7696       } else {
7697 #if KMP_FAST_REDUCTION_BARRIER
7698         forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
7699 #endif
7700       }
7701       break;
7702 
7703     default:
7704       KMP_ASSERT(0); // "unsupported method specified"
7705     }
7706 
7707     retval = forced_retval;
7708   }
7709 
7710   KA_TRACE(10, ("reduction method selected=%08x\n", retval));
7711 
7712 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
7713 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
7714 
7715   return (retval);
7716 }
7717 
// This function is for testing the set/get/determine reduce method machinery.
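// The reduction method is packed into the upper bits of
// packed_reduction_method (the low 8 bits carry barrier selection), hence the
// shift below.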
7719 kmp_int32 __kmp_get_reduce_method(void) {
7720   return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
7721 }
7722