1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 //                     The LLVM Compiler Infrastructure
8 //
9 // This file is dual licensed under the MIT and the University of Illinois Open
10 // Source Licenses. See LICENSE.txt for details.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "kmp.h"
15 #include "kmp_affinity.h"
16 #include "kmp_atomic.h"
17 #include "kmp_environment.h"
18 #include "kmp_error.h"
19 #include "kmp_i18n.h"
20 #include "kmp_io.h"
21 #include "kmp_itt.h"
22 #include "kmp_settings.h"
23 #include "kmp_stats.h"
24 #include "kmp_str.h"
25 #include "kmp_wait_release.h"
26 #include "kmp_wrapper_getpid.h"
27 #include "kmp_dispatch.h"
28 #if KMP_USE_HIER_SCHED
29 #include "kmp_dispatch_hier.h"
30 #endif
31 
32 #if OMPT_SUPPORT
33 #include "ompt-specific.h"
34 #endif
35 
36 /* these are temporary issues to be dealt with */
37 #define KMP_USE_PRCTL 0
38 
39 #if KMP_OS_WINDOWS
40 #include <process.h>
41 #endif
42 
43 #include "tsan_annotations.h"
44 
45 #if defined(KMP_GOMP_COMPAT)
46 char const __kmp_version_alt_comp[] =
47     KMP_VERSION_PREFIX "alternative compiler support: yes";
48 #endif /* defined(KMP_GOMP_COMPAT) */
49 
50 char const __kmp_version_omp_api[] = KMP_VERSION_PREFIX "API version: "
51 #if OMP_50_ENABLED
52                                                         "5.0 (201611)";
53 #elif OMP_45_ENABLED
54                                                         "4.5 (201511)";
55 #elif OMP_40_ENABLED
56                                                         "4.0 (201307)";
57 #else
58                                                         "3.1 (201107)";
59 #endif
60 
61 #ifdef KMP_DEBUG
62 char const __kmp_version_lock[] =
63     KMP_VERSION_PREFIX "lock type: run time selectable";
64 #endif /* KMP_DEBUG */
65 
66 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
67 
68 /* ------------------------------------------------------------------------ */
69 
70 #if KMP_USE_MONITOR
71 kmp_info_t __kmp_monitor;
72 #endif
73 
74 /* Forward declarations */
75 
76 void __kmp_cleanup(void);
77 
78 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
79                                   int gtid);
80 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
81                                   kmp_internal_control_t *new_icvs,
82                                   ident_t *loc);
83 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
84 static void __kmp_partition_places(kmp_team_t *team,
85                                    int update_master_only = 0);
86 #endif
87 static void __kmp_do_serial_initialize(void);
88 void __kmp_fork_barrier(int gtid, int tid);
89 void __kmp_join_barrier(int gtid);
90 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
91                           kmp_internal_control_t *new_icvs, ident_t *loc);
92 
93 #ifdef USE_LOAD_BALANCE
94 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
95 #endif
96 
97 static int __kmp_expand_threads(int nNeed);
98 #if KMP_OS_WINDOWS
99 static int __kmp_unregister_root_other_thread(int gtid);
100 #endif
101 static void __kmp_unregister_library(void); // called by __kmp_internal_end()
102 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
103 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
104 
105 /* Calculate the identifier of the current thread. */
106 /* A fast (and somewhat portable) way to get the unique identifier of the
107    executing thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
108 int __kmp_get_global_thread_id() {
109   int i;
110   kmp_info_t **other_threads;
111   size_t stack_data;
112   char *stack_addr;
113   size_t stack_size;
114   char *stack_base;
115 
116   KA_TRACE(
117       1000,
118       ("*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
119        __kmp_nth, __kmp_all_nth));
120 
121   /* JPH - to handle the case where __kmpc_end(0) is called immediately before
122      a parallel region, this was made to return KMP_GTID_DNE to force the caller
123      to run serial_initialize. Every call site must handle KMP_GTID_DNE, or else
124      __kmp_init_gtid must be guaranteed, for this to work. */
125 
126   if (!TCR_4(__kmp_init_gtid))
127     return KMP_GTID_DNE;
128 
129 #ifdef KMP_TDATA_GTID
130   if (TCR_4(__kmp_gtid_mode) >= 3) {
131     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
132     return __kmp_gtid;
133   }
134 #endif
135   if (TCR_4(__kmp_gtid_mode) >= 2) {
136     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
137     return __kmp_gtid_get_specific();
138   }
139   KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
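  // The internal algorithm identifies the caller by taking the address of a
  // local variable (stack_data) and scanning __kmp_threads[] for the thread
  // whose registered stack range [stackbase - stacksize, stackbase] contains
  // that address.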
140 
141   stack_addr = (char *)&stack_data;
142   other_threads = __kmp_threads;
143 
144   /* ATT: The code below is a source of potential bugs due to unsynchronized
145      access to __kmp_threads array. For example:
146      1. Current thread loads other_threads[i] to thr and checks it, it is
147         non-NULL.
148      2. Current thread is suspended by OS.
149      3. Another thread unregisters and finishes (debug versions of free()
150         may fill memory with something like 0xEF).
151      4. Current thread is resumed.
152      5. Current thread reads junk from *thr.
153      TODO: Fix it.  --ln  */
154 
155   for (i = 0; i < __kmp_threads_capacity; i++) {
156 
157     kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
158     if (!thr)
159       continue;
160 
161     stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
162     stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
163 
164     /* stack grows down -- search through all of the active threads */
165 
166     if (stack_addr <= stack_base) {
167       size_t stack_diff = stack_base - stack_addr;
168 
169       if (stack_diff <= stack_size) {
170         /* The only way we can be closer than the allocated */
171         /* stack size is if we are running on this thread. */
172         KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
173         return i;
174       }
175     }
176   }
177 
178   /* get specific to try and determine our gtid */
179   KA_TRACE(1000,
180            ("*** __kmp_get_global_thread_id: internal alg. failed to find "
181             "thread, using TLS\n"));
182   i = __kmp_gtid_get_specific();
183 
184   /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */
185 
186   /* if we haven't been assigned a gtid, return the (negative) value */
187   if (i < 0)
188     return i;
189 
190   /* dynamically updated stack window for uber threads to avoid get_specific
191      call */
192   if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
193     KMP_FATAL(StackOverflow, i);
194   }
195 
196   stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
197   if (stack_addr > stack_base) {
198     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
199     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
200             other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
201                 stack_base);
202   } else {
203     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
204             stack_base - stack_addr);
205   }
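  // The recorded stack window for this uber thread has now been widened to
  // include stack_addr, so future lookups by the stack-search algorithm above
  // can find this thread without a get_specific call.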
206 
207   /* Reprint stack bounds for ubermaster since they have been refined */
208   if (__kmp_storage_map) {
209     char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
210     char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
211     __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
212                                  other_threads[i]->th.th_info.ds.ds_stacksize,
213                                  "th_%d stack (refinement)", i);
214   }
215   return i;
216 }
217 
218 int __kmp_get_global_thread_id_reg() {
219   int gtid;
220 
221   if (!__kmp_init_serial) {
222     gtid = KMP_GTID_DNE;
223   } else
224 #ifdef KMP_TDATA_GTID
225       if (TCR_4(__kmp_gtid_mode) >= 3) {
226     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
227     gtid = __kmp_gtid;
228   } else
229 #endif
230       if (TCR_4(__kmp_gtid_mode) >= 2) {
231     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
232     gtid = __kmp_gtid_get_specific();
233   } else {
234     KA_TRACE(1000,
235              ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
236     gtid = __kmp_get_global_thread_id();
237   }
238 
239   /* we must be a new uber master sibling thread */
240   if (gtid == KMP_GTID_DNE) {
241     KA_TRACE(10,
242              ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
243               "Registering a new gtid.\n"));
244     __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
245     if (!__kmp_init_serial) {
246       __kmp_do_serial_initialize();
247       gtid = __kmp_gtid_get_specific();
248     } else {
249       gtid = __kmp_register_root(FALSE);
250     }
251     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
252     /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
253   }
254 
255   KMP_DEBUG_ASSERT(gtid >= 0);
256 
257   return gtid;
258 }
259 
260 /* caller must hold forkjoin_lock */
261 void __kmp_check_stack_overlap(kmp_info_t *th) {
262   int f;
263   char *stack_beg = NULL;
264   char *stack_end = NULL;
265   int gtid;
266 
267   KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
268   if (__kmp_storage_map) {
269     stack_end = (char *)th->th.th_info.ds.ds_stackbase;
270     stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
271 
272     gtid = __kmp_gtid_from_thread(th);
273 
274     if (gtid == KMP_GTID_MONITOR) {
275       __kmp_print_storage_map_gtid(
276           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
277           "th_%s stack (%s)", "mon",
278           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
279     } else {
280       __kmp_print_storage_map_gtid(
281           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
282           "th_%d stack (%s)", gtid,
283           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
284     }
285   }
286 
287   /* No point in checking ubermaster threads since they use refinement and
288    * cannot overlap */
289   gtid = __kmp_gtid_from_thread(th);
290   if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
291     KA_TRACE(10,
292              ("__kmp_check_stack_overlap: performing extensive checking\n"));
293     if (stack_beg == NULL) {
294       stack_end = (char *)th->th.th_info.ds.ds_stackbase;
295       stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
296     }
297 
298     for (f = 0; f < __kmp_threads_capacity; f++) {
299       kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
300 
301       if (f_th && f_th != th) {
302         char *other_stack_end =
303             (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
304         char *other_stack_beg =
305             other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
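        // Flag an error if either end of this thread's stack lies strictly
        // inside another thread's recorded stack range.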
306         if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
307             (stack_end > other_stack_beg && stack_end < other_stack_end)) {
308 
309           /* Print the other stack values before the abort */
310           if (__kmp_storage_map)
311             __kmp_print_storage_map_gtid(
312                 -1, other_stack_beg, other_stack_end,
313                 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
314                 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
315 
316           __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
317                       __kmp_msg_null);
318         }
319       }
320     }
321   }
322   KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
323 }
324 
325 /* ------------------------------------------------------------------------ */
326 
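/* Spin forever (yielding) without returning; used by the abort paths below
   (__kmp_abort_process, __kmp_abort_thread) to park the calling thread so it
   never returns to user code. */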
327 void __kmp_infinite_loop(void) {
328   static int done = FALSE;
329 
330   while (!done) {
331     KMP_YIELD(1);
332   }
333 }
334 
335 #define MAX_MESSAGE 512
336 
337 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
338                                   char const *format, ...) {
339   char buffer[MAX_MESSAGE];
340   va_list ap;
341 
342   va_start(ap, format);
343   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
344                p2, (unsigned long)size, format);
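  // Note: the caller's format string is spliced into 'buffer', which then
  // serves as the format for __kmp_vprintf together with the caller's varargs.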
345   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
346   __kmp_vprintf(kmp_err, buffer, ap);
347 #if KMP_PRINT_DATA_PLACEMENT
348   int node;
349   if (gtid >= 0) {
350     if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
351       if (__kmp_storage_map_verbose) {
352         node = __kmp_get_host_node(p1);
353         if (node < 0) /* doesn't work, so don't try this next time */
354           __kmp_storage_map_verbose = FALSE;
355         else {
356           char *last;
357           int lastNode;
358           int localProc = __kmp_get_cpu_from_gtid(gtid);
359 
360           const int page_size = KMP_GET_PAGE_SIZE();
361 
362           p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
363           p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
364           if (localProc >= 0)
365             __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
366                                  localProc >> 1);
367           else
368             __kmp_printf_no_lock("  GTID %d\n", gtid);
369 #if KMP_USE_PRCTL
370           /* The more elaborate format is disabled for now because of the prctl
371            * hanging bug. */
372           do {
373             last = p1;
374             lastNode = node;
375             /* This loop collates adjacent pages with the same host node. */
376             do {
377               p1 = (char *)p1 + page_size;
378             } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
379             __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
380                                  lastNode);
381           } while (p1 <= p2);
382 #else
383           __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
384                                (char *)p1 + (page_size - 1),
385                                __kmp_get_host_node(p1));
386           if (p1 < p2) {
387             __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
388                                  (char *)p2 + (page_size - 1),
389                                  __kmp_get_host_node(p2));
390           }
391 #endif
392         }
393       }
394     } else
395       __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
396   }
397 #endif /* KMP_PRINT_DATA_PLACEMENT */
398   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
399 }
400 
401 void __kmp_warn(char const *format, ...) {
402   char buffer[MAX_MESSAGE];
403   va_list ap;
404 
405   if (__kmp_generate_warnings == kmp_warnings_off) {
406     return;
407   }
408 
409   va_start(ap, format);
410 
411   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
412   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
413   __kmp_vprintf(kmp_err, buffer, ap);
414   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
415 
416   va_end(ap);
417 }
418 
419 void __kmp_abort_process() {
420   // Later threads may stall here, but that's ok because abort() will kill them.
421   __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
422 
423   if (__kmp_debug_buf) {
424     __kmp_dump_debug_buffer();
425   }
426 
427   if (KMP_OS_WINDOWS) {
428     // Let other threads know of abnormal termination and prevent deadlock
429     // if abort happened during library initialization or shutdown
430     __kmp_global.g.g_abort = SIGABRT;
431 
432     /* On Windows* OS, abort() by default raises a pop-up error box, which
433        stalls nightly testing. Unfortunately, we cannot reliably suppress the
434        pop-up error boxes. _set_abort_behavior() works well, but this function
435        is not available in VS7 (this is not a problem for the DLL, but it is a
436        problem for the static OpenMP RTL). SetErrorMode (and so, the timelimit
437        utility) does not help, at least in some versions of MS C RTL.
438 
439        It seems the following sequence is the only way to simulate abort() and
440        avoid the pop-up error box. */
441     raise(SIGABRT);
442     _exit(3); // Just in case, if signal ignored, exit anyway.
443   } else {
444     abort();
445   }
446 
447   __kmp_infinite_loop();
448   __kmp_release_bootstrap_lock(&__kmp_exit_lock);
449 
450 } // __kmp_abort_process
451 
452 void __kmp_abort_thread(void) {
453   // TODO: Eliminate g_abort global variable and this function.
454   // In case of abort just call abort(), it will kill all the threads.
455   __kmp_infinite_loop();
456 } // __kmp_abort_thread
457 
458 /* Print out the storage map for the major kmp_info_t thread data structures
459    that are allocated together. */
460 
461 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
462   __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
463                                gtid);
464 
465   __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
466                                sizeof(kmp_desc_t), "th_%d.th_info", gtid);
467 
468   __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
469                                sizeof(kmp_local_t), "th_%d.th_local", gtid);
470 
471   __kmp_print_storage_map_gtid(
472       gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
473       sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
474 
475   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
476                                &thr->th.th_bar[bs_plain_barrier + 1],
477                                sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
478                                gtid);
479 
480   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
481                                &thr->th.th_bar[bs_forkjoin_barrier + 1],
482                                sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
483                                gtid);
484 
485 #if KMP_FAST_REDUCTION_BARRIER
486   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
487                                &thr->th.th_bar[bs_reduction_barrier + 1],
488                                sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
489                                gtid);
490 #endif // KMP_FAST_REDUCTION_BARRIER
491 }
492 
493 /* Print out the storage map for the major kmp_team_t team data structures
494    that are allocated together. */
495 
496 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
497                                          int team_id, int num_thr) {
498   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
499   __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
500                                header, team_id);
501 
502   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
503                                &team->t.t_bar[bs_last_barrier],
504                                sizeof(kmp_balign_team_t) * bs_last_barrier,
505                                "%s_%d.t_bar", header, team_id);
506 
507   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
508                                &team->t.t_bar[bs_plain_barrier + 1],
509                                sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
510                                header, team_id);
511 
512   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
513                                &team->t.t_bar[bs_forkjoin_barrier + 1],
514                                sizeof(kmp_balign_team_t),
515                                "%s_%d.t_bar[forkjoin]", header, team_id);
516 
517 #if KMP_FAST_REDUCTION_BARRIER
518   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
519                                &team->t.t_bar[bs_reduction_barrier + 1],
520                                sizeof(kmp_balign_team_t),
521                                "%s_%d.t_bar[reduction]", header, team_id);
522 #endif // KMP_FAST_REDUCTION_BARRIER
523 
524   __kmp_print_storage_map_gtid(
525       -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
526       sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
527 
528   __kmp_print_storage_map_gtid(
529       -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
530       sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
531 
532   __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
533                                &team->t.t_disp_buffer[num_disp_buff],
534                                sizeof(dispatch_shared_info_t) * num_disp_buff,
535                                "%s_%d.t_disp_buffer", header, team_id);
536 
537   __kmp_print_storage_map_gtid(-1, &team->t.t_taskq, &team->t.t_copypriv_data,
538                                sizeof(kmp_taskq_t), "%s_%d.t_taskq", header,
539                                team_id);
540 }
541 
542 static void __kmp_init_allocator() {
543 #if OMP_50_ENABLED
544   __kmp_init_memkind();
545 #endif
546 }
547 static void __kmp_fini_allocator() {
548 #if OMP_50_ENABLED
549   __kmp_fini_memkind();
550 #endif
551 }
552 
553 /* ------------------------------------------------------------------------ */
554 
555 #if KMP_DYNAMIC_LIB
556 #if KMP_OS_WINDOWS
557 
558 static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
559   // TODO: Change to __kmp_break_bootstrap_lock().
560   __kmp_init_bootstrap_lock(lck); // make the lock released
561 }
562 
563 static void __kmp_reset_locks_on_process_detach(int gtid_req) {
564   int i;
565   int thread_count;
566 
567   // PROCESS_DETACH is expected to be called by a thread that executes
568   // ProcessExit() or FreeLibrary(). The OS terminates all other threads (except
569   // the one calling ProcessExit or FreeLibrary), so it might be safe to access
570   // __kmp_threads[] without taking the forkjoin_lock. In practice, however,
571   // some threads may still be alive here, even though they are about to be
572   // terminated. The entries in the array with ds_thread==0 are the most
573   // suspicious. So it may not actually be safe to access __kmp_threads[].
574 
575   // TODO: does it make sense to check __kmp_roots[] ?
576 
577   // Let's check that there are no other alive threads registered with the OMP
578   // lib.
579   while (1) {
580     thread_count = 0;
581     for (i = 0; i < __kmp_threads_capacity; ++i) {
582       if (!__kmp_threads)
583         continue;
584       kmp_info_t *th = __kmp_threads[i];
585       if (th == NULL)
586         continue;
587       int gtid = th->th.th_info.ds.ds_gtid;
588       if (gtid == gtid_req)
589         continue;
590       if (gtid < 0)
591         continue;
592       DWORD exit_val;
593       int alive = __kmp_is_thread_alive(th, &exit_val);
594       if (alive) {
595         ++thread_count;
596       }
597     }
598     if (thread_count == 0)
599       break; // success
600   }
601 
602   // Assume that I'm alone. Now it might be safe to check and reset locks.
603   // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
604   __kmp_reset_lock(&__kmp_forkjoin_lock);
605 #ifdef KMP_DEBUG
606   __kmp_reset_lock(&__kmp_stdio_lock);
607 #endif // KMP_DEBUG
608 }
609 
610 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
611   //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
612 
613   switch (fdwReason) {
614 
615   case DLL_PROCESS_ATTACH:
616     KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
617 
618     return TRUE;
619 
620   case DLL_PROCESS_DETACH:
621     KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
622 
623     if (lpReserved != NULL) {
624       // lpReserved is used for telling the difference:
625       //   lpReserved == NULL when FreeLibrary() was called,
626       //   lpReserved != NULL when the process terminates.
627       // When FreeLibrary() is called, worker threads remain alive, so they will
628       // release the forkjoin lock by themselves. When the process terminates,
629       // worker threads disappear, triggering the problem of an unreleased
630       // forkjoin lock as described below.
631 
632       // A worker thread can take the forkjoin lock. The problem comes up if
633       // that worker thread becomes dead before it releases the forkjoin lock.
634       // The forkjoin lock remains taken, while the thread executing
635       // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try
636       // to take the forkjoin lock and will always fail, so that the application
637       // will never finish [normally]. This scenario is possible if
638       // __kmpc_end() has not been executed. This is not just a corner case;
639       // it happens in common situations:
640       // - the main function was compiled by an alternative compiler;
641       // - the main function was compiled by icl but without /Qopenmp
642       //   (application with plugins);
643       // - application terminates by calling C exit(), Fortran CALL EXIT() or
644       //   Fortran STOP.
645       // - alive foreign thread prevented __kmpc_end from doing cleanup.
646       //
647       // This is a hack to work around the problem.
648       // TODO: !!! figure out something better.
649       __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
650     }
651 
652     __kmp_internal_end_library(__kmp_gtid_get_specific());
653 
654     return TRUE;
655 
656   case DLL_THREAD_ATTACH:
657     KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
658 
659     /* if we wanted to register new sibling threads as they attach, we would
660      * call __kmp_get_gtid() here */
661     return TRUE;
662 
663   case DLL_THREAD_DETACH:
664     KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
665 
666     __kmp_internal_end_thread(__kmp_gtid_get_specific());
667     return TRUE;
668   }
669 
670   return TRUE;
671 }
672 
673 #endif /* KMP_OS_WINDOWS */
674 #endif /* KMP_DYNAMIC_LIB */
675 
676 /* Change the library type to "status" and return the old type */
677 /* called from within initialization routines where __kmp_initz_lock is held */
678 int __kmp_change_library(int status) {
679   int old_status;
680 
681   // check whether KMP_LIBRARY=throughput (even init count)
682   old_status = __kmp_yield_init & 1;
683 
684   if (status) {
685     __kmp_yield_init |= 1; // throughput => turnaround (odd init count)
686   } else {
687     __kmp_yield_init &= ~1; // turnaround => throughput (even init count)
688   }
689 
690   return old_status; // return previous setting of whether
691   // KMP_LIBRARY=throughput
692 }
693 
694 /* __kmp_parallel_deo -- Wait until it's our turn. */
695 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
696   int gtid = *gtid_ref;
697 #ifdef BUILD_PARALLEL_ORDERED
698   kmp_team_t *team = __kmp_team_from_gtid(gtid);
699 #endif /* BUILD_PARALLEL_ORDERED */
700 
701   if (__kmp_env_consistency_check) {
702     if (__kmp_threads[gtid]->th.th_root->r.r_active)
703 #if KMP_USE_DYNAMIC_LOCK
704       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
705 #else
706       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
707 #endif
708   }
709 #ifdef BUILD_PARALLEL_ORDERED
710   if (!team->t.t_serialized) {
711     KMP_MB();
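    // Spin (with yielding) until the team's ordered ticket (t_ordered.dt.t_value)
    // equals this thread's tid, i.e. it is our turn to enter the ordered region.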
712     KMP_WAIT_YIELD(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid),
713                    KMP_EQ, NULL);
714     KMP_MB();
715   }
716 #endif /* BUILD_PARALLEL_ORDERED */
717 }
718 
719 /* __kmp_parallel_dxo -- Signal the next task. */
720 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
721   int gtid = *gtid_ref;
722 #ifdef BUILD_PARALLEL_ORDERED
723   int tid = __kmp_tid_from_gtid(gtid);
724   kmp_team_t *team = __kmp_team_from_gtid(gtid);
725 #endif /* BUILD_PARALLEL_ORDERED */
726 
727   if (__kmp_env_consistency_check) {
728     if (__kmp_threads[gtid]->th.th_root->r.r_active)
729       __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
730   }
731 #ifdef BUILD_PARALLEL_ORDERED
732   if (!team->t.t_serialized) {
733     KMP_MB(); /* Flush all pending memory write invalidates.  */
734 
735     /* use the tid of the next thread in this team */
736     /* TODO replace with general release procedure */
737     team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
738 
739     KMP_MB(); /* Flush all pending memory write invalidates.  */
740   }
741 #endif /* BUILD_PARALLEL_ORDERED */
742 }
743 
744 /* ------------------------------------------------------------------------ */
745 /* The BARRIER for a SINGLE process section is always explicit   */
746 
747 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
748   int status;
749   kmp_info_t *th;
750   kmp_team_t *team;
751 
752   if (!TCR_4(__kmp_init_parallel))
753     __kmp_parallel_initialize();
754 
755 #if OMP_50_ENABLED
756   __kmp_resume_if_soft_paused();
757 #endif
758 
759   th = __kmp_threads[gtid];
760   team = th->th.th_team;
761   status = 0;
762 
763   th->th.th_ident = id_ref;
764 
765   if (team->t.t_serialized) {
766     status = 1;
767   } else {
768     kmp_int32 old_this = th->th.th_local.this_construct;
769 
770     ++th->th.th_local.this_construct;
771     /* try to set team count to thread count--success means thread got the
772        single block */
773     /* TODO: Should this be acquire or release? */
774     if (team->t.t_construct == old_this) {
775       status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
776                                               th->th.th_local.this_construct);
777     }
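    // Only the thread whose compare-and-store on t_construct succeeds "wins"
    // the single region; every other thread leaves status == 0 and skips it.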
778 #if USE_ITT_BUILD
779     if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
780         KMP_MASTER_GTID(gtid) &&
781 #if OMP_40_ENABLED
782         th->th.th_teams_microtask == NULL &&
783 #endif
784         team->t.t_active_level ==
785             1) { // Only report metadata by master of active team at level 1
786       __kmp_itt_metadata_single(id_ref);
787     }
788 #endif /* USE_ITT_BUILD */
789   }
790 
791   if (__kmp_env_consistency_check) {
792     if (status && push_ws) {
793       __kmp_push_workshare(gtid, ct_psingle, id_ref);
794     } else {
795       __kmp_check_workshare(gtid, ct_psingle, id_ref);
796     }
797   }
798 #if USE_ITT_BUILD
799   if (status) {
800     __kmp_itt_single_start(gtid);
801   }
802 #endif /* USE_ITT_BUILD */
803   return status;
804 }
805 
806 void __kmp_exit_single(int gtid) {
807 #if USE_ITT_BUILD
808   __kmp_itt_single_end(gtid);
809 #endif /* USE_ITT_BUILD */
810   if (__kmp_env_consistency_check)
811     __kmp_pop_workshare(gtid, ct_psingle, NULL);
812 }
813 
814 /* Determine whether we can go parallel or must use a serialized parallel
815  * region, and how many threads we can use.
816  * set_nthreads is the number of threads requested for the team.
817  * Returns 0 if we should serialize or only use one thread,
818  * otherwise the number of threads to use.
819  * The forkjoin lock is held by the caller. */
820 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
821                                  int master_tid, int set_nthreads
822 #if OMP_40_ENABLED
823                                  ,
824                                  int enter_teams
825 #endif /* OMP_40_ENABLED */
826                                  ) {
827   int capacity;
828   int new_nthreads;
829   KMP_DEBUG_ASSERT(__kmp_init_serial);
830   KMP_DEBUG_ASSERT(root && parent_team);
831 
832   // If dyn-var is set, dynamically adjust the number of desired threads,
833   // according to the method specified by dynamic_mode.
834   new_nthreads = set_nthreads;
835   if (!get__dynamic_2(parent_team, master_tid)) {
836     ;
837   }
838 #ifdef USE_LOAD_BALANCE
839   else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
840     new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
841     if (new_nthreads == 1) {
842       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
843                     "reservation to 1 thread\n",
844                     master_tid));
845       return 1;
846     }
847     if (new_nthreads < set_nthreads) {
848       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
849                     "reservation to %d threads\n",
850                     master_tid, new_nthreads));
851     }
852   }
853 #endif /* USE_LOAD_BALANCE */
854   else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
855     new_nthreads = __kmp_avail_proc - __kmp_nth +
856                    (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
857     if (new_nthreads <= 1) {
858       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
859                     "reservation to 1 thread\n",
860                     master_tid));
861       return 1;
862     }
863     if (new_nthreads < set_nthreads) {
864       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
865                     "reservation to %d threads\n",
866                     master_tid, new_nthreads));
867     } else {
868       new_nthreads = set_nthreads;
869     }
870   } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
871     if (set_nthreads > 2) {
872       new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
873       new_nthreads = (new_nthreads % set_nthreads) + 1;
874       if (new_nthreads == 1) {
875         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
876                       "reservation to 1 thread\n",
877                       master_tid));
878         return 1;
879       }
880       if (new_nthreads < set_nthreads) {
881         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
882                       "reservation to %d threads\n",
883                       master_tid, new_nthreads));
884       }
885     }
886   } else {
887     KMP_ASSERT(0);
888   }
889 
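  // At this point new_nthreads holds the request as adjusted by dyn-var. The
  // checks below clamp it further against KMP_DEVICE_THREAD_LIMIT /
  // KMP_ALL_THREADS, OMP_THREAD_LIMIT, and the remaining capacity of the
  // __kmp_threads array.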
890   // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
891   if (__kmp_nth + new_nthreads -
892           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
893       __kmp_max_nth) {
894     int tl_nthreads = __kmp_max_nth - __kmp_nth +
895                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
896     if (tl_nthreads <= 0) {
897       tl_nthreads = 1;
898     }
899 
900     // If dyn-var is false, emit a 1-time warning.
901     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
902       __kmp_reserve_warn = 1;
903       __kmp_msg(kmp_ms_warning,
904                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
905                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
906     }
907     if (tl_nthreads == 1) {
908       KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
909                     "reduced reservation to 1 thread\n",
910                     master_tid));
911       return 1;
912     }
913     KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
914                   "reservation to %d threads\n",
915                   master_tid, tl_nthreads));
916     new_nthreads = tl_nthreads;
917   }
918 
919   // Respect OMP_THREAD_LIMIT
920   if (root->r.r_cg_nthreads + new_nthreads -
921           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
922       __kmp_cg_max_nth) {
923     int tl_nthreads = __kmp_cg_max_nth - root->r.r_cg_nthreads +
924                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
925     if (tl_nthreads <= 0) {
926       tl_nthreads = 1;
927     }
928 
929     // If dyn-var is false, emit a 1-time warning.
930     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
931       __kmp_reserve_warn = 1;
932       __kmp_msg(kmp_ms_warning,
933                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
934                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
935     }
936     if (tl_nthreads == 1) {
937       KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
938                     "reduced reservation to 1 thread\n",
939                     master_tid));
940       return 1;
941     }
942     KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
943                   "reservation to %d threads\n",
944                   master_tid, tl_nthreads));
945     new_nthreads = tl_nthreads;
946   }
947 
948   // Check if the threads array is large enough, or needs expanding.
949   // See comment in __kmp_register_root() about the adjustment if
950   // __kmp_threads[0] == NULL.
951   capacity = __kmp_threads_capacity;
952   if (TCR_PTR(__kmp_threads[0]) == NULL) {
953     --capacity;
954   }
955   if (__kmp_nth + new_nthreads -
956           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
957       capacity) {
958     // Expand the threads array.
959     int slotsRequired = __kmp_nth + new_nthreads -
960                         (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
961                         capacity;
962     int slotsAdded = __kmp_expand_threads(slotsRequired);
963     if (slotsAdded < slotsRequired) {
964       // The threads array was not expanded enough.
965       new_nthreads -= (slotsRequired - slotsAdded);
966       KMP_ASSERT(new_nthreads >= 1);
967 
968       // If dyn-var is false, emit a 1-time warning.
969       if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
970         __kmp_reserve_warn = 1;
971         if (__kmp_tp_cached) {
972           __kmp_msg(kmp_ms_warning,
973                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
974                     KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
975                     KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
976         } else {
977           __kmp_msg(kmp_ms_warning,
978                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
979                     KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
980         }
981       }
982     }
983   }
984 
985 #ifdef KMP_DEBUG
986   if (new_nthreads == 1) {
987     KC_TRACE(10,
988              ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
989               "dead roots and rechecking; requested %d threads\n",
990               __kmp_get_gtid(), set_nthreads));
991   } else {
992     KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
993                   " %d threads\n",
994                   __kmp_get_gtid(), new_nthreads, set_nthreads));
995   }
996 #endif // KMP_DEBUG
997   return new_nthreads;
998 }
999 
1000 /* Allocate threads from the thread pool and assign them to the new team. We
1001    are assured that there are enough threads available, because we checked
1002    that earlier while holding the forkjoin lock. */
1003 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
1004                                     kmp_info_t *master_th, int master_gtid) {
1005   int i;
1006   int use_hot_team;
1007 
1008   KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
1009   KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
1010   KMP_MB();
1011 
1012   /* first, let's setup the master thread */
1013   master_th->th.th_info.ds.ds_tid = 0;
1014   master_th->th.th_team = team;
1015   master_th->th.th_team_nproc = team->t.t_nproc;
1016   master_th->th.th_team_master = master_th;
1017   master_th->th.th_team_serialized = FALSE;
1018   master_th->th.th_dispatch = &team->t.t_dispatch[0];
1019 
1020 /* make sure we are not the optimized hot team */
1021 #if KMP_NESTED_HOT_TEAMS
1022   use_hot_team = 0;
1023   kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
1024   if (hot_teams) { // hot teams array is not allocated if
1025     // KMP_HOT_TEAMS_MAX_LEVEL=0
1026     int level = team->t.t_active_level - 1; // index in array of hot teams
1027     if (master_th->th.th_teams_microtask) { // are we inside the teams?
1028       if (master_th->th.th_teams_size.nteams > 1) {
1029         ++level; // level was not increased in teams construct for
1030         // team_of_masters
1031       }
1032       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1033           master_th->th.th_teams_level == team->t.t_level) {
1034         ++level; // level was not increased in teams construct for
1035         // team_of_workers before the parallel
1036       } // team->t.t_level will be increased inside parallel
1037     }
1038     if (level < __kmp_hot_teams_max_level) {
1039       if (hot_teams[level].hot_team) {
1040         // hot team has already been allocated for given level
1041         KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
1042         use_hot_team = 1; // the team is ready to use
1043       } else {
1044         use_hot_team = 0; // AC: threads are not allocated yet
1045         hot_teams[level].hot_team = team; // remember new hot team
1046         hot_teams[level].hot_team_nth = team->t.t_nproc;
1047       }
1048     } else {
1049       use_hot_team = 0;
1050     }
1051   }
1052 #else
1053   use_hot_team = team == root->r.r_hot_team;
1054 #endif
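  // If use_hot_team is set, the team's threads were kept alive from a previous
  // parallel region (hot team), so the master install / worker allocation below
  // is skipped entirely.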
1055   if (!use_hot_team) {
1056 
1057     /* install the master thread */
1058     team->t.t_threads[0] = master_th;
1059     __kmp_initialize_info(master_th, team, 0, master_gtid);
1060 
1061     /* now, install the worker threads */
1062     for (i = 1; i < team->t.t_nproc; i++) {
1063 
1064       /* fork or reallocate a new thread and install it in team */
1065       kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1066       team->t.t_threads[i] = thr;
1067       KMP_DEBUG_ASSERT(thr);
1068       KMP_DEBUG_ASSERT(thr->th.th_team == team);
1069       /* align team and thread arrived states */
1070       KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1071                     "T#%d(%d:%d) join =%llu, plain=%llu\n",
1072                     __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1073                     __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1074                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1075                     team->t.t_bar[bs_plain_barrier].b_arrived));
1076 #if OMP_40_ENABLED
1077       thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1078       thr->th.th_teams_level = master_th->th.th_teams_level;
1079       thr->th.th_teams_size = master_th->th.th_teams_size;
1080 #endif
1081       { // Initialize threads' barrier data.
1082         int b;
1083         kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1084         for (b = 0; b < bs_last_barrier; ++b) {
1085           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1086           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1087 #if USE_DEBUGGER
1088           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1089 #endif
1090         }
1091       }
1092     }
1093 
1094 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1095     __kmp_partition_places(team);
1096 #endif
1097   }
1098 
1099 #if OMP_50_ENABLED
1100   if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1101     for (i = 0; i < team->t.t_nproc; i++) {
1102       kmp_info_t *thr = team->t.t_threads[i];
1103       if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1104           thr->th.th_prev_level != team->t.t_level) {
1105         team->t.t_display_affinity = 1;
1106         break;
1107       }
1108     }
1109   }
1110 #endif
1111 
1112   KMP_MB();
1113 }
1114 
1115 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1116 // Propagate any changes to the floating point control registers out to the team
1117 // We try to avoid unnecessary writes to the relevant cache line in the team
1118 // structure, so we don't make changes unless they are needed.
1119 inline static void propagateFPControl(kmp_team_t *team) {
1120   if (__kmp_inherit_fp_control) {
1121     kmp_int16 x87_fpu_control_word;
1122     kmp_uint32 mxcsr;
1123 
1124     // Get master values of FPU control flags (both X87 and vector)
1125     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1126     __kmp_store_mxcsr(&mxcsr);
1127     mxcsr &= KMP_X86_MXCSR_MASK;
1128 
1129     // There is no point looking at t_fp_control_saved here.
1130     // If it is TRUE, we still have to update the values if they are different
1131     // from those we now have. If it is FALSE we didn't save anything yet, but
1132     // our objective is the same. We have to ensure that the values in the team
1133     // are the same as those we have.
1134     // So, this code achieves what we need whether or not t_fp_control_saved is
1135     // true. By checking whether the value needs updating we avoid unnecessary
1136     // writes that would put the cache-line into a written state, causing all
1137     // threads in the team to have to read it again.
1138     KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1139     KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1140     // Although we don't use this value, other code in the runtime wants to know
1141     // whether it should restore them. So we must ensure it is correct.
1142     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1143   } else {
1144     // Similarly here. Don't write to this cache-line in the team structure
1145     // unless we have to.
1146     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1147   }
1148 }
1149 
1150 // Do the opposite, setting the hardware registers to the updated values from
1151 // the team.
1152 inline static void updateHWFPControl(kmp_team_t *team) {
1153   if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1154     // Only reset the fp control regs if they have been changed in the team
1155     // during the parallel region that we are exiting.
1156     kmp_int16 x87_fpu_control_word;
1157     kmp_uint32 mxcsr;
1158     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1159     __kmp_store_mxcsr(&mxcsr);
1160     mxcsr &= KMP_X86_MXCSR_MASK;
1161 
1162     if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1163       __kmp_clear_x87_fpu_status_word();
1164       __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1165     }
1166 
1167     if (team->t.t_mxcsr != mxcsr) {
1168       __kmp_load_mxcsr(&team->t.t_mxcsr);
1169     }
1170   }
1171 }
1172 #else
1173 #define propagateFPControl(x) ((void)0)
1174 #define updateHWFPControl(x) ((void)0)
1175 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1176 
1177 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1178                                      int realloc); // forward declaration
1179 
1180 /* Run a parallel region that has been serialized, so runs only in a team of the
1181    single master thread. */
1182 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1183   kmp_info_t *this_thr;
1184   kmp_team_t *serial_team;
1185 
1186   KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1187 
1188   /* Skip all this code for autopar serialized loops since it results in
1189      unacceptable overhead */
1190   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1191     return;
1192 
1193   if (!TCR_4(__kmp_init_parallel))
1194     __kmp_parallel_initialize();
1195 
1196 #if OMP_50_ENABLED
1197   __kmp_resume_if_soft_paused();
1198 #endif
1199 
1200   this_thr = __kmp_threads[global_tid];
1201   serial_team = this_thr->th.th_serial_team;
1202 
1203   /* utilize the serialized team held by this thread */
1204   KMP_DEBUG_ASSERT(serial_team);
1205   KMP_MB();
1206 
1207   if (__kmp_tasking_mode != tskm_immediate_exec) {
1208     KMP_DEBUG_ASSERT(
1209         this_thr->th.th_task_team ==
1210         this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1211     KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1212                      NULL);
1213     KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1214                   "team %p, new task_team = NULL\n",
1215                   global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1216     this_thr->th.th_task_team = NULL;
1217   }
1218 
1219 #if OMP_40_ENABLED
1220   kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1221   if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1222     proc_bind = proc_bind_false;
1223   } else if (proc_bind == proc_bind_default) {
1224     // No proc_bind clause was specified, so use the current value
1225     // of proc-bind-var for this parallel region.
1226     proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1227   }
1228   // Reset for next parallel region
1229   this_thr->th.th_set_proc_bind = proc_bind_default;
1230 #endif /* OMP_40_ENABLED */
1231 
1232 #if OMPT_SUPPORT
1233   ompt_data_t ompt_parallel_data = ompt_data_none;
1234   ompt_data_t *implicit_task_data;
1235   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1236   if (ompt_enabled.enabled &&
1237       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1238 
1239     ompt_task_info_t *parent_task_info;
1240     parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1241 
1242     parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1243     if (ompt_enabled.ompt_callback_parallel_begin) {
1244       int team_size = 1;
1245 
1246       ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1247           &(parent_task_info->task_data), &(parent_task_info->frame),
1248           &ompt_parallel_data, team_size, ompt_parallel_invoker_program,
1249           codeptr);
1250     }
1251   }
1252 #endif // OMPT_SUPPORT
1253 
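  // If this thread is not already running inside its serial team, set one up
  // (allocating a fresh team if the cached serial team is already in use).
  // Re-entry while already serialized is handled in the else branch below by
  // bumping t_serialized.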
1254   if (this_thr->th.th_team != serial_team) {
1255     // Nested level will be an index in the nested nthreads array
1256     int level = this_thr->th.th_team->t.t_level;
1257 
1258     if (serial_team->t.t_serialized) {
1259       /* this serial team was already used
1260          TODO increase performance by making these locks more specific */
1261       kmp_team_t *new_team;
1262 
1263       __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1264 
1265       new_team = __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1266 #if OMPT_SUPPORT
1267                                      ompt_parallel_data,
1268 #endif
1269 #if OMP_40_ENABLED
1270                                      proc_bind,
1271 #endif
1272                                      &this_thr->th.th_current_task->td_icvs,
1273                                      0 USE_NESTED_HOT_ARG(NULL));
1274       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1275       KMP_ASSERT(new_team);
1276 
1277       /* setup new serialized team and install it */
1278       new_team->t.t_threads[0] = this_thr;
1279       new_team->t.t_parent = this_thr->th.th_team;
1280       serial_team = new_team;
1281       this_thr->th.th_serial_team = serial_team;
1282 
1283       KF_TRACE(
1284           10,
1285           ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1286            global_tid, serial_team));
1287 
1288       /* TODO the above breaks the requirement that if we run out of resources,
1289          then we can still guarantee that serialized teams are ok, since we may
1290          need to allocate a new one */
1291     } else {
1292       KF_TRACE(
1293           10,
1294           ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1295            global_tid, serial_team));
1296     }
1297 
1298     /* we have to initialize this serial team */
1299     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1300     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1301     KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1302     serial_team->t.t_ident = loc;
1303     serial_team->t.t_serialized = 1;
1304     serial_team->t.t_nproc = 1;
1305     serial_team->t.t_parent = this_thr->th.th_team;
1306     serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1307     this_thr->th.th_team = serial_team;
1308     serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1309 
1310     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1311                   this_thr->th.th_current_task));
1312     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1313     this_thr->th.th_current_task->td_flags.executing = 0;
1314 
1315     __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1316 
1317     /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1318        implicit task for each serialized task represented by
1319        team->t.t_serialized? */
1320     copy_icvs(&this_thr->th.th_current_task->td_icvs,
1321               &this_thr->th.th_current_task->td_parent->td_icvs);
1322 
1323     // Thread value exists in the nested nthreads array for the next nested
1324     // level
1325     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1326       this_thr->th.th_current_task->td_icvs.nproc =
1327           __kmp_nested_nth.nth[level + 1];
1328     }
1329 
1330 #if OMP_40_ENABLED
1331     if (__kmp_nested_proc_bind.used &&
1332         (level + 1 < __kmp_nested_proc_bind.used)) {
1333       this_thr->th.th_current_task->td_icvs.proc_bind =
1334           __kmp_nested_proc_bind.bind_types[level + 1];
1335     }
1336 #endif /* OMP_40_ENABLED */
1337 
1338 #if USE_DEBUGGER
1339     serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1340 #endif
1341     this_thr->th.th_info.ds.ds_tid = 0;
1342 
1343     /* set thread cache values */
1344     this_thr->th.th_team_nproc = 1;
1345     this_thr->th.th_team_master = this_thr;
1346     this_thr->th.th_team_serialized = 1;
1347 
1348     serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1349     serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1350 #if OMP_50_ENABLED
1351     serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1352 #endif
1353 
1354     propagateFPControl(serial_team);
1355 
1356     /* check if we need to allocate dispatch buffers stack */
1357     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1358     if (!serial_team->t.t_dispatch->th_disp_buffer) {
1359       serial_team->t.t_dispatch->th_disp_buffer =
1360           (dispatch_private_info_t *)__kmp_allocate(
1361               sizeof(dispatch_private_info_t));
1362     }
1363     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1364 
1365     KMP_MB();
1366 
1367   } else {
1368     /* this serialized team is already being used,
1369      * that's fine, just add another nested level */
1370     KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1371     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1372     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1373     ++serial_team->t.t_serialized;
1374     this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1375 
1376     // Nested level will be an index in the nested nthreads array
1377     int level = this_thr->th.th_team->t.t_level;
1378     // Thread value exists in the nested nthreads array for the next nested
1379     // level
1380     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1381       this_thr->th.th_current_task->td_icvs.nproc =
1382           __kmp_nested_nth.nth[level + 1];
1383     }
1384     serial_team->t.t_level++;
1385     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1386                   "of serial team %p to %d\n",
1387                   global_tid, serial_team, serial_team->t.t_level));
1388 
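    // Each nested serialized level gets its own dispatch_private_info_t,
    // pushed onto the team's dispatch-buffer stack (linked through ->next) and
    // popped again when the corresponding serialized region ends.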
1389     /* allocate/push dispatch buffers stack */
1390     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1391     {
1392       dispatch_private_info_t *disp_buffer =
1393           (dispatch_private_info_t *)__kmp_allocate(
1394               sizeof(dispatch_private_info_t));
1395       disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1396       serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1397     }
1398     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1399 
1400     KMP_MB();
1401   }
1402 #if OMP_40_ENABLED
1403   KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1404 #endif
1405 
1406 #if OMP_50_ENABLED
1407   // Perform the display affinity functionality for
1408   // serialized parallel regions
1409   if (__kmp_display_affinity) {
1410     if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1411         this_thr->th.th_prev_num_threads != 1) {
1412       // NULL means use the affinity-format-var ICV
1413       __kmp_aux_display_affinity(global_tid, NULL);
1414       this_thr->th.th_prev_level = serial_team->t.t_level;
1415       this_thr->th.th_prev_num_threads = 1;
1416     }
1417   }
1418 #endif
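
  /* Illustration (a sketch, not part of the runtime logic): the display-
     affinity block above typically fires when the user enables affinity
     display, e.g. with the OpenMP 5.0 environment control
         OMP_DISPLAY_AFFINITY=TRUE ./a.out
     Passing NULL to __kmp_aux_display_affinity() selects the current
     affinity-format-var ICV instead of an explicit format string. */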
1419 
1420   if (__kmp_env_consistency_check)
1421     __kmp_push_parallel(global_tid, NULL);
1422 #if OMPT_SUPPORT
1423   serial_team->t.ompt_team_info.master_return_address = codeptr;
1424   if (ompt_enabled.enabled &&
1425       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
        OMPT_GET_FRAME_ADDRESS(0);
1427 
1428     ompt_lw_taskteam_t lw_taskteam;
1429     __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1430                             &ompt_parallel_data, codeptr);
1431 
1432     __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
    // don't use lw_taskteam after linking. content was swapped
1434 
1435     /* OMPT implicit task begin */
1436     implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
1437     if (ompt_enabled.ompt_callback_implicit_task) {
1438       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1439           ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
          OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
          ompt_task_implicit); // TODO: Can this be ompt_task_initial?
      OMPT_CUR_TASK_INFO(this_thr)->thread_num =
          __kmp_tid_from_gtid(global_tid);
1443     }
1444 
1445     /* OMPT state */
1446     this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
        OMPT_GET_FRAME_ADDRESS(0);
1448   }
1449 #endif
1450 }
1451 
/* Most of the work for a fork. */
/* Returns TRUE if we really went parallel, FALSE if serialized. */
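/* For reference, a sketch of the usual call path (assumed, not taken from this
   file): a construct such as

       #pragma omp parallel
       { body(); }

   is lowered by the compiler to a call like
       __kmpc_fork_call(&loc, nargs, microtask, shared_vars...);
   and __kmpc_fork_call() forwards here with call_context == fork_context_intel
   (the GOMP compatibility entry points use fork_context_gnu). */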
1454 int __kmp_fork_call(ident_t *loc, int gtid,
1455                     enum fork_context_e call_context, // Intel, GNU, ...
1456                     kmp_int32 argc, microtask_t microtask, launch_t invoker,
1457 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1458 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1459                     va_list *ap
1460 #else
1461                     va_list ap
1462 #endif
1463                     ) {
1464   void **argv;
1465   int i;
1466   int master_tid;
1467   int master_this_cons;
1468   kmp_team_t *team;
1469   kmp_team_t *parent_team;
1470   kmp_info_t *master_th;
1471   kmp_root_t *root;
1472   int nthreads;
1473   int master_active;
1474   int master_set_numthreads;
1475   int level;
1476 #if OMP_40_ENABLED
1477   int active_level;
1478   int teams_level;
1479 #endif
1480 #if KMP_NESTED_HOT_TEAMS
1481   kmp_hot_team_ptr_t **p_hot_teams;
1482 #endif
1483   { // KMP_TIME_BLOCK
1484     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1485     KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1486 
1487     KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1488     if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
      /* Some systems prefer the stack for the root thread(s) to start with
         some gap from the parent stack to prevent false sharing. */
1491       void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1492       /* These 2 lines below are so this does not get optimized out */
1493       if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1494         __kmp_stkpadding += (short)((kmp_int64)dummy);
1495     }
1496 
1497     /* initialize if needed */
1498     KMP_DEBUG_ASSERT(
1499         __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1500     if (!TCR_4(__kmp_init_parallel))
1501       __kmp_parallel_initialize();
1502 
1503 #if OMP_50_ENABLED
1504     __kmp_resume_if_soft_paused();
1505 #endif
1506 
1507     /* setup current data */
1508     master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1509     // shutdown
1510     parent_team = master_th->th.th_team;
1511     master_tid = master_th->th.th_info.ds.ds_tid;
1512     master_this_cons = master_th->th.th_local.this_construct;
1513     root = master_th->th.th_root;
1514     master_active = root->r.r_active;
1515     master_set_numthreads = master_th->th.th_set_nproc;
1516 
1517 #if OMPT_SUPPORT
1518     ompt_data_t ompt_parallel_data = ompt_data_none;
1519     ompt_data_t *parent_task_data;
1520     ompt_frame_t *ompt_frame;
1521     ompt_data_t *implicit_task_data;
1522     void *return_address = NULL;
1523 
1524     if (ompt_enabled.enabled) {
1525       __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1526                                     NULL, NULL);
1527       return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1528     }
1529 #endif
1530 
1531     // Nested level will be an index in the nested nthreads array
1532     level = parent_team->t.t_level;
1533     // used to launch non-serial teams even if nested is not allowed
1534     active_level = parent_team->t.t_active_level;
1535 #if OMP_40_ENABLED
1536     // needed to check nesting inside the teams
1537     teams_level = master_th->th.th_teams_level;
1538 #endif
1539 #if KMP_NESTED_HOT_TEAMS
1540     p_hot_teams = &master_th->th.th_hot_teams;
1541     if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1542       *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1543           sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1544       (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
      // it is either the actual hot team or not needed (when active_level > 0)
1546       (*p_hot_teams)[0].hot_team_nth = 1;
1547     }
1548 #endif
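
    /* Illustration (a sketch, assuming the usual KMP_* controls of this
       runtime): the lazy allocation above only matters when nested hot teams
       are enabled, e.g.
           KMP_HOT_TEAMS_MAX_LEVEL=2 KMP_HOT_TEAMS_MODE=1 ./a.out
       so that teams up to __kmp_hot_teams_max_level keep their workers "hot"
       (parked rather than released) between parallel regions. */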
1549 
1550 #if OMPT_SUPPORT
1551     if (ompt_enabled.enabled) {
1552       if (ompt_enabled.ompt_callback_parallel_begin) {
1553         int team_size = master_set_numthreads
1554                             ? master_set_numthreads
1555                             : get__nproc_2(parent_team, master_tid);
1556         ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1557             parent_task_data, ompt_frame, &ompt_parallel_data, team_size,
1558             OMPT_INVOKER(call_context), return_address);
1559       }
1560       master_th->th.ompt_thread_info.state = ompt_state_overhead;
1561     }
1562 #endif
1563 
1564     master_th->th.th_ident = loc;
1565 
1566 #if OMP_40_ENABLED
1567     if (master_th->th.th_teams_microtask && ap &&
1568         microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1569       // AC: This is start of parallel that is nested inside teams construct.
1570       // The team is actual (hot), all workers are ready at the fork barrier.
1571       // No lock needed to initialize the team a bit, then free workers.
1572       parent_team->t.t_ident = loc;
1573       __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1574       parent_team->t.t_argc = argc;
1575       argv = (void **)parent_team->t.t_argv;
1576       for (i = argc - 1; i >= 0; --i)
1577 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1578 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1579         *argv++ = va_arg(*ap, void *);
1580 #else
1581         *argv++ = va_arg(ap, void *);
1582 #endif
      // Increment our nested depth levels, but do not increase the
      // serialization count
1584       if (parent_team == master_th->th.th_serial_team) {
1585         // AC: we are in serialized parallel
1586         __kmpc_serialized_parallel(loc, gtid);
1587         KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
        // AC: need this so that enquiry functions work
        // correctly; will restore at join time
1590         parent_team->t.t_serialized--;
1591 #if OMPT_SUPPORT
1592         void *dummy;
1593         void **exit_runtime_p;
1594 
1595         ompt_lw_taskteam_t lw_taskteam;
1596 
1597         if (ompt_enabled.enabled) {
1598           __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1599                                   &ompt_parallel_data, return_address);
1600           exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1601 
1602           __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
          // don't use lw_taskteam after linking. content was swapped
1604 
1605           /* OMPT implicit task begin */
1606           implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1607           if (ompt_enabled.ompt_callback_implicit_task) {
1608             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1609                 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
                implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
                ompt_task_implicit); // TODO: Can this be ompt_task_initial?
            OMPT_CUR_TASK_INFO(master_th)->thread_num =
                __kmp_tid_from_gtid(gtid);
1613           }
1614 
1615           /* OMPT state */
1616           master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1617         } else {
1618           exit_runtime_p = &dummy;
1619         }
1620 #endif
1621 
1622         {
1623           KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1624           KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1625           __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1626 #if OMPT_SUPPORT
1627                                  ,
1628                                  exit_runtime_p
1629 #endif
1630                                  );
1631         }
1632 
1633 #if OMPT_SUPPORT
1634         *exit_runtime_p = NULL;
1635         if (ompt_enabled.enabled) {
1636           OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1637           if (ompt_enabled.ompt_callback_implicit_task) {
1638             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1639                 ompt_scope_end, NULL, implicit_task_data, 1,
                OMPT_CUR_TASK_INFO(master_th)->thread_num,
                ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1641           }
1642           __ompt_lw_taskteam_unlink(master_th);
1643 
1644           if (ompt_enabled.ompt_callback_parallel_end) {
1645             ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1646                 OMPT_CUR_TEAM_DATA(master_th), OMPT_CUR_TASK_DATA(master_th),
1647                 OMPT_INVOKER(call_context), return_address);
1648           }
1649           master_th->th.ompt_thread_info.state = ompt_state_overhead;
1650         }
1651 #endif
1652         return TRUE;
1653       }
1654 
1655       parent_team->t.t_pkfn = microtask;
1656       parent_team->t.t_invoke = invoker;
1657       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1658       parent_team->t.t_active_level++;
1659       parent_team->t.t_level++;
1660 #if OMP_50_ENABLED
1661       parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1662 #endif
1663 
1664       /* Change number of threads in the team if requested */
1665       if (master_set_numthreads) { // The parallel has num_threads clause
1666         if (master_set_numthreads < master_th->th.th_teams_size.nth) {
          // AC: can only reduce the number of threads dynamically; cannot
          // increase it
1668           kmp_info_t **other_threads = parent_team->t.t_threads;
1669           parent_team->t.t_nproc = master_set_numthreads;
1670           for (i = 0; i < master_set_numthreads; ++i) {
1671             other_threads[i]->th.th_team_nproc = master_set_numthreads;
1672           }
1673           // Keep extra threads hot in the team for possible next parallels
1674         }
1675         master_th->th.th_set_nproc = 0;
1676       }
1677 
1678 #if USE_DEBUGGER
1679       if (__kmp_debugging) { // Let debugger override number of threads.
1680         int nth = __kmp_omp_num_threads(loc);
1681         if (nth > 0) { // 0 means debugger doesn't want to change num threads
1682           master_set_numthreads = nth;
1683         }
1684       }
1685 #endif
1686 
1687       KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1688                     "master_th=%p, gtid=%d\n",
1689                     root, parent_team, master_th, gtid));
1690       __kmp_internal_fork(loc, gtid, parent_team);
1691       KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1692                     "master_th=%p, gtid=%d\n",
1693                     root, parent_team, master_th, gtid));
1694 
1695       /* Invoke microtask for MASTER thread */
1696       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1697                     parent_team->t.t_id, parent_team->t.t_pkfn));
1698 
1699       if (!parent_team->t.t_invoke(gtid)) {
1700         KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
1701       }
1702       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1703                     parent_team->t.t_id, parent_team->t.t_pkfn));
1704       KMP_MB(); /* Flush all pending memory write invalidates.  */
1705 
1706       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1707 
1708       return TRUE;
1709     } // Parallel closely nested in teams construct
1710 #endif /* OMP_40_ENABLED */
1711 
1712 #if KMP_DEBUG
1713     if (__kmp_tasking_mode != tskm_immediate_exec) {
1714       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1715                        parent_team->t.t_task_team[master_th->th.th_task_state]);
1716     }
1717 #endif
1718 
1719     if (parent_team->t.t_active_level >=
1720         master_th->th.th_current_task->td_icvs.max_active_levels) {
1721       nthreads = 1;
1722     } else {
1723 #if OMP_40_ENABLED
1724       int enter_teams = ((ap == NULL && active_level == 0) ||
1725                          (ap && teams_level > 0 && teams_level == level));
1726 #endif
1727       nthreads =
1728           master_set_numthreads
1729               ? master_set_numthreads
1730               : get__nproc_2(
1731                     parent_team,
1732                     master_tid); // TODO: get nproc directly from current task
1733 
      // Check whether we need to take the forkjoin lock (there is no need for
      // a serialized parallel outside of a teams construct). This code was
      // moved here from __kmp_reserve_threads() to speed up nested serialized
      // parallels.
1737       if (nthreads > 1) {
1738         if ((!get__nested(master_th) && (root->r.r_in_parallel
1739 #if OMP_40_ENABLED
1740                                          && !enter_teams
1741 #endif /* OMP_40_ENABLED */
1742                                          )) ||
1743             (__kmp_library == library_serial)) {
1744           KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1745                         " threads\n",
1746                         gtid, nthreads));
1747           nthreads = 1;
1748         }
1749       }
1750       if (nthreads > 1) {
1751         /* determine how many new threads we can use */
1752         __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1753         nthreads = __kmp_reserve_threads(
1754             root, parent_team, master_tid, nthreads
1755 #if OMP_40_ENABLED
            /* AC: If we execute teams from a parallel region (on the host),
               then the teams should be created, but each can only have 1
               thread if nesting is disabled. If teams is called from a serial
               region, then the teams and their threads should be created
               regardless of the nesting setting. */
1761             ,
1762             enter_teams
1763 #endif /* OMP_40_ENABLED */
1764             );
1765         if (nthreads == 1) {
1766           // Free lock for single thread execution here; for multi-thread
1767           // execution it will be freed later after team of threads created
1768           // and initialized
1769           __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1770         }
1771       }
1772     }
1773     KMP_DEBUG_ASSERT(nthreads > 0);
1774 
1775     // If we temporarily changed the set number of threads then restore it now
1776     master_th->th.th_set_nproc = 0;
1777 
1778     /* create a serialized parallel region? */
1779     if (nthreads == 1) {
1780 /* josh todo: hypothetical question: what do we do for OS X*? */
1781 #if KMP_OS_LINUX &&                                                            \
1782     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1783       void *args[argc];
1784 #else
1785       void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1786 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1787           KMP_ARCH_AARCH64) */
1788 
1789       KA_TRACE(20,
1790                ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1791 
1792       __kmpc_serialized_parallel(loc, gtid);
1793 
1794       if (call_context == fork_context_intel) {
1795         /* TODO this sucks, use the compiler itself to pass args! :) */
1796         master_th->th.th_serial_team->t.t_ident = loc;
1797 #if OMP_40_ENABLED
1798         if (!ap) {
1799           // revert change made in __kmpc_serialized_parallel()
1800           master_th->th.th_serial_team->t.t_level--;
1801 // Get args from parent team for teams construct
1802 
1803 #if OMPT_SUPPORT
1804           void *dummy;
1805           void **exit_runtime_p;
1806           ompt_task_info_t *task_info;
1807 
1808           ompt_lw_taskteam_t lw_taskteam;
1809 
1810           if (ompt_enabled.enabled) {
1811             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1812                                     &ompt_parallel_data, return_address);
1813 
1814             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
            // don't use lw_taskteam after linking. content was swapped
1816 
1817             task_info = OMPT_CUR_TASK_INFO(master_th);
1818             exit_runtime_p = &(task_info->frame.exit_frame.ptr);
1819             if (ompt_enabled.ompt_callback_implicit_task) {
1820               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1821                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
                  &(task_info->task_data), 1, __kmp_tid_from_gtid(gtid),
                  ompt_task_implicit); // TODO: Can this be ompt_task_initial?
              OMPT_CUR_TASK_INFO(master_th)->thread_num =
                  __kmp_tid_from_gtid(gtid);
1825             }
1826 
1827             /* OMPT state */
1828             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1829           } else {
1830             exit_runtime_p = &dummy;
1831           }
1832 #endif
1833 
1834           {
1835             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1836             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1837             __kmp_invoke_microtask(microtask, gtid, 0, argc,
1838                                    parent_team->t.t_argv
1839 #if OMPT_SUPPORT
1840                                    ,
1841                                    exit_runtime_p
1842 #endif
1843                                    );
1844           }
1845 
1846 #if OMPT_SUPPORT
1847           if (ompt_enabled.enabled) {
1848             exit_runtime_p = NULL;
1849             if (ompt_enabled.ompt_callback_implicit_task) {
1850               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1851                   ompt_scope_end, NULL, &(task_info->task_data), 1,
                  OMPT_CUR_TASK_INFO(master_th)->thread_num,
                  ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1853             }
1854 
1855             __ompt_lw_taskteam_unlink(master_th);
1856             if (ompt_enabled.ompt_callback_parallel_end) {
1857               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1858                   OMPT_CUR_TEAM_DATA(master_th), parent_task_data,
1859                   OMPT_INVOKER(call_context), return_address);
1860             }
1861             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1862           }
1863 #endif
1864         } else if (microtask == (microtask_t)__kmp_teams_master) {
1865           KMP_DEBUG_ASSERT(master_th->th.th_team ==
1866                            master_th->th.th_serial_team);
1867           team = master_th->th.th_team;
1868           // team->t.t_pkfn = microtask;
1869           team->t.t_invoke = invoker;
1870           __kmp_alloc_argv_entries(argc, team, TRUE);
1871           team->t.t_argc = argc;
1872           argv = (void **)team->t.t_argv;
1873           if (ap) {
1874             for (i = argc - 1; i >= 0; --i)
1875 // TODO: revert workaround for Intel(R) 64 tracker #96
1876 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1877               *argv++ = va_arg(*ap, void *);
1878 #else
1879               *argv++ = va_arg(ap, void *);
1880 #endif
1881           } else {
1882             for (i = 0; i < argc; ++i)
1883               // Get args from parent team for teams construct
1884               argv[i] = parent_team->t.t_argv[i];
1885           }
1886           // AC: revert change made in __kmpc_serialized_parallel()
1887           //     because initial code in teams should have level=0
1888           team->t.t_level--;
1889           // AC: call special invoker for outer "parallel" of teams construct
1890           invoker(gtid);
1891         } else {
1892 #endif /* OMP_40_ENABLED */
1893           argv = args;
1894           for (i = argc - 1; i >= 0; --i)
1895 // TODO: revert workaround for Intel(R) 64 tracker #96
1896 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1897             *argv++ = va_arg(*ap, void *);
1898 #else
1899           *argv++ = va_arg(ap, void *);
1900 #endif
1901           KMP_MB();
1902 
1903 #if OMPT_SUPPORT
1904           void *dummy;
1905           void **exit_runtime_p;
1906           ompt_task_info_t *task_info;
1907 
1908           ompt_lw_taskteam_t lw_taskteam;
1909 
1910           if (ompt_enabled.enabled) {
1911             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1912                                     &ompt_parallel_data, return_address);
1913             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
            // don't use lw_taskteam after linking. content was swapped
1915             task_info = OMPT_CUR_TASK_INFO(master_th);
1916             exit_runtime_p = &(task_info->frame.exit_frame.ptr);
1917 
1918             /* OMPT implicit task begin */
1919             implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1920             if (ompt_enabled.ompt_callback_implicit_task) {
1921               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1922                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
                  implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
                  ompt_task_implicit); // TODO: Can this be ompt_task_initial?
              OMPT_CUR_TASK_INFO(master_th)->thread_num =
                  __kmp_tid_from_gtid(gtid);
1926             }
1927 
1928             /* OMPT state */
1929             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1930           } else {
1931             exit_runtime_p = &dummy;
1932           }
1933 #endif
1934 
1935           {
1936             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1937             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1938             __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1939 #if OMPT_SUPPORT
1940                                    ,
1941                                    exit_runtime_p
1942 #endif
1943                                    );
1944           }
1945 
1946 #if OMPT_SUPPORT
1947           if (ompt_enabled.enabled) {
1948             *exit_runtime_p = NULL;
1949             if (ompt_enabled.ompt_callback_implicit_task) {
1950               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1951                   ompt_scope_end, NULL, &(task_info->task_data), 1,
                  OMPT_CUR_TASK_INFO(master_th)->thread_num,
                  ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1953             }
1954 
1955             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1956             __ompt_lw_taskteam_unlink(master_th);
1957             if (ompt_enabled.ompt_callback_parallel_end) {
1958               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1959                   &ompt_parallel_data, parent_task_data,
1960                   OMPT_INVOKER(call_context), return_address);
1961             }
1962             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1963           }
1964 #endif
1965 #if OMP_40_ENABLED
1966         }
1967 #endif /* OMP_40_ENABLED */
1968       } else if (call_context == fork_context_gnu) {
1969 #if OMPT_SUPPORT
1970         ompt_lw_taskteam_t lwt;
1971         __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1972                                 return_address);
1973 
1974         lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1975         __ompt_lw_taskteam_link(&lwt, master_th, 1);
// don't use lw_taskteam after linking. content was swapped
1977 #endif
1978 
1979         // we were called from GNU native code
1980         KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1981         return FALSE;
1982       } else {
1983         KMP_ASSERT2(call_context < fork_context_last,
1984                     "__kmp_fork_call: unknown fork_context parameter");
1985       }
1986 
1987       KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1988       KMP_MB();
1989       return FALSE;
1990     } // if (nthreads == 1)
1991 
1992     // GEH: only modify the executing flag in the case when not serialized
1993     //      serialized case is handled in kmpc_serialized_parallel
1994     KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1995                   "curtask=%p, curtask_max_aclevel=%d\n",
1996                   parent_team->t.t_active_level, master_th,
1997                   master_th->th.th_current_task,
1998                   master_th->th.th_current_task->td_icvs.max_active_levels));
1999     // TODO: GEH - cannot do this assertion because root thread not set up as
2000     // executing
2001     // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
2002     master_th->th.th_current_task->td_flags.executing = 0;
2003 
2004 #if OMP_40_ENABLED
2005     if (!master_th->th.th_teams_microtask || level > teams_level)
2006 #endif /* OMP_40_ENABLED */
2007     {
2008       /* Increment our nested depth level */
2009       KMP_ATOMIC_INC(&root->r.r_in_parallel);
2010     }
2011 
2012     // See if we need to make a copy of the ICVs.
2013     int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
2014     if ((level + 1 < __kmp_nested_nth.used) &&
2015         (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
2016       nthreads_icv = __kmp_nested_nth.nth[level + 1];
2017     } else {
2018       nthreads_icv = 0; // don't update
2019     }
2020 
2021 #if OMP_40_ENABLED
2022     // Figure out the proc_bind_policy for the new team.
2023     kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
2024     kmp_proc_bind_t proc_bind_icv =
2025         proc_bind_default; // proc_bind_default means don't update
2026     if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
2027       proc_bind = proc_bind_false;
2028     } else {
2029       if (proc_bind == proc_bind_default) {
2030         // No proc_bind clause specified; use current proc-bind-var for this
2031         // parallel region
2032         proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
2033       }
2034       /* else: The proc_bind policy was specified explicitly on parallel clause.
2035          This overrides proc-bind-var for this parallel region, but does not
2036          change proc-bind-var. */
2037       // Figure the value of proc-bind-var for the child threads.
2038       if ((level + 1 < __kmp_nested_proc_bind.used) &&
2039           (__kmp_nested_proc_bind.bind_types[level + 1] !=
2040            master_th->th.th_current_task->td_icvs.proc_bind)) {
2041         proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2042       }
2043     }
2044 
2045     // Reset for next parallel region
2046     master_th->th.th_set_proc_bind = proc_bind_default;
2047 #endif /* OMP_40_ENABLED */
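
    /* Illustration (a sketch, not from this file): with
           OMP_PROC_BIND=spread,close
       __kmp_nested_proc_bind.bind_types holds {spread, close}; an outermost
           #pragma omp parallel proc_bind(master)
       then runs this region with proc_bind == master (the clause wins), while
       proc_bind_icv == close is what the child threads inherit as their
       proc-bind-var for the next nesting level. */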
2048 
2049     if ((nthreads_icv > 0)
2050 #if OMP_40_ENABLED
2051         || (proc_bind_icv != proc_bind_default)
2052 #endif /* OMP_40_ENABLED */
2053             ) {
2054       kmp_internal_control_t new_icvs;
2055       copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2056       new_icvs.next = NULL;
2057       if (nthreads_icv > 0) {
2058         new_icvs.nproc = nthreads_icv;
2059       }
2060 
2061 #if OMP_40_ENABLED
2062       if (proc_bind_icv != proc_bind_default) {
2063         new_icvs.proc_bind = proc_bind_icv;
2064       }
2065 #endif /* OMP_40_ENABLED */
2066 
2067       /* allocate a new parallel team */
2068       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2069       team = __kmp_allocate_team(root, nthreads, nthreads,
2070 #if OMPT_SUPPORT
2071                                  ompt_parallel_data,
2072 #endif
2073 #if OMP_40_ENABLED
2074                                  proc_bind,
2075 #endif
2076                                  &new_icvs, argc USE_NESTED_HOT_ARG(master_th));
2077     } else {
2078       /* allocate a new parallel team */
2079       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2080       team = __kmp_allocate_team(root, nthreads, nthreads,
2081 #if OMPT_SUPPORT
2082                                  ompt_parallel_data,
2083 #endif
2084 #if OMP_40_ENABLED
2085                                  proc_bind,
2086 #endif
2087                                  &master_th->th.th_current_task->td_icvs,
2088                                  argc USE_NESTED_HOT_ARG(master_th));
2089     }
2090     KF_TRACE(
2091         10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2092 
2093     /* setup the new team */
2094     KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2095     KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2096     KMP_CHECK_UPDATE(team->t.t_ident, loc);
2097     KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2098     KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2099 #if OMPT_SUPPORT
2100     KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2101                           return_address);
2102 #endif
2103     KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2104 // TODO: parent_team->t.t_level == INT_MAX ???
2105 #if OMP_40_ENABLED
2106     if (!master_th->th.th_teams_microtask || level > teams_level) {
2107 #endif /* OMP_40_ENABLED */
2108       int new_level = parent_team->t.t_level + 1;
2109       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2110       new_level = parent_team->t.t_active_level + 1;
2111       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2112 #if OMP_40_ENABLED
2113     } else {
2114       // AC: Do not increase parallel level at start of the teams construct
2115       int new_level = parent_team->t.t_level;
2116       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2117       new_level = parent_team->t.t_active_level;
2118       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2119     }
2120 #endif /* OMP_40_ENABLED */
2121     kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2122     // set master's schedule as new run-time schedule
2123     KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2124 
2125 #if OMP_40_ENABLED
2126     KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2127 #endif
2128 #if OMP_50_ENABLED
2129     KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2130 #endif
2131 
2132     // Update the floating point rounding in the team if required.
2133     propagateFPControl(team);
2134 
2135     if (__kmp_tasking_mode != tskm_immediate_exec) {
      // Set master's task team to the team's task team. Unless this is a hot
      // team, it should be NULL.
2138       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2139                        parent_team->t.t_task_team[master_th->th.th_task_state]);
2140       KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
2141                     "%p, new task_team %p / team %p\n",
2142                     __kmp_gtid_from_thread(master_th),
2143                     master_th->th.th_task_team, parent_team,
2144                     team->t.t_task_team[master_th->th.th_task_state], team));
2145 
2146       if (active_level || master_th->th.th_task_team) {
2147         // Take a memo of master's task_state
2148         KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2149         if (master_th->th.th_task_state_top >=
2150             master_th->th.th_task_state_stack_sz) { // increase size
2151           kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2152           kmp_uint8 *old_stack, *new_stack;
2153           kmp_uint32 i;
2154           new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2155           for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2156             new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2157           }
2158           for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2159                ++i) { // zero-init rest of stack
2160             new_stack[i] = 0;
2161           }
2162           old_stack = master_th->th.th_task_state_memo_stack;
2163           master_th->th.th_task_state_memo_stack = new_stack;
2164           master_th->th.th_task_state_stack_sz = new_size;
2165           __kmp_free(old_stack);
2166         }
2167         // Store master's task_state on stack
2168         master_th->th
2169             .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2170             master_th->th.th_task_state;
2171         master_th->th.th_task_state_top++;
2172 #if KMP_NESTED_HOT_TEAMS
2173         if (master_th->th.th_hot_teams &&
2174             active_level < __kmp_hot_teams_max_level &&
2175             team == master_th->th.th_hot_teams[active_level].hot_team) {
2176           // Restore master's nested state if nested hot team
2177           master_th->th.th_task_state =
2178               master_th->th
2179                   .th_task_state_memo_stack[master_th->th.th_task_state_top];
2180         } else {
2181 #endif
2182           master_th->th.th_task_state = 0;
2183 #if KMP_NESTED_HOT_TEAMS
2184         }
2185 #endif
2186       }
2187 #if !KMP_NESTED_HOT_TEAMS
2188       KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2189                        (team == root->r.r_hot_team));
2190 #endif
2191     }
2192 
2193     KA_TRACE(
2194         20,
2195         ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2196          gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2197          team->t.t_nproc));
2198     KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2199                      (team->t.t_master_tid == 0 &&
2200                       (team->t.t_parent == root->r.r_root_team ||
2201                        team->t.t_parent->t.t_serialized)));
2202     KMP_MB();
2203 
2204     /* now, setup the arguments */
2205     argv = (void **)team->t.t_argv;
2206 #if OMP_40_ENABLED
2207     if (ap) {
2208 #endif /* OMP_40_ENABLED */
2209       for (i = argc - 1; i >= 0; --i) {
2210 // TODO: revert workaround for Intel(R) 64 tracker #96
2211 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
2212         void *new_argv = va_arg(*ap, void *);
2213 #else
2214       void *new_argv = va_arg(ap, void *);
2215 #endif
2216         KMP_CHECK_UPDATE(*argv, new_argv);
2217         argv++;
2218       }
2219 #if OMP_40_ENABLED
2220     } else {
2221       for (i = 0; i < argc; ++i) {
2222         // Get args from parent team for teams construct
2223         KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2224       }
2225     }
2226 #endif /* OMP_40_ENABLED */
2227 
2228     /* now actually fork the threads */
2229     KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2230     if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2231       root->r.r_active = TRUE;
2232 
2233     __kmp_fork_team_threads(root, team, master_th, gtid);
2234     __kmp_setup_icv_copy(team, nthreads,
2235                          &master_th->th.th_current_task->td_icvs, loc);
2236 
2237 #if OMPT_SUPPORT
2238     master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2239 #endif
2240 
2241     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2242 
2243 #if USE_ITT_BUILD
2244     if (team->t.t_active_level == 1 // only report frames at level 1
2245 #if OMP_40_ENABLED
2246         && !master_th->th.th_teams_microtask // not in teams construct
2247 #endif /* OMP_40_ENABLED */
2248         ) {
2249 #if USE_ITT_NOTIFY
2250       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2251           (__kmp_forkjoin_frames_mode == 3 ||
2252            __kmp_forkjoin_frames_mode == 1)) {
2253         kmp_uint64 tmp_time = 0;
2254         if (__itt_get_timestamp_ptr)
2255           tmp_time = __itt_get_timestamp();
2256         // Internal fork - report frame begin
2257         master_th->th.th_frame_time = tmp_time;
2258         if (__kmp_forkjoin_frames_mode == 3)
2259           team->t.t_region_time = tmp_time;
2260       } else
2261 // only one notification scheme (either "submit" or "forking/joined", not both)
2262 #endif /* USE_ITT_NOTIFY */
2263           if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2264               __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2265         // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2266         __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2267       }
2268     }
2269 #endif /* USE_ITT_BUILD */
2270 
2271     /* now go on and do the work */
2272     KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2273     KMP_MB();
2274     KF_TRACE(10,
2275              ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2276               root, team, master_th, gtid));
2277 
2278 #if USE_ITT_BUILD
2279     if (__itt_stack_caller_create_ptr) {
2280       team->t.t_stack_id =
2281           __kmp_itt_stack_caller_create(); // create new stack stitching id
2282       // before entering fork barrier
2283     }
2284 #endif /* USE_ITT_BUILD */
2285 
2286 #if OMP_40_ENABLED
2287     // AC: skip __kmp_internal_fork at teams construct, let only master
2288     // threads execute
2289     if (ap)
2290 #endif /* OMP_40_ENABLED */
2291     {
2292       __kmp_internal_fork(loc, gtid, team);
2293       KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2294                     "master_th=%p, gtid=%d\n",
2295                     root, team, master_th, gtid));
2296     }
2297 
2298     if (call_context == fork_context_gnu) {
2299       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2300       return TRUE;
2301     }
2302 
2303     /* Invoke microtask for MASTER thread */
2304     KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2305                   team->t.t_id, team->t.t_pkfn));
2306   } // END of timer KMP_fork_call block
2307 
2308   if (!team->t.t_invoke(gtid)) {
2309     KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
2310   }
2311   KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2312                 team->t.t_id, team->t.t_pkfn));
2313   KMP_MB(); /* Flush all pending memory write invalidates.  */
2314 
2315   KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2316 
2317 #if OMPT_SUPPORT
2318   if (ompt_enabled.enabled) {
2319     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2320   }
2321 #endif
2322 
2323   return TRUE;
2324 }
2325 
2326 #if OMPT_SUPPORT
2327 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2328                                             kmp_team_t *team) {
2329   // restore state outside the region
2330   thread->th.ompt_thread_info.state =
2331       ((team->t.t_serialized) ? ompt_state_work_serial
2332                               : ompt_state_work_parallel);
2333 }
2334 
2335 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2336                                    kmp_team_t *team, ompt_data_t *parallel_data,
2337                                    fork_context_e fork_context, void *codeptr) {
2338   ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2339   if (ompt_enabled.ompt_callback_parallel_end) {
2340     ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2341         parallel_data, &(task_info->task_data), OMPT_INVOKER(fork_context),
2342         codeptr);
2343   }
2344 
2345   task_info->frame.enter_frame = ompt_data_none;
2346   __kmp_join_restore_state(thread, team);
2347 }
2348 #endif
2349 
2350 void __kmp_join_call(ident_t *loc, int gtid
2351 #if OMPT_SUPPORT
2352                      ,
2353                      enum fork_context_e fork_context
2354 #endif
2355 #if OMP_40_ENABLED
2356                      ,
2357                      int exit_teams
2358 #endif /* OMP_40_ENABLED */
2359                      ) {
2360   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2361   kmp_team_t *team;
2362   kmp_team_t *parent_team;
2363   kmp_info_t *master_th;
2364   kmp_root_t *root;
2365   int master_active;
2366   int i;
2367 
2368   KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2369 
2370   /* setup current data */
2371   master_th = __kmp_threads[gtid];
2372   root = master_th->th.th_root;
2373   team = master_th->th.th_team;
2374   parent_team = team->t.t_parent;
2375 
2376   master_th->th.th_ident = loc;
2377 
2378 #if OMPT_SUPPORT
2379   if (ompt_enabled.enabled) {
2380     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2381   }
2382 #endif
2383 
2384 #if KMP_DEBUG
2385   if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2386     KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2387                   "th_task_team = %p\n",
2388                   __kmp_gtid_from_thread(master_th), team,
2389                   team->t.t_task_team[master_th->th.th_task_state],
2390                   master_th->th.th_task_team));
2391     KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2392                      team->t.t_task_team[master_th->th.th_task_state]);
2393   }
2394 #endif
2395 
2396   if (team->t.t_serialized) {
2397 #if OMP_40_ENABLED
2398     if (master_th->th.th_teams_microtask) {
2399       // We are in teams construct
2400       int level = team->t.t_level;
2401       int tlevel = master_th->th.th_teams_level;
2402       if (level == tlevel) {
2403         // AC: we haven't incremented it earlier at start of teams construct,
2404         //     so do it here - at the end of teams construct
2405         team->t.t_level++;
2406       } else if (level == tlevel + 1) {
2407         // AC: we are exiting parallel inside teams, need to increment
2408         // serialization in order to restore it in the next call to
2409         // __kmpc_end_serialized_parallel
2410         team->t.t_serialized++;
2411       }
2412     }
2413 #endif /* OMP_40_ENABLED */
2414     __kmpc_end_serialized_parallel(loc, gtid);
2415 
2416 #if OMPT_SUPPORT
2417     if (ompt_enabled.enabled) {
2418       __kmp_join_restore_state(master_th, parent_team);
2419     }
2420 #endif
2421 
2422     return;
2423   }
2424 
2425   master_active = team->t.t_master_active;
2426 
2427 #if OMP_40_ENABLED
2428   if (!exit_teams)
2429 #endif /* OMP_40_ENABLED */
2430   {
    // AC: No barrier for internal teams at exit from the teams construct,
    //     but there is a barrier for the external team (league).
2433     __kmp_internal_join(loc, gtid, team);
2434   }
2435 #if OMP_40_ENABLED
2436   else {
    // AC: no tasking in teams (outside of any parallel region)
    master_th->th.th_task_state = 0;
2439   }
2440 #endif /* OMP_40_ENABLED */
2441 
2442   KMP_MB();
2443 
2444 #if OMPT_SUPPORT
2445   ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2446   void *codeptr = team->t.ompt_team_info.master_return_address;
2447 #endif
2448 
2449 #if USE_ITT_BUILD
2450   if (__itt_stack_caller_create_ptr) {
2451     __kmp_itt_stack_caller_destroy(
2452         (__itt_caller)team->t
2453             .t_stack_id); // destroy the stack stitching id after join barrier
2454   }
2455 
2456   // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2457   if (team->t.t_active_level == 1
2458 #if OMP_40_ENABLED
2459       && !master_th->th.th_teams_microtask /* not in teams construct */
2460 #endif /* OMP_40_ENABLED */
2461       ) {
2462     master_th->th.th_ident = loc;
2463     // only one notification scheme (either "submit" or "forking/joined", not
2464     // both)
2465     if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2466         __kmp_forkjoin_frames_mode == 3)
2467       __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2468                              master_th->th.th_frame_time, 0, loc,
2469                              master_th->th.th_team_nproc, 1);
2470     else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2471              !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2472       __kmp_itt_region_joined(gtid);
2473   } // active_level == 1
2474 #endif /* USE_ITT_BUILD */
2475 
2476 #if OMP_40_ENABLED
2477   if (master_th->th.th_teams_microtask && !exit_teams &&
2478       team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2479       team->t.t_level == master_th->th.th_teams_level + 1) {
2480     // AC: We need to leave the team structure intact at the end of parallel
2481     // inside the teams construct, so that at the next parallel same (hot) team
2482     // works, only adjust nesting levels
2483 
2484     /* Decrement our nested depth level */
2485     team->t.t_level--;
2486     team->t.t_active_level--;
2487     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2488 
2489     /* Restore number of threads in the team if needed */
2490     if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2491       int old_num = master_th->th.th_team_nproc;
2492       int new_num = master_th->th.th_teams_size.nth;
2493       kmp_info_t **other_threads = team->t.t_threads;
2494       team->t.t_nproc = new_num;
2495       for (i = 0; i < old_num; ++i) {
2496         other_threads[i]->th.th_team_nproc = new_num;
2497       }
2498       // Adjust states of non-used threads of the team
2499       for (i = old_num; i < new_num; ++i) {
2500         // Re-initialize thread's barrier data.
2501         int b;
2502         kmp_balign_t *balign = other_threads[i]->th.th_bar;
2503         for (b = 0; b < bs_last_barrier; ++b) {
2504           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2505           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2506 #if USE_DEBUGGER
2507           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2508 #endif
2509         }
2510         if (__kmp_tasking_mode != tskm_immediate_exec) {
2511           // Synchronize thread's task state
2512           other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2513         }
2514       }
2515     }
2516 
2517 #if OMPT_SUPPORT
2518     if (ompt_enabled.enabled) {
2519       __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
2520                       codeptr);
2521     }
2522 #endif
2523 
2524     return;
2525   }
2526 #endif /* OMP_40_ENABLED */
2527 
2528   /* do cleanup and restore the parent team */
2529   master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2530   master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2531 
2532   master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2533 
2534   /* jc: The following lock has instructions with REL and ACQ semantics,
2535      separating the parallel user code called in this parallel region
2536      from the serial user code called after this function returns. */
2537   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2538 
2539 #if OMP_40_ENABLED
2540   if (!master_th->th.th_teams_microtask ||
2541       team->t.t_level > master_th->th.th_teams_level)
2542 #endif /* OMP_40_ENABLED */
2543   {
2544     /* Decrement our nested depth level */
2545     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2546   }
2547   KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2548 
2549 #if OMPT_SUPPORT
2550   if (ompt_enabled.enabled) {
2551     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2552     if (ompt_enabled.ompt_callback_implicit_task) {
2553       int ompt_team_size = team->t.t_nproc;
2554       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2555           ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
          OMPT_CUR_TASK_INFO(master_th)->thread_num,
          ompt_task_implicit); // TODO: Can this be ompt_task_initial?
2557     }
2558 
2559     task_info->frame.exit_frame = ompt_data_none;
2560     task_info->task_data = ompt_data_none;
2561   }
2562 #endif
2563 
2564   KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2565                 master_th, team));
2566   __kmp_pop_current_task_from_thread(master_th);
2567 
2568 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
2569   // Restore master thread's partition.
2570   master_th->th.th_first_place = team->t.t_first_place;
2571   master_th->th.th_last_place = team->t.t_last_place;
2572 #endif /* OMP_40_ENABLED */
2573 #if OMP_50_ENABLED
2574   master_th->th.th_def_allocator = team->t.t_def_allocator;
2575 #endif
2576 
2577   updateHWFPControl(team);
2578 
2579   if (root->r.r_active != master_active)
2580     root->r.r_active = master_active;
2581 
2582   __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2583                             master_th)); // this will free worker threads
2584 
  /* This race was fun to find. Make sure the following stays inside the
     critical region; otherwise assertions may fail occasionally because the
     old team may be reallocated and the hierarchy appears inconsistent. It is
     actually safe to run and won't cause any bugs, but it will trigger those
     assertion failures. It's only one dereference & assignment, so we might as
     well keep it in the critical region. */
2590   master_th->th.th_team = parent_team;
2591   master_th->th.th_team_nproc = parent_team->t.t_nproc;
2592   master_th->th.th_team_master = parent_team->t.t_threads[0];
2593   master_th->th.th_team_serialized = parent_team->t.t_serialized;
2594 
2595   /* restore serialized team, if need be */
2596   if (parent_team->t.t_serialized &&
2597       parent_team != master_th->th.th_serial_team &&
2598       parent_team != root->r.r_root_team) {
2599     __kmp_free_team(root,
2600                     master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2601     master_th->th.th_serial_team = parent_team;
2602   }
2603 
2604   if (__kmp_tasking_mode != tskm_immediate_exec) {
2605     if (master_th->th.th_task_state_top >
2606         0) { // Restore task state from memo stack
2607       KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2608       // Remember master's state if we re-use this nested hot team
2609       master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2610           master_th->th.th_task_state;
2611       --master_th->th.th_task_state_top; // pop
2612       // Now restore state at this level
2613       master_th->th.th_task_state =
2614           master_th->th
2615               .th_task_state_memo_stack[master_th->th.th_task_state_top];
2616     }
2617     // Copy the task team from the parent team to the master thread
2618     master_th->th.th_task_team =
2619         parent_team->t.t_task_team[master_th->th.th_task_state];
2620     KA_TRACE(20,
2621              ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2622               __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2623               parent_team));
2624   }
2625 
2626   // TODO: GEH - cannot do this assertion because root thread not set up as
2627   // executing
2628   // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2629   master_th->th.th_current_task->td_flags.executing = 1;
2630 
2631   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2632 
2633 #if OMPT_SUPPORT
2634   if (ompt_enabled.enabled) {
2635     __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
2636                     codeptr);
2637   }
2638 #endif
2639 
2640   KMP_MB();
2641   KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2642 }
2643 
2644 /* Check whether we should push an internal control record onto the
2645    serial team stack.  If so, do it.  */
2646 void __kmp_save_internal_controls(kmp_info_t *thread) {
2647 
2648   if (thread->th.th_team != thread->th.th_serial_team) {
2649     return;
2650   }
2651   if (thread->th.th_team->t.t_serialized > 1) {
2652     int push = 0;
2653 
2654     if (thread->th.th_team->t.t_control_stack_top == NULL) {
2655       push = 1;
2656     } else {
2657       if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2658           thread->th.th_team->t.t_serialized) {
2659         push = 1;
2660       }
2661     }
2662     if (push) { /* push a record on the serial team's stack */
2663       kmp_internal_control_t *control =
2664           (kmp_internal_control_t *)__kmp_allocate(
2665               sizeof(kmp_internal_control_t));
2666 
2667       copy_icvs(control, &thread->th.th_current_task->td_icvs);
2668 
2669       control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2670 
2671       control->next = thread->th.th_team->t.t_control_stack_top;
2672       thread->th.th_team->t.t_control_stack_top = control;
2673     }
2674   }
2675 }
2676 
2677 /* Changes set_nproc */
2678 void __kmp_set_num_threads(int new_nth, int gtid) {
2679   kmp_info_t *thread;
2680   kmp_root_t *root;
2681 
2682   KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2683   KMP_DEBUG_ASSERT(__kmp_init_serial);
2684 
2685   if (new_nth < 1)
2686     new_nth = 1;
2687   else if (new_nth > __kmp_max_nth)
2688     new_nth = __kmp_max_nth;
2689 
2690   KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2691   thread = __kmp_threads[gtid];
2692   if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2693     return; // nothing to do
2694 
2695   __kmp_save_internal_controls(thread);
2696 
2697   set__nproc(thread, new_nth);
2698 
2699   // If this omp_set_num_threads() call will cause the hot team size to be
2700   // reduced (in the absence of a num_threads clause), then reduce it now,
2701   // rather than waiting for the next parallel region.
2702   root = thread->th.th_root;
2703   if (__kmp_init_parallel && (!root->r.r_active) &&
2704       (root->r.r_hot_team->t.t_nproc > new_nth)
2705 #if KMP_NESTED_HOT_TEAMS
2706       && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2707 #endif
2708       ) {
2709     kmp_team_t *hot_team = root->r.r_hot_team;
2710     int f;
2711 
2712     __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2713 
2714     // Release the extra threads we don't need any more.
2715     for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2716       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2717       if (__kmp_tasking_mode != tskm_immediate_exec) {
2718         // When decreasing team size, threads no longer in the team should unref
2719         // task team.
2720         hot_team->t.t_threads[f]->th.th_task_team = NULL;
2721       }
2722       __kmp_free_thread(hot_team->t.t_threads[f]);
2723       hot_team->t.t_threads[f] = NULL;
2724     }
2725     hot_team->t.t_nproc = new_nth;
2726 #if KMP_NESTED_HOT_TEAMS
2727     if (thread->th.th_hot_teams) {
2728       KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2729       thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2730     }
2731 #endif
2732 
2733     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2734 
2735     // Update the t_nproc field in the threads that are still active.
2736     for (f = 0; f < new_nth; f++) {
2737       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2738       hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2739     }
    // Special flag to mark that the size change came from an
    // omp_set_num_threads() call
2741     hot_team->t.t_size_changed = -1;
2742   }
2743 }
2744 
2745 /* Changes max_active_levels */
2746 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2747   kmp_info_t *thread;
2748 
2749   KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2750                 "%d = (%d)\n",
2751                 gtid, max_active_levels));
2752   KMP_DEBUG_ASSERT(__kmp_init_serial);
2753 
2754   // validate max_active_levels
2755   if (max_active_levels < 0) {
2756     KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2757     // We ignore this call if the user has specified a negative value.
2758     // The current setting won't be changed. The last valid setting will be
2759     // used. A warning will be issued (if warnings are allowed as controlled by
2760     // the KMP_WARNINGS env var).
2761     KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2762                   "max_active_levels for thread %d = (%d)\n",
2763                   gtid, max_active_levels));
2764     return;
2765   }
2766   if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2767     // it's OK, the max_active_levels is within the valid range: [ 0;
2768     // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2769     // We allow a zero value. (implementation defined behavior)
2770   } else {
2771     KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2772                 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2773     max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2774     // Current upper limit is MAX_INT. (implementation defined behavior)
2775     // If the input exceeds the upper limit, we correct the input to be the
2776     // upper limit. (implementation defined behavior)
2777     // In practice, this branch is unreachable as long as the limit is MAX_INT.
2778   }
2779   KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2780                 "max_active_levels for thread %d = (%d)\n",
2781                 gtid, max_active_levels));
2782 
2783   thread = __kmp_threads[gtid];
2784 
2785   __kmp_save_internal_controls(thread);
2786 
2787   set__max_active_levels(thread, max_active_levels);
2788 }
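
// Illustrative user-level view (comment-only sketch):
//
//   omp_set_max_active_levels(-1); // ignored with a warning; previous value kept
//   omp_set_max_active_levels(2);  // at most two nested parallel levels active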
2789 
2790 /* Gets max_active_levels */
2791 int __kmp_get_max_active_levels(int gtid) {
2792   kmp_info_t *thread;
2793 
2794   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2795   KMP_DEBUG_ASSERT(__kmp_init_serial);
2796 
2797   thread = __kmp_threads[gtid];
2798   KMP_DEBUG_ASSERT(thread->th.th_current_task);
2799   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2800                 "curtask_maxaclevel=%d\n",
2801                 gtid, thread->th.th_current_task,
2802                 thread->th.th_current_task->td_icvs.max_active_levels));
2803   return thread->th.th_current_task->td_icvs.max_active_levels;
2804 }
2805 
2806 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2807 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2808   kmp_info_t *thread;
2809   //    kmp_team_t *team;
2810 
2811   KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2812                 gtid, (int)kind, chunk));
2813   KMP_DEBUG_ASSERT(__kmp_init_serial);
2814 
2815   // Check if the kind parameter is valid, correct if needed.
2816   // Valid parameters should fit in one of two intervals - standard or extended:
2817   //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2818   // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
2819   if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2820       (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2821     // TODO: Hint needs attention in case we change the default schedule.
2822     __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2823               KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2824               __kmp_msg_null);
2825     kind = kmp_sched_default;
2826     chunk = 0; // ignore chunk value in case of bad kind
2827   }
2828 
2829   thread = __kmp_threads[gtid];
2830 
2831   __kmp_save_internal_controls(thread);
2832 
2833   if (kind < kmp_sched_upper_std) {
2834     if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2835       // differentiate static chunked vs. unchunked: the chunk should be invalid
2836       // to indicate an unchunked schedule (which is the default)
2837       thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2838     } else {
2839       thread->th.th_current_task->td_icvs.sched.r_sched_type =
2840           __kmp_sch_map[kind - kmp_sched_lower - 1];
2841     }
2842   } else {
2843     //    __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2844     //    kmp_sched_lower - 2 ];
2845     thread->th.th_current_task->td_icvs.sched.r_sched_type =
2846         __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2847                       kmp_sched_lower - 2];
2848   }
2849   if (kind == kmp_sched_auto || chunk < 1) {
2850     // ignore parameter chunk for schedule auto
2851     thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2852   } else {
2853     thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2854   }
2855 }
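
// Illustrative user-level view (comment-only sketch; the omp_sched_* values are
// assumed to map onto kmp_sched_t as described above):
//
//   omp_set_schedule(omp_sched_dynamic, 4); // run-time schedule: dynamic, chunk 4
//   omp_set_schedule(omp_sched_auto, 7);    // chunk argument ignored for auto
//   omp_set_schedule(omp_sched_static, 0);  // non-positive chunk: unchunked static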
2856 
2857 /* Gets def_sched_var ICV values */
2858 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2859   kmp_info_t *thread;
2860   enum sched_type th_type;
2861 
2862   KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2863   KMP_DEBUG_ASSERT(__kmp_init_serial);
2864 
2865   thread = __kmp_threads[gtid];
2866 
2867   th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2868 
2869   switch (th_type) {
2870   case kmp_sch_static:
2871   case kmp_sch_static_greedy:
2872   case kmp_sch_static_balanced:
2873     *kind = kmp_sched_static;
2874     *chunk = 0; // chunk was not set; report zero to reflect that fact
2875     return;
2876   case kmp_sch_static_chunked:
2877     *kind = kmp_sched_static;
2878     break;
2879   case kmp_sch_dynamic_chunked:
2880     *kind = kmp_sched_dynamic;
2881     break;
2882   case kmp_sch_guided_chunked:
2883   case kmp_sch_guided_iterative_chunked:
2884   case kmp_sch_guided_analytical_chunked:
2885     *kind = kmp_sched_guided;
2886     break;
2887   case kmp_sch_auto:
2888     *kind = kmp_sched_auto;
2889     break;
2890   case kmp_sch_trapezoidal:
2891     *kind = kmp_sched_trapezoidal;
2892     break;
2893 #if KMP_STATIC_STEAL_ENABLED
2894   case kmp_sch_static_steal:
2895     *kind = kmp_sched_static_steal;
2896     break;
2897 #endif
2898   default:
2899     KMP_FATAL(UnknownSchedulingType, th_type);
2900   }
2901 
2902   *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2903 }
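
// Illustrative round trip (comment-only sketch):
//
//   omp_sched_t kind; int chunk;
//   omp_set_schedule(omp_sched_static, 0); // unchunked static
//   omp_get_schedule(&kind, &chunk);       // kind == omp_sched_static, chunk == 0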
2904 
2905 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2906 
2907   int ii, dd;
2908   kmp_team_t *team;
2909   kmp_info_t *thr;
2910 
2911   KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2912   KMP_DEBUG_ASSERT(__kmp_init_serial);
2913 
2914   // validate level
2915   if (level == 0)
2916     return 0;
2917   if (level < 0)
2918     return -1;
2919   thr = __kmp_threads[gtid];
2920   team = thr->th.th_team;
2921   ii = team->t.t_level;
2922   if (level > ii)
2923     return -1;
2924 
2925 #if OMP_40_ENABLED
2926   if (thr->th.th_teams_microtask) {
2927     // AC: we are in teams region where multiple nested teams have same level
2928     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2929     if (level <=
2930         tlevel) { // otherwise usual algorithm works (will not touch the teams)
2931       KMP_DEBUG_ASSERT(ii >= tlevel);
2932       // AC: As we need to pass by the teams league, we need to artificially
2933       // increase ii
2934       if (ii == tlevel) {
2935         ii += 2; // three teams have same level
2936       } else {
2937         ii++; // two teams have same level
2938       }
2939     }
2940   }
2941 #endif
2942 
2943   if (ii == level)
2944     return __kmp_tid_from_gtid(gtid);
2945 
2946   dd = team->t.t_serialized;
2947   level++;
2948   while (ii > level) {
2949     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2950     }
2951     if ((team->t.t_serialized) && (!dd)) {
2952       team = team->t.t_parent;
2953       continue;
2954     }
2955     if (ii > level) {
2956       team = team->t.t_parent;
2957       dd = team->t.t_serialized;
2958       ii--;
2959     }
2960   }
2961 
2962   return (dd > 1) ? (0) : (team->t.t_master_tid);
2963 }
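
// Illustrative user-level view (comment-only sketch; assumes the standard
// omp_get_ancestor_thread_num() entry point forwards here), from inside a
// parallel region:
//
//   omp_get_ancestor_thread_num(0);                   // always 0
//   omp_get_ancestor_thread_num(omp_get_level());     // == omp_get_thread_num()
//   omp_get_ancestor_thread_num(omp_get_level() + 1); // -1: no such level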
2964 
2965 int __kmp_get_team_size(int gtid, int level) {
2966 
2967   int ii, dd;
2968   kmp_team_t *team;
2969   kmp_info_t *thr;
2970 
2971   KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2972   KMP_DEBUG_ASSERT(__kmp_init_serial);
2973 
2974   // validate level
2975   if (level == 0)
2976     return 1;
2977   if (level < 0)
2978     return -1;
2979   thr = __kmp_threads[gtid];
2980   team = thr->th.th_team;
2981   ii = team->t.t_level;
2982   if (level > ii)
2983     return -1;
2984 
2985 #if OMP_40_ENABLED
2986   if (thr->th.th_teams_microtask) {
2987     // AC: we are in teams region where multiple nested teams have same level
2988     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2989     if (level <=
2990         tlevel) { // otherwise usual algorithm works (will not touch the teams)
2991       KMP_DEBUG_ASSERT(ii >= tlevel);
2992       // AC: As we need to pass by the teams league, we need to artificially
2993       // increase ii
2994       if (ii == tlevel) {
2995         ii += 2; // three teams have same level
2996       } else {
2997         ii++; // two teams have same level
2998       }
2999     }
3000   }
3001 #endif
3002 
3003   while (ii > level) {
3004     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3005     }
3006     if (team->t.t_serialized && (!dd)) {
3007       team = team->t.t_parent;
3008       continue;
3009     }
3010     if (ii > level) {
3011       team = team->t.t_parent;
3012       ii--;
3013     }
3014   }
3015 
3016   return team->t.t_nproc;
3017 }
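
// Illustrative user-level view (comment-only sketch; assumes the standard
// omp_get_team_size() entry point forwards here):
//
//   omp_get_team_size(0);                   // always 1 (the initial team)
//   omp_get_team_size(omp_get_level());     // == omp_get_num_threads()
//   omp_get_team_size(omp_get_level() + 1); // -1: no such level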
3018 
3019 kmp_r_sched_t __kmp_get_schedule_global() {
3020   // This routine exists because the pairs (__kmp_sched, __kmp_chunk) and
3021   // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
3022   // independently, so the up-to-date schedule can be obtained here.
3023 
3024   kmp_r_sched_t r_sched;
3025 
3026   // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
3027   // __kmp_guided. __kmp_sched should keep original value, so that user can set
3028   // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
3029   // different roots (even in OMP 2.5)
3030   if (__kmp_sched == kmp_sch_static) {
3031     // replace STATIC with more detailed schedule (balanced or greedy)
3032     r_sched.r_sched_type = __kmp_static;
3033   } else if (__kmp_sched == kmp_sch_guided_chunked) {
3034     // replace GUIDED with more detailed schedule (iterative or analytical)
3035     r_sched.r_sched_type = __kmp_guided;
3036   } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
3037     r_sched.r_sched_type = __kmp_sched;
3038   }
3039 
3040   if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
3041     // __kmp_chunk may be invalid here (if it was never set)
3042     r_sched.chunk = KMP_DEFAULT_CHUNK;
3043   } else {
3044     r_sched.chunk = __kmp_chunk;
3045   }
3046 
3047   return r_sched;
3048 }
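
// Worked example (comment-only): if __kmp_sched == kmp_sch_guided_chunked and
// __kmp_guided == kmp_sch_guided_analytical_chunked, the returned r_sched_type
// is the refined kmp_sch_guided_analytical_chunked; and if __kmp_chunk was
// never set (< KMP_DEFAULT_CHUNK), the returned chunk falls back to
// KMP_DEFAULT_CHUNK.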
3049 
3050 /* Allocate (realloc == FALSE) or reallocate (realloc == TRUE) at least argc
3051    *t_argv entries for the requested team. */
3052 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3053 
3054   KMP_DEBUG_ASSERT(team);
3055   if (!realloc || argc > team->t.t_max_argc) {
3056 
3057     KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3058                    "current entries=%d\n",
3059                    team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3060     /* if previously allocated heap space for args, free them */
3061     if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3062       __kmp_free((void *)team->t.t_argv);
3063 
3064     if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3065       /* use unused space in the cache line for arguments */
3066       team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3067       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3068                      "argv entries\n",
3069                      team->t.t_id, team->t.t_max_argc));
3070       team->t.t_argv = &team->t.t_inline_argv[0];
3071       if (__kmp_storage_map) {
3072         __kmp_print_storage_map_gtid(
3073             -1, &team->t.t_inline_argv[0],
3074             &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3075             (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3076             team->t.t_id);
3077       }
3078     } else {
3079       /* allocate space for arguments in the heap */
3080       team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3081                                ? KMP_MIN_MALLOC_ARGV_ENTRIES
3082                                : 2 * argc;
3083       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3084                      "argv entries\n",
3085                      team->t.t_id, team->t.t_max_argc));
3086       team->t.t_argv =
3087           (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3088       if (__kmp_storage_map) {
3089         __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3090                                      &team->t.t_argv[team->t.t_max_argc],
3091                                      sizeof(void *) * team->t.t_max_argc,
3092                                      "team_%d.t_argv", team->t.t_id);
3093       }
3094     }
3095   }
3096 }
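
// Sizing sketch (comment-only): argc <= KMP_INLINE_ARGV_ENTRIES uses the inline
// buffer in the team structure; otherwise the heap capacity is effectively
// max(KMP_MIN_MALLOC_ARGV_ENTRIES, 2 * argc). For example, with an illustrative
// KMP_MIN_MALLOC_ARGV_ENTRIES of 100: argc == 30 -> 100 entries, argc == 80 ->
// 160 entries.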
3097 
3098 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3099   int i;
3100   int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3101   team->t.t_threads =
3102       (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3103   team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3104       sizeof(dispatch_shared_info_t) * num_disp_buff);
3105   team->t.t_dispatch =
3106       (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3107   team->t.t_implicit_task_taskdata =
3108       (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3109   team->t.t_max_nproc = max_nth;
3110 
3111   /* setup dispatch buffers */
3112   for (i = 0; i < num_disp_buff; ++i) {
3113     team->t.t_disp_buffer[i].buffer_index = i;
3114 #if OMP_45_ENABLED
3115     team->t.t_disp_buffer[i].doacross_buf_idx = i;
3116 #endif
3117   }
3118 }
3119 
3120 static void __kmp_free_team_arrays(kmp_team_t *team) {
3121   /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3122   int i;
3123   for (i = 0; i < team->t.t_max_nproc; ++i) {
3124     if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3125       __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3126       team->t.t_dispatch[i].th_disp_buffer = NULL;
3127     }
3128   }
3129 #if KMP_USE_HIER_SCHED
3130   __kmp_dispatch_free_hierarchies(team);
3131 #endif
3132   __kmp_free(team->t.t_threads);
3133   __kmp_free(team->t.t_disp_buffer);
3134   __kmp_free(team->t.t_dispatch);
3135   __kmp_free(team->t.t_implicit_task_taskdata);
3136   team->t.t_threads = NULL;
3137   team->t.t_disp_buffer = NULL;
3138   team->t.t_dispatch = NULL;
3139   team->t.t_implicit_task_taskdata = 0;
3140 }
3141 
3142 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3143   kmp_info_t **oldThreads = team->t.t_threads;
3144 
3145   __kmp_free(team->t.t_disp_buffer);
3146   __kmp_free(team->t.t_dispatch);
3147   __kmp_free(team->t.t_implicit_task_taskdata);
3148   __kmp_allocate_team_arrays(team, max_nth);
3149 
3150   KMP_MEMCPY(team->t.t_threads, oldThreads,
3151              team->t.t_nproc * sizeof(kmp_info_t *));
3152 
3153   __kmp_free(oldThreads);
3154 }
3155 
3156 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3157 
3158   kmp_r_sched_t r_sched =
3159       __kmp_get_schedule_global(); // get current state of scheduling globals
3160 
3161 #if OMP_40_ENABLED
3162   KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3163 #endif /* OMP_40_ENABLED */
3164 
3165   kmp_internal_control_t g_icvs = {
3166     0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3167     (kmp_int8)__kmp_dflt_nested, // int nested; //internal control
3168     // for nested parallelism (per thread)
3169     (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3170     // adjustment of threads (per thread)
3171     (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3172     // whether blocktime is explicitly set
3173     __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3174 #if KMP_USE_MONITOR
3175     __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3176 // intervals
3177 #endif
3178     __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3179     // next parallel region (per thread)
3180     // (use a maximum upper bound if __kmp_parallel_initialize has not run yet)
3181     __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3182     // for max_active_levels
3183     r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3184 // {sched,chunk} pair
3185 #if OMP_40_ENABLED
3186     __kmp_nested_proc_bind.bind_types[0],
3187     __kmp_default_device,
3188 #endif /* OMP_40_ENABLED */
3189     NULL // struct kmp_internal_control *next;
3190   };
3191 
3192   return g_icvs;
3193 }
3194 
3195 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3196 
3197   kmp_internal_control_t gx_icvs;
3198   gx_icvs.serial_nesting_level =
3199       0; // probably =team->t.t_serial like in save_inter_controls
3200   copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3201   gx_icvs.next = NULL;
3202 
3203   return gx_icvs;
3204 }
3205 
3206 static void __kmp_initialize_root(kmp_root_t *root) {
3207   int f;
3208   kmp_team_t *root_team;
3209   kmp_team_t *hot_team;
3210   int hot_team_max_nth;
3211   kmp_r_sched_t r_sched =
3212       __kmp_get_schedule_global(); // get current state of scheduling globals
3213   kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3214   KMP_DEBUG_ASSERT(root);
3215   KMP_ASSERT(!root->r.r_begin);
3216 
3217   /* setup the root state structure */
3218   __kmp_init_lock(&root->r.r_begin_lock);
3219   root->r.r_begin = FALSE;
3220   root->r.r_active = FALSE;
3221   root->r.r_in_parallel = 0;
3222   root->r.r_blocktime = __kmp_dflt_blocktime;
3223   root->r.r_nested = __kmp_dflt_nested;
3224   root->r.r_cg_nthreads = 1;
3225 
3226   /* setup the root team for this task */
3227   /* allocate the root team structure */
3228   KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3229 
3230   root_team =
3231       __kmp_allocate_team(root,
3232                           1, // new_nproc
3233                           1, // max_nproc
3234 #if OMPT_SUPPORT
3235                           ompt_data_none, // root parallel id
3236 #endif
3237 #if OMP_40_ENABLED
3238                           __kmp_nested_proc_bind.bind_types[0],
3239 #endif
3240                           &r_icvs,
3241                           0 // argc
3242                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3243                           );
3244 #if USE_DEBUGGER
3245   // Non-NULL value should be assigned to make the debugger display the root
3246   // team.
3247   TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3248 #endif
3249 
3250   KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3251 
3252   root->r.r_root_team = root_team;
3253   root_team->t.t_control_stack_top = NULL;
3254 
3255   /* initialize root team */
3256   root_team->t.t_threads[0] = NULL;
3257   root_team->t.t_nproc = 1;
3258   root_team->t.t_serialized = 1;
3259   // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3260   root_team->t.t_sched.sched = r_sched.sched;
3261   KA_TRACE(
3262       20,
3263       ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3264        root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3265 
3266   /* setup the  hot team for this task */
3267   /* allocate the hot team structure */
3268   KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3269 
3270   hot_team =
3271       __kmp_allocate_team(root,
3272                           1, // new_nproc
3273                           __kmp_dflt_team_nth_ub * 2, // max_nproc
3274 #if OMPT_SUPPORT
3275                           ompt_data_none, // root parallel id
3276 #endif
3277 #if OMP_40_ENABLED
3278                           __kmp_nested_proc_bind.bind_types[0],
3279 #endif
3280                           &r_icvs,
3281                           0 // argc
3282                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3283                           );
3284   KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3285 
3286   root->r.r_hot_team = hot_team;
3287   root_team->t.t_control_stack_top = NULL;
3288 
3289   /* first-time initialization */
3290   hot_team->t.t_parent = root_team;
3291 
3292   /* initialize hot team */
3293   hot_team_max_nth = hot_team->t.t_max_nproc;
3294   for (f = 0; f < hot_team_max_nth; ++f) {
3295     hot_team->t.t_threads[f] = NULL;
3296   }
3297   hot_team->t.t_nproc = 1;
3298   // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3299   hot_team->t.t_sched.sched = r_sched.sched;
3300   hot_team->t.t_size_changed = 0;
3301 }
3302 
3303 #ifdef KMP_DEBUG
3304 
3305 typedef struct kmp_team_list_item {
3306   kmp_team_p const *entry;
3307   struct kmp_team_list_item *next;
3308 } kmp_team_list_item_t;
3309 typedef kmp_team_list_item_t *kmp_team_list_t;
3310 
3311 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3312     kmp_team_list_t list, // List of teams.
3313     kmp_team_p const *team // Team to add.
3314     ) {
3315 
3316   // List must terminate with item where both entry and next are NULL.
3317   // Team is added to the list only once.
3318   // List is sorted in ascending order by team id.
3319   // Team id is *not* a key.
3320 
3321   kmp_team_list_t l;
3322 
3323   KMP_DEBUG_ASSERT(list != NULL);
3324   if (team == NULL) {
3325     return;
3326   }
3327 
3328   __kmp_print_structure_team_accum(list, team->t.t_parent);
3329   __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3330 
3331   // Search list for the team.
3332   l = list;
3333   while (l->next != NULL && l->entry != team) {
3334     l = l->next;
3335   }
3336   if (l->next != NULL) {
3337     return; // Team has been added before, exit.
3338   }
3339 
3340   // Team is not found. Search list again for insertion point.
3341   l = list;
3342   while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3343     l = l->next;
3344   }
3345 
3346   // Insert team.
3347   {
3348     kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3349         sizeof(kmp_team_list_item_t));
3350     *item = *l;
3351     l->entry = team;
3352     l->next = item;
3353   }
3354 }
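
// Worked example (comment-only): inserting a team with t_id == 5 into the list
// [2, 7, <sentinel>] stops at the node holding 7, copies that node into a fresh
// item, then overwrites the old node in place with 5 and links the copy after
// it, yielding [2, 5, 7, <sentinel>] without needing any back-pointers.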
3355 
3356 static void __kmp_print_structure_team(char const *title,
3357                                        kmp_team_p const *team) {
3359   __kmp_printf("%s", title);
3360   if (team != NULL) {
3361     __kmp_printf("%2x %p\n", team->t.t_id, team);
3362   } else {
3363     __kmp_printf(" - (nil)\n");
3364   }
3365 }
3366 
3367 static void __kmp_print_structure_thread(char const *title,
3368                                          kmp_info_p const *thread) {
3369   __kmp_printf("%s", title);
3370   if (thread != NULL) {
3371     __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3372   } else {
3373     __kmp_printf(" - (nil)\n");
3374   }
3375 }
3376 
3377 void __kmp_print_structure(void) {
3378 
3379   kmp_team_list_t list;
3380 
3381   // Initialize list of teams.
3382   list =
3383       (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3384   list->entry = NULL;
3385   list->next = NULL;
3386 
3387   __kmp_printf("\n------------------------------\nGlobal Thread "
3388                "Table\n------------------------------\n");
3389   {
3390     int gtid;
3391     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3392       __kmp_printf("%2d", gtid);
3393       if (__kmp_threads != NULL) {
3394         __kmp_printf(" %p", __kmp_threads[gtid]);
3395       }
3396       if (__kmp_root != NULL) {
3397         __kmp_printf(" %p", __kmp_root[gtid]);
3398       }
3399       __kmp_printf("\n");
3400     }
3401   }
3402 
3403   // Print out __kmp_threads array.
3404   __kmp_printf("\n------------------------------\nThreads\n--------------------"
3405                "----------\n");
3406   if (__kmp_threads != NULL) {
3407     int gtid;
3408     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3409       kmp_info_t const *thread = __kmp_threads[gtid];
3410       if (thread != NULL) {
3411         __kmp_printf("GTID %2d %p:\n", gtid, thread);
3412         __kmp_printf("    Our Root:        %p\n", thread->th.th_root);
3413         __kmp_print_structure_team("    Our Team:     ", thread->th.th_team);
3414         __kmp_print_structure_team("    Serial Team:  ",
3415                                    thread->th.th_serial_team);
3416         __kmp_printf("    Threads:      %2d\n", thread->th.th_team_nproc);
3417         __kmp_print_structure_thread("    Master:       ",
3418                                      thread->th.th_team_master);
3419         __kmp_printf("    Serialized?:  %2d\n", thread->th.th_team_serialized);
3420         __kmp_printf("    Set NProc:    %2d\n", thread->th.th_set_nproc);
3421 #if OMP_40_ENABLED
3422         __kmp_printf("    Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3423 #endif
3424         __kmp_print_structure_thread("    Next in pool: ",
3425                                      thread->th.th_next_pool);
3426         __kmp_printf("\n");
3427         __kmp_print_structure_team_accum(list, thread->th.th_team);
3428         __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3429       }
3430     }
3431   } else {
3432     __kmp_printf("Threads array is not allocated.\n");
3433   }
3434 
3435   // Print out __kmp_root array.
3436   __kmp_printf("\n------------------------------\nUbers\n----------------------"
3437                "--------\n");
3438   if (__kmp_root != NULL) {
3439     int gtid;
3440     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3441       kmp_root_t const *root = __kmp_root[gtid];
3442       if (root != NULL) {
3443         __kmp_printf("GTID %2d %p:\n", gtid, root);
3444         __kmp_print_structure_team("    Root Team:    ", root->r.r_root_team);
3445         __kmp_print_structure_team("    Hot Team:     ", root->r.r_hot_team);
3446         __kmp_print_structure_thread("    Uber Thread:  ",
3447                                      root->r.r_uber_thread);
3448         __kmp_printf("    Active?:      %2d\n", root->r.r_active);
3449         __kmp_printf("    Nested?:      %2d\n", root->r.r_nested);
3450         __kmp_printf("    In Parallel:  %2d\n",
3451                      KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3452         __kmp_printf("\n");
3453         __kmp_print_structure_team_accum(list, root->r.r_root_team);
3454         __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3455       }
3456     }
3457   } else {
3458     __kmp_printf("Ubers array is not allocated.\n");
3459   }
3460 
3461   __kmp_printf("\n------------------------------\nTeams\n----------------------"
3462                "--------\n");
3463   while (list->next != NULL) {
3464     kmp_team_p const *team = list->entry;
3465     int i;
3466     __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3467     __kmp_print_structure_team("    Parent Team:      ", team->t.t_parent);
3468     __kmp_printf("    Master TID:       %2d\n", team->t.t_master_tid);
3469     __kmp_printf("    Max threads:      %2d\n", team->t.t_max_nproc);
3470     __kmp_printf("    Levels of serial: %2d\n", team->t.t_serialized);
3471     __kmp_printf("    Number threads:   %2d\n", team->t.t_nproc);
3472     for (i = 0; i < team->t.t_nproc; ++i) {
3473       __kmp_printf("    Thread %2d:      ", i);
3474       __kmp_print_structure_thread("", team->t.t_threads[i]);
3475     }
3476     __kmp_print_structure_team("    Next in pool:     ", team->t.t_next_pool);
3477     __kmp_printf("\n");
3478     list = list->next;
3479   }
3480 
3481   // Print out __kmp_thread_pool and __kmp_team_pool.
3482   __kmp_printf("\n------------------------------\nPools\n----------------------"
3483                "--------\n");
3484   __kmp_print_structure_thread("Thread pool:          ",
3485                                CCAST(kmp_info_t *, __kmp_thread_pool));
3486   __kmp_print_structure_team("Team pool:            ",
3487                              CCAST(kmp_team_t *, __kmp_team_pool));
3488   __kmp_printf("\n");
3489 
3490   // Free team list.
3491   while (list != NULL) {
3492     kmp_team_list_item_t *item = list;
3493     list = list->next;
3494     KMP_INTERNAL_FREE(item);
3495   }
3496 }
3497 
3498 #endif
3499 
3500 //---------------------------------------------------------------------------
3501 //  Stuff for per-thread fast random number generator
3502 //  Table of primes
3503 static const unsigned __kmp_primes[] = {
3504     0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3505     0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3506     0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3507     0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3508     0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3509     0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3510     0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3511     0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3512     0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3513     0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3514     0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3515 
3516 //---------------------------------------------------------------------------
3517 //  __kmp_get_random: Get a random number using a linear congruential method.
3518 unsigned short __kmp_get_random(kmp_info_t *thread) {
3519   unsigned x = thread->th.th_x;
3520   unsigned short r = x >> 16;
3521 
3522   thread->th.th_x = x * thread->th.th_a + 1;
3523 
3524   KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3525                 thread->th.th_info.ds.ds_tid, r));
3526 
3527   return r;
3528 }
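
// The recurrence is a plain linear congruential generator (comment-only note):
//
//   x_{n+1} = a * x_n + 1   (unsigned overflow, i.e. mod 2^32 here),
//   a = per-thread entry from __kmp_primes
//
// and each call returns the high 16 bits of the state prior to the update.
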
3529 //--------------------------------------------------------
3530 // __kmp_init_random: Initialize a random number generator
3531 void __kmp_init_random(kmp_info_t *thread) {
3532   unsigned seed = thread->th.th_info.ds.ds_tid;
3533 
3534   thread->th.th_a =
3535       __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3536   thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3537   KA_TRACE(30,
3538            ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3539 }
3540 
3541 #if KMP_OS_WINDOWS
3542 /* reclaim array entries for root threads that are already dead, returns number
3543  * reclaimed */
3544 static int __kmp_reclaim_dead_roots(void) {
3545   int i, r = 0;
3546 
3547   for (i = 0; i < __kmp_threads_capacity; ++i) {
3548     if (KMP_UBER_GTID(i) &&
3549         !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3550         !__kmp_root[i]
3551              ->r.r_active) { // AC: reclaim only roots died in non-active state
3552       r += __kmp_unregister_root_other_thread(i);
3553     }
3554   }
3555   return r;
3556 }
3557 #endif
3558 
3559 /* This function attempts to create free entries in __kmp_threads and
3560    __kmp_root, and returns the number of free entries generated.
3561 
3562    For Windows* OS static library, the first mechanism used is to reclaim array
3563    entries for root threads that are already dead.
3564 
3565    On all platforms, expansion is attempted on the arrays __kmp_threads and
3566    __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3567    capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3568    threadprivate cache array has been created. Synchronization with
3569    __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3570 
3571    After any dead root reclamation, if the clipping value allows array expansion
3572    to result in the generation of a total of nNeed free slots, the function does
3573    that expansion. If not, nothing is done beyond the possible initial root
3574    thread reclamation.
3575 
3576    If any argument is negative, the behavior is undefined. */
3577 static int __kmp_expand_threads(int nNeed) {
3578   int added = 0;
3579   int minimumRequiredCapacity;
3580   int newCapacity;
3581   kmp_info_t **newThreads;
3582   kmp_root_t **newRoot;
3583 
3584 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3585 // resizing __kmp_threads does not need additional protection if foreign
3586 // threads are present
3587 
3588 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3589   /* only for Windows static library */
3590   /* reclaim array entries for root threads that are already dead */
3591   added = __kmp_reclaim_dead_roots();
3592 
3593   if (nNeed) {
3594     nNeed -= added;
3595     if (nNeed < 0)
3596       nNeed = 0;
3597   }
3598 #endif
3599   if (nNeed <= 0)
3600     return added;
3601 
3602   // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3603   // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3604   // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3605   // > __kmp_max_nth in one of two ways:
3606   //
3607   // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
3608   //    may not be reused by another thread, so we may need to increase
3609   //    __kmp_threads_capacity to __kmp_max_nth + 1.
3610   //
3611   // 2) New foreign root(s) are encountered.  We always register new foreign
3612   //    roots. This may cause a smaller # of threads to be allocated at
3613   //    subsequent parallel regions, but the worker threads hang around (and
3614   //    eventually go to sleep) and need slots in the __kmp_threads[] array.
3615   //
3616   // Anyway, that is the reason for moving the check to see if
3617   // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3618   // instead of having it performed here. -BB
3619 
3620   KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3621 
3622   /* compute expansion headroom to check if we can expand */
3623   if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3624     /* possible expansion too small -- give up */
3625     return added;
3626   }
3627   minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3628 
3629   newCapacity = __kmp_threads_capacity;
3630   do {
3631     newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3632                                                           : __kmp_sys_max_nth;
3633   } while (newCapacity < minimumRequiredCapacity);
3634   newThreads = (kmp_info_t **)__kmp_allocate(
3635       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3636   newRoot =
3637       (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3638   KMP_MEMCPY(newThreads, __kmp_threads,
3639              __kmp_threads_capacity * sizeof(kmp_info_t *));
3640   KMP_MEMCPY(newRoot, __kmp_root,
3641              __kmp_threads_capacity * sizeof(kmp_root_t *));
3642 
3643   kmp_info_t **temp_threads = __kmp_threads;
3644   *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3645   *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3646   __kmp_free(temp_threads);
3647   added += newCapacity - __kmp_threads_capacity;
3648   *(volatile int *)&__kmp_threads_capacity = newCapacity;
3649 
3650   if (newCapacity > __kmp_tp_capacity) {
3651     __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3652     if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3653       __kmp_threadprivate_resize_cache(newCapacity);
3654     } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3655       *(volatile int *)&__kmp_tp_capacity = newCapacity;
3656     }
3657     __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3658   }
3659 
3660   return added;
3661 }
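
// Capacity growth sketch (comment-only): starting from the current capacity,
// the candidate doubles until it covers (capacity + nNeed), clipping at
// __kmp_sys_max_nth. For example, capacity 64 with nNeed 70 requires 134, so
// 64 -> 128 (still short) -> 256.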
3662 
3663 /* Register the current thread as a root thread and obtain our gtid. We must
3664    have the __kmp_initz_lock held at this point. The argument is TRUE only if
3665    we are the thread that calls this from __kmp_do_serial_initialize(). */
3666 int __kmp_register_root(int initial_thread) {
3667   kmp_info_t *root_thread;
3668   kmp_root_t *root;
3669   int gtid;
3670   int capacity;
3671   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3672   KA_TRACE(20, ("__kmp_register_root: entered\n"));
3673   KMP_MB();
3674 
3675   /* 2007-03-02:
3676      If initial thread did not invoke OpenMP RTL yet, and this thread is not an
3677      initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not
3678      work as expected -- it may return false (that means there is at least one
3679      empty slot in __kmp_threads array), but it is possible the only free slot
3680      is #0, which is reserved for initial thread and so cannot be used for this
3681      one. The following code works around this bug.
3682 
3683      However, right solution seems to be not reserving slot #0 for initial
3684      thread because:
3685      (1) there is no magic in slot #0,
3686      (2) we cannot detect initial thread reliably (the first thread which does
3687         serial initialization may not be the real initial thread).
3688   */
3689   capacity = __kmp_threads_capacity;
3690   if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3691     --capacity;
3692   }
3693 
3694   /* see if there are too many threads */
3695   if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3696     if (__kmp_tp_cached) {
3697       __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3698                   KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3699                   KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3700     } else {
3701       __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3702                   __kmp_msg_null);
3703     }
3704   }
3705 
3706   /* find an available thread slot */
3707   /* Don't reassign the zero slot since we need that to only be used by initial
3708      thread */
3709   for (gtid = (initial_thread ? 0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL;
3710        gtid++)
3711     ;
3712   KA_TRACE(1,
3713            ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3714   KMP_ASSERT(gtid < __kmp_threads_capacity);
3715 
3716   /* update global accounting */
3717   __kmp_all_nth++;
3718   TCW_4(__kmp_nth, __kmp_nth + 1);
3719 
3720   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3721   // numbers of procs, and method #2 (keyed API call) for higher numbers.
3722   if (__kmp_adjust_gtid_mode) {
3723     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3724       if (TCR_4(__kmp_gtid_mode) != 2) {
3725         TCW_4(__kmp_gtid_mode, 2);
3726       }
3727     } else {
3728       if (TCR_4(__kmp_gtid_mode) != 1) {
3729         TCW_4(__kmp_gtid_mode, 1);
3730       }
3731     }
3732   }
3733 
3734 #ifdef KMP_ADJUST_BLOCKTIME
3735   /* Adjust blocktime to zero if necessary            */
3736   /* Middle initialization might not have occurred yet */
3737   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3738     if (__kmp_nth > __kmp_avail_proc) {
3739       __kmp_zero_bt = TRUE;
3740     }
3741   }
3742 #endif /* KMP_ADJUST_BLOCKTIME */
3743 
3744   /* setup this new hierarchy */
3745   if (!(root = __kmp_root[gtid])) {
3746     root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3747     KMP_DEBUG_ASSERT(!root->r.r_root_team);
3748   }
3749 
3750 #if KMP_STATS_ENABLED
3751   // Initialize stats as soon as possible (right after gtid assignment).
3752   __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3753   __kmp_stats_thread_ptr->startLife();
3754   KMP_SET_THREAD_STATE(SERIAL_REGION);
3755   KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3756 #endif
3757   __kmp_initialize_root(root);
3758 
3759   /* setup new root thread structure */
3760   if (root->r.r_uber_thread) {
3761     root_thread = root->r.r_uber_thread;
3762   } else {
3763     root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3764     if (__kmp_storage_map) {
3765       __kmp_print_thread_storage_map(root_thread, gtid);
3766     }
3767     root_thread->th.th_info.ds.ds_gtid = gtid;
3768 #if OMPT_SUPPORT
3769     root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3770 #endif
3771     root_thread->th.th_root = root;
3772     if (__kmp_env_consistency_check) {
3773       root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3774     }
3775 #if USE_FAST_MEMORY
3776     __kmp_initialize_fast_memory(root_thread);
3777 #endif /* USE_FAST_MEMORY */
3778 
3779 #if KMP_USE_BGET
3780     KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3781     __kmp_initialize_bget(root_thread);
3782 #endif
3783     __kmp_init_random(root_thread); // Initialize random number generator
3784   }
3785 
3786   /* setup the serial team held in reserve by the root thread */
3787   if (!root_thread->th.th_serial_team) {
3788     kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3789     KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3790     root_thread->th.th_serial_team =
3791         __kmp_allocate_team(root, 1, 1,
3792 #if OMPT_SUPPORT
3793                             ompt_data_none, // root parallel id
3794 #endif
3795 #if OMP_40_ENABLED
3796                             proc_bind_default,
3797 #endif
3798                             &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3799   }
3800   KMP_ASSERT(root_thread->th.th_serial_team);
3801   KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3802                 root_thread->th.th_serial_team));
3803 
3804   /* drop root_thread into place */
3805   TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3806 
3807   root->r.r_root_team->t.t_threads[0] = root_thread;
3808   root->r.r_hot_team->t.t_threads[0] = root_thread;
3809   root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3810   // AC: the team is created in reserve, not for execution (it is unused for now).
3811   root_thread->th.th_serial_team->t.t_serialized = 0;
3812   root->r.r_uber_thread = root_thread;
3813 
3814   /* initialize the thread, get it ready to go */
3815   __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3816   TCW_4(__kmp_init_gtid, TRUE);
3817 
3818   /* prepare the master thread for get_gtid() */
3819   __kmp_gtid_set_specific(gtid);
3820 
3821 #if USE_ITT_BUILD
3822   __kmp_itt_thread_name(gtid);
3823 #endif /* USE_ITT_BUILD */
3824 
3825 #ifdef KMP_TDATA_GTID
3826   __kmp_gtid = gtid;
3827 #endif
3828   __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3829   KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3830 
3831   KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3832                 "plain=%u\n",
3833                 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3834                 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3835                 KMP_INIT_BARRIER_STATE));
3836   { // Initialize barrier data.
3837     int b;
3838     for (b = 0; b < bs_last_barrier; ++b) {
3839       root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3840 #if USE_DEBUGGER
3841       root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3842 #endif
3843     }
3844   }
3845   KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3846                    KMP_INIT_BARRIER_STATE);
3847 
3848 #if KMP_AFFINITY_SUPPORTED
3849 #if OMP_40_ENABLED
3850   root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3851   root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3852   root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3853   root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3854 #endif
3855   if (TCR_4(__kmp_init_middle)) {
3856     __kmp_affinity_set_init_mask(gtid, TRUE);
3857   }
3858 #endif /* KMP_AFFINITY_SUPPORTED */
3859 #if OMP_50_ENABLED
3860   root_thread->th.th_def_allocator = __kmp_def_allocator;
3861   root_thread->th.th_prev_level = 0;
3862   root_thread->th.th_prev_num_threads = 1;
3863 #endif
3864 
3865   __kmp_root_counter++;
3866 
3867 #if OMPT_SUPPORT
3868   if (!initial_thread && ompt_enabled.enabled) {
3869 
3870     kmp_info_t *root_thread = ompt_get_thread();
3871 
3872     ompt_set_thread_state(root_thread, ompt_state_overhead);
3873 
3874     if (ompt_enabled.ompt_callback_thread_begin) {
3875       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3876           ompt_thread_initial, __ompt_get_thread_data_internal());
3877     }
3878     ompt_data_t *task_data;
3879     __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
3880     if (ompt_enabled.ompt_callback_task_create) {
3881       ompt_callbacks.ompt_callback(ompt_callback_task_create)(
3882           NULL, NULL, task_data, ompt_task_initial, 0, NULL);
3883       // initial task has nothing to return to
3884     }
3885 
3886     ompt_set_thread_state(root_thread, ompt_state_work_serial);
3887   }
3888 #endif
3889 
3890   KMP_MB();
3891   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3892 
3893   return gtid;
3894 }
3895 
3896 #if KMP_NESTED_HOT_TEAMS
3897 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3898                                 const int max_level) {
3899   int i, n, nth;
3900   kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3901   if (!hot_teams || !hot_teams[level].hot_team) {
3902     return 0;
3903   }
3904   KMP_DEBUG_ASSERT(level < max_level);
3905   kmp_team_t *team = hot_teams[level].hot_team;
3906   nth = hot_teams[level].hot_team_nth;
3907   n = nth - 1; // master is not freed
3908   if (level < max_level - 1) {
3909     for (i = 0; i < nth; ++i) {
3910       kmp_info_t *th = team->t.t_threads[i];
3911       n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3912       if (i > 0 && th->th.th_hot_teams) {
3913         __kmp_free(th->th.th_hot_teams);
3914         th->th.th_hot_teams = NULL;
3915       }
3916     }
3917   }
3918   __kmp_free_team(root, team, NULL);
3919   return n;
3920 }
3921 #endif
3922 
3923 // Resets a root thread and clears its root and hot teams.
3924 // Returns the number of __kmp_threads entries directly and indirectly freed.
3925 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3926   kmp_team_t *root_team = root->r.r_root_team;
3927   kmp_team_t *hot_team = root->r.r_hot_team;
3928   int n = hot_team->t.t_nproc;
3929   int i;
3930 
3931   KMP_DEBUG_ASSERT(!root->r.r_active);
3932 
3933   root->r.r_root_team = NULL;
3934   root->r.r_hot_team = NULL;
3935   // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3936   // before call to __kmp_free_team().
3937   __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3938 #if KMP_NESTED_HOT_TEAMS
3939   if (__kmp_hot_teams_max_level >
3940       0) { // need to free nested hot teams and their threads if any
3941     for (i = 0; i < hot_team->t.t_nproc; ++i) {
3942       kmp_info_t *th = hot_team->t.t_threads[i];
3943       if (__kmp_hot_teams_max_level > 1) {
3944         n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3945       }
3946       if (th->th.th_hot_teams) {
3947         __kmp_free(th->th.th_hot_teams);
3948         th->th.th_hot_teams = NULL;
3949       }
3950     }
3951   }
3952 #endif
3953   __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3954 
3955   // Before we can reap the thread, we need to make certain that all other
3956   // threads in the teams that had this root as ancestor have stopped trying to
3957   // steal tasks.
3958   if (__kmp_tasking_mode != tskm_immediate_exec) {
3959     __kmp_wait_to_unref_task_teams();
3960   }
3961 
3962 #if KMP_OS_WINDOWS
3963   /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3964   KA_TRACE(
3965       10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3966            "\n",
3967            (LPVOID) & (root->r.r_uber_thread->th),
3968            root->r.r_uber_thread->th.th_info.ds.ds_thread));
3969   __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3970 #endif /* KMP_OS_WINDOWS */
3971 
3972 #if OMPT_SUPPORT
3973   if (ompt_enabled.ompt_callback_thread_end) {
3974     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3975         &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3976   }
3977 #endif
3978 
3979   TCW_4(__kmp_nth,
3980         __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3981   root->r.r_cg_nthreads--;
3982 
3983   __kmp_reap_thread(root->r.r_uber_thread, 1);
3984 
3985   // We cannot put the root thread into __kmp_thread_pool, so we have to reap
3986   // it instead of freeing it.
3987   root->r.r_uber_thread = NULL;
3988   /* mark root as no longer in use */
3989   root->r.r_begin = FALSE;
3990 
3991   return n;
3992 }
3993 
3994 void __kmp_unregister_root_current_thread(int gtid) {
3995   KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
3996   /* this lock should be ok, since unregister_root_current_thread is never
3997      called during an abort, only during a normal close. furthermore, if you
3998      have the forkjoin lock, you should never try to get the initz lock */
3999   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
4000   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
4001     KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
4002                   "exiting T#%d\n",
4003                   gtid));
4004     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4005     return;
4006   }
4007   kmp_root_t *root = __kmp_root[gtid];
4008 
4009   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4010   KMP_ASSERT(KMP_UBER_GTID(gtid));
4011   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4012   KMP_ASSERT(root->r.r_active == FALSE);
4013 
4014   KMP_MB();
4015 
4016 #if OMP_45_ENABLED
4017   kmp_info_t *thread = __kmp_threads[gtid];
4018   kmp_team_t *team = thread->th.th_team;
4019   kmp_task_team_t *task_team = thread->th.th_task_team;
4020 
4021   // we need to wait for the proxy tasks before finishing the thread
4022   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
4023 #if OMPT_SUPPORT
4024     // the runtime is shutting down so we won't report any events
4025     thread->th.ompt_thread_info.state = ompt_state_undefined;
4026 #endif
4027     __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4028   }
4029 #endif
4030 
4031   __kmp_reset_root(gtid, root);
4032 
4033   /* free up this thread slot */
4034   __kmp_gtid_set_specific(KMP_GTID_DNE);
4035 #ifdef KMP_TDATA_GTID
4036   __kmp_gtid = KMP_GTID_DNE;
4037 #endif
4038 
4039   KMP_MB();
4040   KC_TRACE(10,
4041            ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4042 
4043   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4044 }
4045 
4046 #if KMP_OS_WINDOWS
4047 /* __kmp_forkjoin_lock must be already held
4048    Unregisters a root thread that is not the current thread.  Returns the number
4049    of __kmp_threads entries freed as a result. */
4050 static int __kmp_unregister_root_other_thread(int gtid) {
4051   kmp_root_t *root = __kmp_root[gtid];
4052   int r;
4053 
4054   KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4055   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4056   KMP_ASSERT(KMP_UBER_GTID(gtid));
4057   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4058   KMP_ASSERT(root->r.r_active == FALSE);
4059 
4060   r = __kmp_reset_root(gtid, root);
4061   KC_TRACE(10,
4062            ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4063   return r;
4064 }
4065 #endif
4066 
4067 #if KMP_DEBUG
4068 void __kmp_task_info() {
4069 
4070   kmp_int32 gtid = __kmp_entry_gtid();
4071   kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4072   kmp_info_t *this_thr = __kmp_threads[gtid];
4073   kmp_team_t *steam = this_thr->th.th_serial_team;
4074   kmp_team_t *team = this_thr->th.th_team;
4075 
4076   __kmp_printf(
4077       "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4078       "ptask=%p\n",
4079       gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4080       team->t.t_implicit_task_taskdata[tid].td_parent);
4081 }
4082 #endif // KMP_DEBUG
4083 
4084 /* TODO optimize with one big memclr, take out what isn't needed, split
4085    responsibility to workers as much as possible, and delay initialization of
4086    features as much as possible  */
4087 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4088                                   int tid, int gtid) {
4089   /* this_thr->th.th_info.ds.ds_gtid is set up in
4090      kmp_allocate_thread/create_worker.
4091      this_thr->th.th_serial_team is set up in __kmp_allocate_thread */
4092   kmp_info_t *master = team->t.t_threads[0];
4093   KMP_DEBUG_ASSERT(this_thr != NULL);
4094   KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4095   KMP_DEBUG_ASSERT(team);
4096   KMP_DEBUG_ASSERT(team->t.t_threads);
4097   KMP_DEBUG_ASSERT(team->t.t_dispatch);
4098   KMP_DEBUG_ASSERT(master);
4099   KMP_DEBUG_ASSERT(master->th.th_root);
4100 
4101   KMP_MB();
4102 
4103   TCW_SYNC_PTR(this_thr->th.th_team, team);
4104 
4105   this_thr->th.th_info.ds.ds_tid = tid;
4106   this_thr->th.th_set_nproc = 0;
4107   if (__kmp_tasking_mode != tskm_immediate_exec)
4108     // When tasking is possible, threads are not safe to reap until they are
4109     // done tasking; this will be set when tasking code is exited in wait
4110     this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4111   else // no tasking --> always safe to reap
4112     this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4113 #if OMP_40_ENABLED
4114   this_thr->th.th_set_proc_bind = proc_bind_default;
4115 #if KMP_AFFINITY_SUPPORTED
4116   this_thr->th.th_new_place = this_thr->th.th_current_place;
4117 #endif
4118 #endif
4119   this_thr->th.th_root = master->th.th_root;
4120 
4121   /* setup the thread's cache of the team structure */
4122   this_thr->th.th_team_nproc = team->t.t_nproc;
4123   this_thr->th.th_team_master = master;
4124   this_thr->th.th_team_serialized = team->t.t_serialized;
4125   TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4126 
4127   KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4128 
4129   KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4130                 tid, gtid, this_thr, this_thr->th.th_current_task));
4131 
4132   __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4133                            team, tid, TRUE);
4134 
4135   KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4136                 tid, gtid, this_thr, this_thr->th.th_current_task));
4137   // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4138   // __kmp_initialize_team()?
4139 
4140   /* TODO no worksharing in speculative threads */
4141   this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4142 
4143   this_thr->th.th_local.this_construct = 0;
4144 
4145   if (!this_thr->th.th_pri_common) {
4146     this_thr->th.th_pri_common =
4147         (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4148     if (__kmp_storage_map) {
4149       __kmp_print_storage_map_gtid(
4150           gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4151           sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4152     }
4153     this_thr->th.th_pri_head = NULL;
4154   }
4155 
4156   /* Initialize dynamic dispatch */
4157   {
4158     volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4159     // Use team max_nproc since this will never change for the team.
4160     size_t disp_size =
4161         sizeof(dispatch_private_info_t) *
4162         (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4163     KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4164                   team->t.t_max_nproc));
4165     KMP_ASSERT(dispatch);
4166     KMP_DEBUG_ASSERT(team->t.t_dispatch);
4167     KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4168 
4169     dispatch->th_disp_index = 0;
4170 #if OMP_45_ENABLED
4171     dispatch->th_doacross_buf_idx = 0;
4172 #endif
4173     if (!dispatch->th_disp_buffer) {
4174       dispatch->th_disp_buffer =
4175           (dispatch_private_info_t *)__kmp_allocate(disp_size);
4176 
4177       if (__kmp_storage_map) {
4178         __kmp_print_storage_map_gtid(
4179             gtid, &dispatch->th_disp_buffer[0],
4180             &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4181                                           ? 1
4182                                           : __kmp_dispatch_num_buffers],
4183             disp_size, "th_%d.th_dispatch.th_disp_buffer "
4184                        "(team_%d.t_dispatch[%d].th_disp_buffer)",
4185             gtid, team->t.t_id, gtid);
4186       }
4187     } else {
4188       memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4189     }
4190 
4191     dispatch->th_dispatch_pr_current = 0;
4192     dispatch->th_dispatch_sh_current = 0;
4193 
4194     dispatch->th_deo_fcn = 0; /* ORDERED     */
4195     dispatch->th_dxo_fcn = 0; /* END ORDERED */
4196   }
4197 
4198   this_thr->th.th_next_pool = NULL;
4199 
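  // The memo stack below records th_task_state per nesting level so the proper
  // value can be restored for threads joining a nested hot team (see the
  // th_task_state handling in __kmp_allocate_team).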
4200   if (!this_thr->th.th_task_state_memo_stack) {
4201     size_t i;
4202     this_thr->th.th_task_state_memo_stack =
4203         (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4204     this_thr->th.th_task_state_top = 0;
4205     this_thr->th.th_task_state_stack_sz = 4;
4206     for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4207          ++i) // zero init the stack
4208       this_thr->th.th_task_state_memo_stack[i] = 0;
4209   }
4210 
4211   KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4212   KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4213 
4214   KMP_MB();
4215 }
4216 
4217 /* allocate a new thread for the requesting team. this is only called from
4218    within a forkjoin critical section. we will first try to get an available
4219    thread from the thread pool. if none is available, we will fork a new one,
4220    assuming we are able to create one; this should be assured, as the caller
4221    is expected to have checked that first. */
4222 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4223                                   int new_tid) {
4224   kmp_team_t *serial_team;
4225   kmp_info_t *new_thr;
4226   int new_gtid;
4227 
4228   KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4229   KMP_DEBUG_ASSERT(root && team);
4230 #if !KMP_NESTED_HOT_TEAMS
4231   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4232 #endif
4233   KMP_MB();
4234 
4235   /* first, try to get one from the thread pool */
4236   if (__kmp_thread_pool) {
4237 
4238     new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4239     __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4240     if (new_thr == __kmp_thread_pool_insert_pt) {
4241       __kmp_thread_pool_insert_pt = NULL;
4242     }
4243     TCW_4(new_thr->th.th_in_pool, FALSE);
4244     // Don't touch th_active_in_pool or th_active.
4245     // The worker thread adjusts those flags as it sleeps/awakens.
4246     __kmp_thread_pool_nth--;
4247 
4248     KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4249                   __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4250     KMP_ASSERT(!new_thr->th.th_team);
4251     KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4252     KMP_DEBUG_ASSERT(__kmp_thread_pool_nth >= 0);
4253 
4254     /* setup the thread structure */
4255     __kmp_initialize_info(new_thr, team, new_tid,
4256                           new_thr->th.th_info.ds.ds_gtid);
4257     KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4258 
4259     TCW_4(__kmp_nth, __kmp_nth + 1);
4260     root->r.r_cg_nthreads++;
4261 
4262     new_thr->th.th_task_state = 0;
4263     new_thr->th.th_task_state_top = 0;
4264     new_thr->th.th_task_state_stack_sz = 4;
4265 
4266 #ifdef KMP_ADJUST_BLOCKTIME
4267     /* Adjust blocktime back to zero if necessary */
4268     /* Middle initialization might not have occurred yet */
4269     if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4270       if (__kmp_nth > __kmp_avail_proc) {
4271         __kmp_zero_bt = TRUE;
4272       }
4273     }
4274 #endif /* KMP_ADJUST_BLOCKTIME */
4275 
4276 #if KMP_DEBUG
4277     // If thread entered pool via __kmp_free_thread, wait_flag should !=
4278     // KMP_BARRIER_PARENT_FLAG.
4279     int b;
4280     kmp_balign_t *balign = new_thr->th.th_bar;
4281     for (b = 0; b < bs_last_barrier; ++b)
4282       KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4283 #endif
4284 
4285     KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4286                   __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4287 
4288     KMP_MB();
4289     return new_thr;
4290   }
4291 
4292   /* no thread available in the pool, so fork a new one */
4293   KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4294   KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4295 
4296 #if KMP_USE_MONITOR
4297   // If this is the first worker thread the RTL is creating, then also
4298   // launch the monitor thread.  We try to do this as early as possible.
4299   if (!TCR_4(__kmp_init_monitor)) {
4300     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4301     if (!TCR_4(__kmp_init_monitor)) {
4302       KF_TRACE(10, ("before __kmp_create_monitor\n"));
4303       TCW_4(__kmp_init_monitor, 1);
4304       __kmp_create_monitor(&__kmp_monitor);
4305       KF_TRACE(10, ("after __kmp_create_monitor\n"));
4306 #if KMP_OS_WINDOWS
4307       // AC: wait until monitor has started. This is a fix for CQ232808.
4308       // The reason is that if the library is loaded/unloaded in a loop with
4309       // small (parallel) work in between, there is a high probability that the
4310       // monitor thread has not started by the time the library shuts down. At
4311       // shutdown it is too late to cope with the problem, because while the
4312       // master is in DllMain (process detach) the monitor has no chance to
4313       // start (it is blocked), and the master has no means to inform the
4314       // monitor that the library has gone, because all the memory the monitor
4315       // can access is going to be released/reset.
4316       while (TCR_4(__kmp_init_monitor) < 2) {
4317         KMP_YIELD(TRUE);
4318       }
4319       KF_TRACE(10, ("after monitor thread has started\n"));
4320 #endif
4321     }
4322     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4323   }
4324 #endif
4325 
4326   KMP_MB();
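  // Find the first unused gtid slot; slot 0 belongs to the initial root
  // thread, so the search starts at 1.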
4327   for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) {
4328     KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4329   }
4330 
4331   /* allocate space for it. */
4332   new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4333 
4334   TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4335 
4336   if (__kmp_storage_map) {
4337     __kmp_print_thread_storage_map(new_thr, new_gtid);
4338   }
4339 
4340   // add the reserve serialized team, initialized from the team's master thread
4341   {
4342     kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4343     KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4344     new_thr->th.th_serial_team = serial_team =
4345         (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4346 #if OMPT_SUPPORT
4347                                           ompt_data_none, // root parallel id
4348 #endif
4349 #if OMP_40_ENABLED
4350                                           proc_bind_default,
4351 #endif
4352                                           &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
4353   }
4354   KMP_ASSERT(serial_team);
4355   serial_team->t.t_serialized = 0; // AC: the team is created in reserve, not
4356   // for execution (it is unused for now).
4357   serial_team->t.t_threads[0] = new_thr;
4358   KF_TRACE(10,
4359            ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4360             new_thr));
4361 
4362   /* setup the thread structures */
4363   __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4364 
4365 #if USE_FAST_MEMORY
4366   __kmp_initialize_fast_memory(new_thr);
4367 #endif /* USE_FAST_MEMORY */
4368 
4369 #if KMP_USE_BGET
4370   KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4371   __kmp_initialize_bget(new_thr);
4372 #endif
4373 
4374   __kmp_init_random(new_thr); // Initialize random number generator
4375 
4376   /* Initialize these only once when thread is grabbed for a team allocation */
4377   KA_TRACE(20,
4378            ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4379             __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4380 
4381   int b;
4382   kmp_balign_t *balign = new_thr->th.th_bar;
4383   for (b = 0; b < bs_last_barrier; ++b) {
4384     balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4385     balign[b].bb.team = NULL;
4386     balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4387     balign[b].bb.use_oncore_barrier = 0;
4388   }
4389 
4390   new_thr->th.th_spin_here = FALSE;
4391   new_thr->th.th_next_waiting = 0;
4392 #if KMP_OS_UNIX
4393   new_thr->th.th_blocking = false;
4394 #endif
4395 
4396 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4397   new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4398   new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4399   new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4400   new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4401 #endif
4402 #if OMP_50_ENABLED
4403   new_thr->th.th_def_allocator = __kmp_def_allocator;
4404   new_thr->th.th_prev_level = 0;
4405   new_thr->th.th_prev_num_threads = 1;
4406 #endif
4407 
4408   TCW_4(new_thr->th.th_in_pool, FALSE);
4409   new_thr->th.th_active_in_pool = FALSE;
4410   TCW_4(new_thr->th.th_active, TRUE);
4411 
4412   /* adjust the global counters */
4413   __kmp_all_nth++;
4414   __kmp_nth++;
4415 
4416   root->r.r_cg_nthreads++;
4417 
4418   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4419   // numbers of procs, and method #2 (keyed API call) for higher numbers.
4420   if (__kmp_adjust_gtid_mode) {
4421     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4422       if (TCR_4(__kmp_gtid_mode) != 2) {
4423         TCW_4(__kmp_gtid_mode, 2);
4424       }
4425     } else {
4426       if (TCR_4(__kmp_gtid_mode) != 1) {
4427         TCW_4(__kmp_gtid_mode, 1);
4428       }
4429     }
4430   }
4431 
4432 #ifdef KMP_ADJUST_BLOCKTIME
4433   /* Adjust blocktime back to zero if necessary       */
4434   /* Middle initialization might not have occurred yet */
4435   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4436     if (__kmp_nth > __kmp_avail_proc) {
4437       __kmp_zero_bt = TRUE;
4438     }
4439   }
4440 #endif /* KMP_ADJUST_BLOCKTIME */
4441 
4442   /* actually fork it and create the new worker thread */
4443   KF_TRACE(
4444       10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4445   __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4446   KF_TRACE(10,
4447            ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4448 
4449   KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4450                 new_gtid));
4451   KMP_MB();
4452   return new_thr;
4453 }
4454 
4455 /* Reinitialize team for reuse.
4456    The hot team code calls this routine at every fork barrier, so the EPCC
4457    barrier tests are extremely sensitive to changes in it, esp. writes to the
4458    team struct, which cause a cache invalidation in all threads.
4459    IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4460 static void __kmp_reinitialize_team(kmp_team_t *team,
4461                                     kmp_internal_control_t *new_icvs,
4462                                     ident_t *loc) {
4463   KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4464                 team->t.t_threads[0], team));
4465   KMP_DEBUG_ASSERT(team && new_icvs);
4466   KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4467   KMP_CHECK_UPDATE(team->t.t_ident, loc);
4468 
4469   KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4470   // Copy ICVs to the master thread's implicit taskdata
4471   __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4472   copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4473 
4474   KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4475                 team->t.t_threads[0], team));
4476 }
4477 
4478 /* Initialize the team data structure.
4479    This assumes the t_threads and t_max_nproc are already set.
4480    Also, we don't touch the arguments */
4481 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4482                                   kmp_internal_control_t *new_icvs,
4483                                   ident_t *loc) {
4484   KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4485 
4486   /* verify */
4487   KMP_DEBUG_ASSERT(team);
4488   KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4489   KMP_DEBUG_ASSERT(team->t.t_threads);
4490   KMP_MB();
4491 
4492   team->t.t_master_tid = 0; /* not needed */
4493   /* team->t.t_master_bar;        not needed */
4494   team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4495   team->t.t_nproc = new_nproc;
4496 
4497   /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
4498   team->t.t_next_pool = NULL;
4499   /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4500    * up hot team */
4501 
4502   TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4503   team->t.t_invoke = NULL; /* not needed */
4504 
4505   // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
4506   team->t.t_sched.sched = new_icvs->sched.sched;
4507 
4508 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4509   team->t.t_fp_control_saved = FALSE; /* not needed */
4510   team->t.t_x87_fpu_control_word = 0; /* not needed */
4511   team->t.t_mxcsr = 0; /* not needed */
4512 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4513 
4514   team->t.t_construct = 0;
4515 
4516   team->t.t_ordered.dt.t_value = 0;
4517   team->t.t_master_active = FALSE;
4518 
4519   memset(&team->t.t_taskq, '\0', sizeof(kmp_taskq_t));
4520 
4521 #ifdef KMP_DEBUG
4522   team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4523 #endif
4524 #if KMP_OS_WINDOWS
4525   team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4526 #endif
4527 
4528   team->t.t_control_stack_top = NULL;
4529 
4530   __kmp_reinitialize_team(team, new_icvs, loc);
4531 
4532   KMP_MB();
4533   KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4534 }
4535 
4536 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4537 /* Sets full mask for thread and returns old mask, no changes to structures. */
4538 static void
4539 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4540   if (KMP_AFFINITY_CAPABLE()) {
4541     int status;
4542     if (old_mask != NULL) {
4543       status = __kmp_get_system_affinity(old_mask, TRUE);
4544       int error = errno;
4545       if (status != 0) {
4546         __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4547                     __kmp_msg_null);
4548       }
4549     }
4550     __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4551   }
4552 }
4553 #endif
4554 
4555 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4556 
4557 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4558 // It calculates the worker + master threads' partition based upon the parent
4559 // thread's partition, and binds each worker to a place in its partition.
4560 // The master thread's partition should already include its current binding.
4561 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4562   // Copy the master thread's place partition to the team struct
4563   kmp_info_t *master_th = team->t.t_threads[0];
4564   KMP_DEBUG_ASSERT(master_th != NULL);
4565   kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4566   int first_place = master_th->th.th_first_place;
4567   int last_place = master_th->th.th_last_place;
4568   int masters_place = master_th->th.th_current_place;
4569   team->t.t_first_place = first_place;
4570   team->t.t_last_place = last_place;
4571 
4572   KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4573                 "bound to place %d partition = [%d,%d]\n",
4574                 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4575                 team->t.t_id, masters_place, first_place, last_place));
4576 
4577   switch (proc_bind) {
4578 
4579   case proc_bind_default:
4580     // serial teams might have the proc_bind policy set to proc_bind_default. It
4581     // doesn't matter, as we don't rebind master thread for any proc_bind policy
4582     KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4583     break;
4584 
4585   case proc_bind_master: {
4586     int f;
4587     int n_th = team->t.t_nproc;
4588     for (f = 1; f < n_th; f++) {
4589       kmp_info_t *th = team->t.t_threads[f];
4590       KMP_DEBUG_ASSERT(th != NULL);
4591       th->th.th_first_place = first_place;
4592       th->th.th_last_place = last_place;
4593       th->th.th_new_place = masters_place;
4594 #if OMP_50_ENABLED
4595       if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4596           team->t.t_display_affinity != 1) {
4597         team->t.t_display_affinity = 1;
4598       }
4599 #endif
4600 
4601       KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
4602                      "partition = [%d,%d]\n",
4603                      __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4604                      f, masters_place, first_place, last_place));
4605     }
4606   } break;
4607 
4608   case proc_bind_close: {
4609     int f;
4610     int n_th = team->t.t_nproc;
4611     int n_places;
4612     if (first_place <= last_place) {
4613       n_places = last_place - first_place + 1;
4614     } else {
4615       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4616     }
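    // first_place > last_place means the partition wraps around the end of the
    // place list, so count the places on both sides of the wrap.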
4617     if (n_th <= n_places) {
4618       int place = masters_place;
4619       for (f = 1; f < n_th; f++) {
4620         kmp_info_t *th = team->t.t_threads[f];
4621         KMP_DEBUG_ASSERT(th != NULL);
4622 
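        // Advance round-robin to the next place in the partition, wrapping
        // from the end of the global place list back to place 0.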
4623         if (place == last_place) {
4624           place = first_place;
4625         } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4626           place = 0;
4627         } else {
4628           place++;
4629         }
4630         th->th.th_first_place = first_place;
4631         th->th.th_last_place = last_place;
4632         th->th.th_new_place = place;
4633 #if OMP_50_ENABLED
4634         if (__kmp_display_affinity && place != th->th.th_current_place &&
4635             team->t.t_display_affinity != 1) {
4636           team->t.t_display_affinity = 1;
4637         }
4638 #endif
4639 
4640         KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4641                        "partition = [%d,%d]\n",
4642                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4643                        team->t.t_id, f, place, first_place, last_place));
4644       }
4645     } else {
4646       int S, rem, gap, s_count;
4647       S = n_th / n_places;
4648       s_count = 0;
4649       rem = n_th - (S * n_places);
4650       gap = rem > 0 ? n_places / rem : n_places;
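      // Each place gets S threads; the rem leftover threads are handed out one
      // per place at intervals of gap places. For example (illustration only):
      // n_th = 10, n_places = 4 gives S = 2, rem = 2, gap = 2, so the places
      // receive 3, 2, 3 and 2 threads respectively.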
4651       int place = masters_place;
4652       int gap_ct = gap;
4653       for (f = 0; f < n_th; f++) {
4654         kmp_info_t *th = team->t.t_threads[f];
4655         KMP_DEBUG_ASSERT(th != NULL);
4656 
4657         th->th.th_first_place = first_place;
4658         th->th.th_last_place = last_place;
4659         th->th.th_new_place = place;
4660 #if OMP_50_ENABLED
4661         if (__kmp_display_affinity && place != th->th.th_current_place &&
4662             team->t.t_display_affinity != 1) {
4663           team->t.t_display_affinity = 1;
4664         }
4665 #endif
4666         s_count++;
4667 
4668         if ((s_count == S) && rem && (gap_ct == gap)) {
4669           // do nothing, add an extra thread to place on next iteration
4670         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4671           // we added an extra thread to this place; move to next place
4672           if (place == last_place) {
4673             place = first_place;
4674           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4675             place = 0;
4676           } else {
4677             place++;
4678           }
4679           s_count = 0;
4680           gap_ct = 1;
4681           rem--;
4682         } else if (s_count == S) { // place full; don't add extra
4683           if (place == last_place) {
4684             place = first_place;
4685           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4686             place = 0;
4687           } else {
4688             place++;
4689           }
4690           gap_ct++;
4691           s_count = 0;
4692         }
4693 
4694         KA_TRACE(100,
4695                  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4696                   "partition = [%d,%d]\n",
4697                   __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4698                   th->th.th_new_place, first_place, last_place));
4699       }
4700       KMP_DEBUG_ASSERT(place == masters_place);
4701     }
4702   } break;
4703 
4704   case proc_bind_spread: {
4705     int f;
4706     int n_th = team->t.t_nproc;
4707     int n_places;
4708     int thidx;
4709     if (first_place <= last_place) {
4710       n_places = last_place - first_place + 1;
4711     } else {
4712       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4713     }
4714     if (n_th <= n_places) {
4715       int place = -1;
4716 
4717       if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4718         int S = n_places / n_th;
4719         int s_count, rem, gap, gap_ct;
4720 
4721         place = masters_place;
4722         rem = n_places - n_th * S;
4723         gap = rem ? n_th / rem : 1;
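        // Each thread gets a sub-partition of S consecutive places; while rem
        // leftover places remain, every gap-th thread absorbs one extra place.
        // E.g. (illustration only) n_places = 10, n_th = 4 gives S = 2,
        // rem = 2, gap = 2, so the sub-partitions span 3, 2, 3 and 2 places.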
4724         gap_ct = gap;
4725         thidx = n_th;
4726         if (update_master_only == 1)
4727           thidx = 1;
4728         for (f = 0; f < thidx; f++) {
4729           kmp_info_t *th = team->t.t_threads[f];
4730           KMP_DEBUG_ASSERT(th != NULL);
4731 
4732           th->th.th_first_place = place;
4733           th->th.th_new_place = place;
4734 #if OMP_50_ENABLED
4735           if (__kmp_display_affinity && place != th->th.th_current_place &&
4736               team->t.t_display_affinity != 1) {
4737             team->t.t_display_affinity = 1;
4738           }
4739 #endif
4740           s_count = 1;
4741           while (s_count < S) {
4742             if (place == last_place) {
4743               place = first_place;
4744             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4745               place = 0;
4746             } else {
4747               place++;
4748             }
4749             s_count++;
4750           }
4751           if (rem && (gap_ct == gap)) {
4752             if (place == last_place) {
4753               place = first_place;
4754             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4755               place = 0;
4756             } else {
4757               place++;
4758             }
4759             rem--;
4760             gap_ct = 0;
4761           }
4762           th->th.th_last_place = place;
4763           gap_ct++;
4764 
4765           if (place == last_place) {
4766             place = first_place;
4767           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4768             place = 0;
4769           } else {
4770             place++;
4771           }
4772 
4773           KA_TRACE(100,
4774                    ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4775                     "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4776                     __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4777                     f, th->th.th_new_place, th->th.th_first_place,
4778                     th->th.th_last_place, __kmp_affinity_num_masks));
4779         }
4780       } else {
4781         /* Given a uniform space of available computation places, create
4782            T partitions of roughly P/T places each and put each thread into
4783            the first place of its partition. */
4784         double current = static_cast<double>(masters_place);
4785         double spacing =
4786             (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
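        // Walk the place list in steps of 'spacing', giving each thread
        // roughly the places [current, current + spacing); e.g. (illustration
        // only) masters_place = 0, n_places = 8, n_th = 4 gives spacing 2.25
        // and partitions [0,1], [2,3], [4,5] and [6,7].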
4787         int first, last;
4788         kmp_info_t *th;
4789 
4790         thidx = n_th + 1;
4791         if (update_master_only == 1)
4792           thidx = 1;
4793         for (f = 0; f < thidx; f++) {
4794           first = static_cast<int>(current);
4795           last = static_cast<int>(current + spacing) - 1;
4796           KMP_DEBUG_ASSERT(last >= first);
4797           if (first >= n_places) {
4798             if (masters_place) {
4799               first -= n_places;
4800               last -= n_places;
4801               if (first == (masters_place + 1)) {
4802                 KMP_DEBUG_ASSERT(f == n_th);
4803                 first--;
4804               }
4805               if (last == masters_place) {
4806                 KMP_DEBUG_ASSERT(f == (n_th - 1));
4807                 last--;
4808               }
4809             } else {
4810               KMP_DEBUG_ASSERT(f == n_th);
4811               first = 0;
4812               last = 0;
4813             }
4814           }
4815           if (last >= n_places) {
4816             last = (n_places - 1);
4817           }
4818           place = first;
4819           current += spacing;
4820           if (f < n_th) {
4821             KMP_DEBUG_ASSERT(0 <= first);
4822             KMP_DEBUG_ASSERT(n_places > first);
4823             KMP_DEBUG_ASSERT(0 <= last);
4824             KMP_DEBUG_ASSERT(n_places > last);
4825             KMP_DEBUG_ASSERT(last_place >= first_place);
4826             th = team->t.t_threads[f];
4827             KMP_DEBUG_ASSERT(th);
4828             th->th.th_first_place = first;
4829             th->th.th_new_place = place;
4830             th->th.th_last_place = last;
4831 #if OMP_50_ENABLED
4832             if (__kmp_display_affinity && place != th->th.th_current_place &&
4833                 team->t.t_display_affinity != 1) {
4834               team->t.t_display_affinity = 1;
4835             }
4836 #endif
4837             KA_TRACE(100,
4838                      ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4839                       "partition = [%d,%d], spacing = %.4f\n",
4840                       __kmp_gtid_from_thread(team->t.t_threads[f]),
4841                       team->t.t_id, f, th->th.th_new_place,
4842                       th->th.th_first_place, th->th.th_last_place, spacing));
4843           }
4844         }
4845       }
4846       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4847     } else {
4848       int S, rem, gap, s_count;
4849       S = n_th / n_places;
4850       s_count = 0;
4851       rem = n_th - (S * n_places);
4852       gap = rem > 0 ? n_places / rem : n_places;
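      // More threads than places: use the same distribution scheme as
      // proc_bind_close above (S threads per place, rem places get one extra),
      // but each thread's partition collapses to its single assigned place.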
4853       int place = masters_place;
4854       int gap_ct = gap;
4855       thidx = n_th;
4856       if (update_master_only == 1)
4857         thidx = 1;
4858       for (f = 0; f < thidx; f++) {
4859         kmp_info_t *th = team->t.t_threads[f];
4860         KMP_DEBUG_ASSERT(th != NULL);
4861 
4862         th->th.th_first_place = place;
4863         th->th.th_last_place = place;
4864         th->th.th_new_place = place;
4865 #if OMP_50_ENABLED
4866         if (__kmp_display_affinity && place != th->th.th_current_place &&
4867             team->t.t_display_affinity != 1) {
4868           team->t.t_display_affinity = 1;
4869         }
4870 #endif
4871         s_count++;
4872 
4873         if ((s_count == S) && rem && (gap_ct == gap)) {
4874           // do nothing, add an extra thread to place on next iteration
4875         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4876           // we added an extra thread to this place; move on to next place
4877           if (place == last_place) {
4878             place = first_place;
4879           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4880             place = 0;
4881           } else {
4882             place++;
4883           }
4884           s_count = 0;
4885           gap_ct = 1;
4886           rem--;
4887         } else if (s_count == S) { // place is full; don't add extra thread
4888           if (place == last_place) {
4889             place = first_place;
4890           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4891             place = 0;
4892           } else {
4893             place++;
4894           }
4895           gap_ct++;
4896           s_count = 0;
4897         }
4898 
4899         KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4900                        "partition = [%d,%d]\n",
4901                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4902                        team->t.t_id, f, th->th.th_new_place,
4903                        th->th.th_first_place, th->th.th_last_place));
4904       }
4905       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4906     }
4907   } break;
4908 
4909   default:
4910     break;
4911   }
4912 
4913   KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4914 }
4915 
4916 #endif /* OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED */
4917 
4918 /* allocate a new team data structure to use.  take one off of the free pool if
4919    available */
4920 kmp_team_t *
4921 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4922 #if OMPT_SUPPORT
4923                     ompt_data_t ompt_parallel_data,
4924 #endif
4925 #if OMP_40_ENABLED
4926                     kmp_proc_bind_t new_proc_bind,
4927 #endif
4928                     kmp_internal_control_t *new_icvs,
4929                     int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4930   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4931   int f;
4932   kmp_team_t *team;
4933   int use_hot_team = !root->r.r_active;
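  // The root's hot team can only serve the outermost parallel region, i.e.
  // when the root is not yet active; nested regions get a freshly allocated
  // (or pooled) team unless nested hot teams are enabled below.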
4934   int level = 0;
4935 
4936   KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4937   KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4938   KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4939   KMP_MB();
4940 
4941 #if KMP_NESTED_HOT_TEAMS
4942   kmp_hot_team_ptr_t *hot_teams;
4943   if (master) {
4944     team = master->th.th_team;
4945     level = team->t.t_active_level;
4946     if (master->th.th_teams_microtask) { // in teams construct?
4947       if (master->th.th_teams_size.nteams > 1 &&
4948           ( // #teams > 1
4949               team->t.t_pkfn ==
4950                   (microtask_t)__kmp_teams_master || // inner fork of the teams
4951               master->th.th_teams_level <
4952                   team->t.t_level)) { // or nested parallel inside the teams
4953         ++level; // not increment if #teams==1, or for outer fork of the teams;
4954         // increment otherwise
4955       }
4956     }
4957     hot_teams = master->th.th_hot_teams;
4958     if (level < __kmp_hot_teams_max_level && hot_teams &&
4959         hot_teams[level]
4960             .hot_team) { // hot team has already been allocated for given level
4961       use_hot_team = 1;
4962     } else {
4963       use_hot_team = 0;
4964     }
4965   }
4966 #endif
4967   // Optimization to use a "hot" team
4968   if (use_hot_team && new_nproc > 1) {
4969     KMP_DEBUG_ASSERT(new_nproc == max_nproc);
4970 #if KMP_NESTED_HOT_TEAMS
4971     team = hot_teams[level].hot_team;
4972 #else
4973     team = root->r.r_hot_team;
4974 #endif
4975 #if KMP_DEBUG
4976     if (__kmp_tasking_mode != tskm_immediate_exec) {
4977       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
4978                     "task_team[1] = %p before reinit\n",
4979                     team->t.t_task_team[0], team->t.t_task_team[1]));
4980     }
4981 #endif
4982 
4983     // Has the number of threads changed?
4984     /* Let's assume the most common case is that the number of threads is
4985        unchanged, and put that case first. */
4986     if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
4987       KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
4988       // This case can mean that omp_set_num_threads() was called and the hot
4989       // team size was already reduced, so we check the special flag
4990       if (team->t.t_size_changed == -1) {
4991         team->t.t_size_changed = 1;
4992       } else {
4993         KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
4994       }
4995 
4996       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4997       kmp_r_sched_t new_sched = new_icvs->sched;
4998       // set master's schedule as new run-time schedule
4999       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5000 
5001       __kmp_reinitialize_team(team, new_icvs,
5002                               root->r.r_uber_thread->th.th_ident);
5003 
5004       KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5005                     team->t.t_threads[0], team));
5006       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5007 
5008 #if OMP_40_ENABLED
5009 #if KMP_AFFINITY_SUPPORTED
5010       if ((team->t.t_size_changed == 0) &&
5011           (team->t.t_proc_bind == new_proc_bind)) {
5012         if (new_proc_bind == proc_bind_spread) {
5013           __kmp_partition_places(
5014               team, 1); // add flag to update only master for spread
5015         }
5016         KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5017                        "proc_bind = %d, partition = [%d,%d]\n",
5018                        team->t.t_id, new_proc_bind, team->t.t_first_place,
5019                        team->t.t_last_place));
5020       } else {
5021         KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5022         __kmp_partition_places(team);
5023       }
5024 #else
5025       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5026 #endif /* KMP_AFFINITY_SUPPORTED */
5027 #endif /* OMP_40_ENABLED */
5028     } else if (team->t.t_nproc > new_nproc) {
5029       KA_TRACE(20,
5030                ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5031                 new_nproc));
5032 
5033       team->t.t_size_changed = 1;
5034 #if KMP_NESTED_HOT_TEAMS
5035       if (__kmp_hot_teams_mode == 0) {
5036         // AC: saved number of threads should correspond to team's value in this
5037         // mode; it can be bigger in mode 1, when the hot team has threads in reserve
5038         KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5039         hot_teams[level].hot_team_nth = new_nproc;
5040 #endif // KMP_NESTED_HOT_TEAMS
5041         /* release the extra threads we don't need any more */
5042         for (f = new_nproc; f < team->t.t_nproc; f++) {
5043           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5044           if (__kmp_tasking_mode != tskm_immediate_exec) {
5045             // When decreasing team size, threads no longer in the team should
5046             // unref task team.
5047             team->t.t_threads[f]->th.th_task_team = NULL;
5048           }
5049           __kmp_free_thread(team->t.t_threads[f]);
5050           team->t.t_threads[f] = NULL;
5051         }
5052 #if KMP_NESTED_HOT_TEAMS
5053       } // (__kmp_hot_teams_mode == 0)
5054       else {
5055         // When keeping extra threads in team, switch threads to wait on own
5056         // b_go flag
5057         for (f = new_nproc; f < team->t.t_nproc; ++f) {
5058           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5059           kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5060           for (int b = 0; b < bs_last_barrier; ++b) {
5061             if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5062               balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5063             }
5064             KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5065           }
5066         }
5067       }
5068 #endif // KMP_NESTED_HOT_TEAMS
5069       team->t.t_nproc = new_nproc;
5070       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5071       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5072       __kmp_reinitialize_team(team, new_icvs,
5073                               root->r.r_uber_thread->th.th_ident);
5074 
5075       /* update the remaining threads */
5076       for (f = 0; f < new_nproc; ++f) {
5077         team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5078       }
5079       // restore the current task state of the master thread: should be the
5080       // implicit task
5081       KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5082                     team->t.t_threads[0], team));
5083 
5084       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5085 
5086 #ifdef KMP_DEBUG
5087       for (f = 0; f < team->t.t_nproc; f++) {
5088         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5089                          team->t.t_threads[f]->th.th_team_nproc ==
5090                              team->t.t_nproc);
5091       }
5092 #endif
5093 
5094 #if OMP_40_ENABLED
5095       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5096 #if KMP_AFFINITY_SUPPORTED
5097       __kmp_partition_places(team);
5098 #endif
5099 #endif
5100     } else { // team->t.t_nproc < new_nproc
5101 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5102       kmp_affin_mask_t *old_mask;
5103       if (KMP_AFFINITY_CAPABLE()) {
5104         KMP_CPU_ALLOC(old_mask);
5105       }
5106 #endif
5107 
5108       KA_TRACE(20,
5109                ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5110                 new_nproc));
5111 
5112       team->t.t_size_changed = 1;
5113 
5114 #if KMP_NESTED_HOT_TEAMS
5115       int avail_threads = hot_teams[level].hot_team_nth;
5116       if (new_nproc < avail_threads)
5117         avail_threads = new_nproc;
5118       kmp_info_t **other_threads = team->t.t_threads;
5119       for (f = team->t.t_nproc; f < avail_threads; ++f) {
5120         // Adjust barrier data of reserved threads (if any) of the team
5121         // Other data will be set in __kmp_initialize_info() below.
5122         int b;
5123         kmp_balign_t *balign = other_threads[f]->th.th_bar;
5124         for (b = 0; b < bs_last_barrier; ++b) {
5125           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5126           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5127 #if USE_DEBUGGER
5128           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5129 #endif
5130         }
5131       }
5132       if (hot_teams[level].hot_team_nth >= new_nproc) {
5133         // we have all needed threads in reserve, no need to allocate any
5134         // this is only possible in mode 1; there cannot be reserved threads in mode 0
5135         KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5136         team->t.t_nproc = new_nproc; // just get reserved threads involved
5137       } else {
5138         // we may have some threads in reserve, but not enough
5139         team->t.t_nproc =
5140             hot_teams[level]
5141                 .hot_team_nth; // get reserved threads involved if any
5142         hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5143 #endif // KMP_NESTED_HOT_TEAMS
5144         if (team->t.t_max_nproc < new_nproc) {
5145           /* reallocate larger arrays */
5146           __kmp_reallocate_team_arrays(team, new_nproc);
5147           __kmp_reinitialize_team(team, new_icvs, NULL);
5148         }
5149 
5150 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5151         /* Temporarily set full mask for the master thread before creation of
5152            workers. The reason is that workers inherit the affinity from the
5153            master, so if a lot of workers are created quickly on a single core,
5154            they don't get a chance to set their own affinity for a long time. */
5155         __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5156 #endif
5157 
5158         /* allocate new threads for the hot team */
5159         for (f = team->t.t_nproc; f < new_nproc; f++) {
5160           kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5161           KMP_DEBUG_ASSERT(new_worker);
5162           team->t.t_threads[f] = new_worker;
5163 
5164           KA_TRACE(20,
5165                    ("__kmp_allocate_team: team %d init T#%d arrived: "
5166                     "join=%llu, plain=%llu\n",
5167                     team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5168                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5169                     team->t.t_bar[bs_plain_barrier].b_arrived));
5170 
5171           { // Initialize barrier data for new threads.
5172             int b;
5173             kmp_balign_t *balign = new_worker->th.th_bar;
5174             for (b = 0; b < bs_last_barrier; ++b) {
5175               balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5176               KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5177                                KMP_BARRIER_PARENT_FLAG);
5178 #if USE_DEBUGGER
5179               balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5180 #endif
5181             }
5182           }
5183         }
5184 
5185 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5186         if (KMP_AFFINITY_CAPABLE()) {
5187           /* Restore initial master thread's affinity mask */
5188           __kmp_set_system_affinity(old_mask, TRUE);
5189           KMP_CPU_FREE(old_mask);
5190         }
5191 #endif
5192 #if KMP_NESTED_HOT_TEAMS
5193       } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5194 #endif // KMP_NESTED_HOT_TEAMS
5195       /* make sure everyone is synchronized */
5196       int old_nproc = team->t.t_nproc; // save old value and use to update only
5197       // new threads below
5198       __kmp_initialize_team(team, new_nproc, new_icvs,
5199                             root->r.r_uber_thread->th.th_ident);
5200 
5201       /* reinitialize the threads */
5202       KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5203       for (f = 0; f < team->t.t_nproc; ++f)
5204         __kmp_initialize_info(team->t.t_threads[f], team, f,
5205                               __kmp_gtid_from_tid(f, team));
5206       if (level) { // set th_task_state for new threads in nested hot team
5207         // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5208         // only need to set the th_task_state for the new threads. th_task_state
5209         // for master thread will not be accurate until after this in
5210         // __kmp_fork_call(), so we look to the master's memo_stack to get the
5211         // correct value.
5212         for (f = old_nproc; f < team->t.t_nproc; ++f)
5213           team->t.t_threads[f]->th.th_task_state =
5214               team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5215       } else { // set th_task_state for new threads in non-nested hot team
5216         int old_state =
5217             team->t.t_threads[0]->th.th_task_state; // copy master's state
5218         for (f = old_nproc; f < team->t.t_nproc; ++f)
5219           team->t.t_threads[f]->th.th_task_state = old_state;
5220       }
5221 
5222 #ifdef KMP_DEBUG
5223       for (f = 0; f < team->t.t_nproc; ++f) {
5224         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5225                          team->t.t_threads[f]->th.th_team_nproc ==
5226                              team->t.t_nproc);
5227       }
5228 #endif
5229 
5230 #if OMP_40_ENABLED
5231       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5232 #if KMP_AFFINITY_SUPPORTED
5233       __kmp_partition_places(team);
5234 #endif
5235 #endif
5236     } // Check changes in number of threads
5237 
5238 #if OMP_40_ENABLED
5239     kmp_info_t *master = team->t.t_threads[0];
5240     if (master->th.th_teams_microtask) {
5241       for (f = 1; f < new_nproc; ++f) {
5242         // propagate teams construct specific info to workers
5243         kmp_info_t *thr = team->t.t_threads[f];
5244         thr->th.th_teams_microtask = master->th.th_teams_microtask;
5245         thr->th.th_teams_level = master->th.th_teams_level;
5246         thr->th.th_teams_size = master->th.th_teams_size;
5247       }
5248     }
5249 #endif /* OMP_40_ENABLED */
5250 #if KMP_NESTED_HOT_TEAMS
5251     if (level) {
5252       // Sync barrier state for nested hot teams, not needed for outermost hot
5253       // team.
5254       for (f = 1; f < new_nproc; ++f) {
5255         kmp_info_t *thr = team->t.t_threads[f];
5256         int b;
5257         kmp_balign_t *balign = thr->th.th_bar;
5258         for (b = 0; b < bs_last_barrier; ++b) {
5259           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5260           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5261 #if USE_DEBUGGER
5262           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5263 #endif
5264         }
5265       }
5266     }
5267 #endif // KMP_NESTED_HOT_TEAMS
5268 
5269     /* reallocate space for arguments if necessary */
5270     __kmp_alloc_argv_entries(argc, team, TRUE);
5271     KMP_CHECK_UPDATE(team->t.t_argc, argc);
5272     // The hot team re-uses the previous task team,
5273     // if untouched during the previous release->gather phase.
5274 
5275     KF_TRACE(10, (" hot_team = %p\n", team));
5276 
5277 #if KMP_DEBUG
5278     if (__kmp_tasking_mode != tskm_immediate_exec) {
5279       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5280                     "task_team[1] = %p after reinit\n",
5281                     team->t.t_task_team[0], team->t.t_task_team[1]));
5282     }
5283 #endif
5284 
5285 #if OMPT_SUPPORT
5286     __ompt_team_assign_id(team, ompt_parallel_data);
5287 #endif
5288 
5289     KMP_MB();
5290 
5291     return team;
5292   }
5293 
5294   /* next, let's try to take one from the team pool */
5295   KMP_MB();
5296   for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5297     /* TODO: consider resizing undersized teams instead of reaping them, now
5298        that we have a resizing mechanism */
5299     if (team->t.t_max_nproc >= max_nproc) {
5300       /* take this team from the team pool */
5301       __kmp_team_pool = team->t.t_next_pool;
5302 
5303       /* setup the team for fresh use */
5304       __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5305 
5306       KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5307                     "task_team[1] %p to NULL\n",
5308                     &team->t.t_task_team[0], &team->t.t_task_team[1]));
5309       team->t.t_task_team[0] = NULL;
5310       team->t.t_task_team[1] = NULL;
5311 
5312       /* reallocate space for arguments if necessary */
5313       __kmp_alloc_argv_entries(argc, team, TRUE);
5314       KMP_CHECK_UPDATE(team->t.t_argc, argc);
5315 
5316       KA_TRACE(
5317           20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5318                team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5319       { // Initialize barrier data.
5320         int b;
5321         for (b = 0; b < bs_last_barrier; ++b) {
5322           team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5323 #if USE_DEBUGGER
5324           team->t.t_bar[b].b_master_arrived = 0;
5325           team->t.t_bar[b].b_team_arrived = 0;
5326 #endif
5327         }
5328       }
5329 
5330 #if OMP_40_ENABLED
5331       team->t.t_proc_bind = new_proc_bind;
5332 #endif
5333 
5334       KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5335                     team->t.t_id));
5336 
5337 #if OMPT_SUPPORT
5338       __ompt_team_assign_id(team, ompt_parallel_data);
5339 #endif
5340 
5341       KMP_MB();
5342 
5343       return team;
5344     }
5345 
5346     /* reap team if it is too small, then loop back and check the next one */
5347     // not sure if this is wise, but it will be redone during the hot-teams
5348     // rewrite.
5349     /* TODO: Use technique to find the right size hot-team, don't reap them */
5350     team = __kmp_reap_team(team);
5351     __kmp_team_pool = team;
5352   }
5353 
5354   /* nothing available in the pool, no matter, make a new team! */
5355   KMP_MB();
5356   team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5357 
5358   /* and set it up */
5359   team->t.t_max_nproc = max_nproc;
5360   /* NOTE well, for some reason allocating one big buffer and dividing it up
5361      seems to really hurt performance a lot on the P4, so, let's not use this */
5362   __kmp_allocate_team_arrays(team, max_nproc);
5363 
5364   KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5365   __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5366 
5367   KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5368                 "%p to NULL\n",
5369                 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5370   team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5371   // memory, no need to duplicate
5372   team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5373   // memory, no need to duplicate
5374 
5375   if (__kmp_storage_map) {
5376     __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5377   }
5378 
5379   /* allocate space for arguments */
5380   __kmp_alloc_argv_entries(argc, team, FALSE);
5381   team->t.t_argc = argc;
5382 
5383   KA_TRACE(20,
5384            ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5385             team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5386   { // Initialize barrier data.
5387     int b;
5388     for (b = 0; b < bs_last_barrier; ++b) {
5389       team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5390 #if USE_DEBUGGER
5391       team->t.t_bar[b].b_master_arrived = 0;
5392       team->t.t_bar[b].b_team_arrived = 0;
5393 #endif
5394     }
5395   }
5396 
5397 #if OMP_40_ENABLED
5398   team->t.t_proc_bind = new_proc_bind;
5399 #endif
5400 
5401 #if OMPT_SUPPORT
5402   __ompt_team_assign_id(team, ompt_parallel_data);
5403   team->t.ompt_serialized_team_info = NULL;
5404 #endif
5405 
5406   KMP_MB();
5407 
5408   KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5409                 team->t.t_id));
5410 
5411   return team;
5412 }
5413 
5414 /* TODO implement hot-teams at all levels */
5415 /* TODO implement lazy thread release on demand (disband request) */
5416 
5417 /* free the team.  return it to the team pool.  release all the threads
5418  * associated with it */
5419 void __kmp_free_team(kmp_root_t *root,
5420                      kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5421   int f;
5422   KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5423                 team->t.t_id));
5424 
5425   /* verify state */
5426   KMP_DEBUG_ASSERT(root);
5427   KMP_DEBUG_ASSERT(team);
5428   KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5429   KMP_DEBUG_ASSERT(team->t.t_threads);
5430 
5431   int use_hot_team = team == root->r.r_hot_team;
5432 #if KMP_NESTED_HOT_TEAMS
5433   int level;
5434   kmp_hot_team_ptr_t *hot_teams;
5435   if (master) {
5436     level = team->t.t_active_level - 1;
5437     if (master->th.th_teams_microtask) { // in teams construct?
5438       if (master->th.th_teams_size.nteams > 1) {
5439         ++level; // level was not increased in teams construct for
5440         // team_of_masters
5441       }
5442       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5443           master->th.th_teams_level == team->t.t_level) {
5444         ++level; // level was not increased in teams construct for
5445         // team_of_workers before the parallel
5446       } // team->t.t_level will be increased inside parallel
5447     }
5448     hot_teams = master->th.th_hot_teams;
5449     if (level < __kmp_hot_teams_max_level) {
5450       KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5451       use_hot_team = 1;
5452     }
5453   }
5454 #endif // KMP_NESTED_HOT_TEAMS
5455 
5456   /* team is done working */
5457   TCW_SYNC_PTR(team->t.t_pkfn,
5458                NULL); // Important for Debugging Support Library.
5459 #if KMP_OS_WINDOWS
5460   team->t.t_copyin_counter = 0; // init counter for possible reuse
5461 #endif
5462   // Do not reset pointer to parent team to NULL for hot teams.
5463 
5464   /* if we are non-hot team, release our threads */
5465   if (!use_hot_team) {
5466     if (__kmp_tasking_mode != tskm_immediate_exec) {
5467       // Wait for threads to reach reapable state
5468       for (f = 1; f < team->t.t_nproc; ++f) {
5469         KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5470         kmp_info_t *th = team->t.t_threads[f];
5471         volatile kmp_uint32 *state = &th->th.th_reap_state;
5472         while (*state != KMP_SAFE_TO_REAP) {
5473 #if KMP_OS_WINDOWS
5474           // On Windows a thread can be killed at any time, check this
5475           DWORD ecode;
5476           if (!__kmp_is_thread_alive(th, &ecode)) {
5477             *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5478             break;
5479           }
5480 #endif
5481           // first check if thread is sleeping
5482           kmp_flag_64 fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5483           if (fl.is_sleeping())
5484             fl.resume(__kmp_gtid_from_thread(th));
5485           KMP_CPU_PAUSE();
5486         }
5487       }
5488 
5489       // Delete task teams
5490       int tt_idx;
5491       for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5492         kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5493         if (task_team != NULL) {
5494           for (f = 0; f < team->t.t_nproc;
5495                ++f) { // Have all threads unref task teams
5496             team->t.t_threads[f]->th.th_task_team = NULL;
5497           }
5498           KA_TRACE(
5499               20,
5500               ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5501                __kmp_get_gtid(), task_team, team->t.t_id));
5502 #if KMP_NESTED_HOT_TEAMS
5503           __kmp_free_task_team(master, task_team);
5504 #endif
5505           team->t.t_task_team[tt_idx] = NULL;
5506         }
5507       }
5508     }
5509 
5510     // Reset pointer to parent team only for non-hot teams.
5511     team->t.t_parent = NULL;
5512     team->t.t_level = 0;
5513     team->t.t_active_level = 0;
5514 
5515     /* free the worker threads */
5516     for (f = 1; f < team->t.t_nproc; ++f) {
5517       KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5518       __kmp_free_thread(team->t.t_threads[f]);
5519       team->t.t_threads[f] = NULL;
5520     }
5521 
5522     /* put the team back in the team pool */
5523     /* TODO limit size of team pool, call reap_team if pool too large */
5524     team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5525     __kmp_team_pool = (volatile kmp_team_t *)team;
5526   }
5527 
5528   KMP_MB();
5529 }
5530 
5531 /* reap the team.  destroy it, reclaim all its resources and free its memory */
5532 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5533   kmp_team_t *next_pool = team->t.t_next_pool;
5534 
5535   KMP_DEBUG_ASSERT(team);
5536   KMP_DEBUG_ASSERT(team->t.t_dispatch);
5537   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5538   KMP_DEBUG_ASSERT(team->t.t_threads);
5539   KMP_DEBUG_ASSERT(team->t.t_argv);
5540 
5541   /* TODO clean the threads that are a part of this? */
5542 
5543   /* free stuff */
5544   __kmp_free_team_arrays(team);
5545   if (team->t.t_argv != &team->t.t_inline_argv[0])
5546     __kmp_free((void *)team->t.t_argv);
5547   __kmp_free(team);
5548 
5549   KMP_MB();
5550   return next_pool;
5551 }
5552 
5553 // Free the thread.  Don't reap it, just place it on the pool of available
5554 // threads.
5555 //
5556 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5557 // binding for the affinity mechanism to be useful.
5558 //
5559 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5560 // However, we want to avoid a potential performance problem by always
5561 // scanning through the list to find the correct point at which to insert
5562 // the thread (potential N**2 behavior).  To do this we keep track of the
5563 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5564 // With single-level parallelism, threads will always be added to the tail
5565 // of the list, kept track of by __kmp_thread_pool_insert_pt.  With nested
5566 // parallelism, all bets are off and we may need to scan through the entire
5567 // free list.
5568 //
5569 // This change also has a potentially large performance benefit, for some
5570 // applications.  Previously, as threads were freed from the hot team, they
5571 // would be placed back on the free list in inverse order.  If the hot team
5572 // grew back to its original size, then the freed threads would be placed
5573 // back on the hot team in reverse order.  This could cause bad cache
5574 // locality problems on programs where the size of the hot team regularly
5575 // grew and shrank.
5576 //
5577 // Now, for single-level parallelism, the OMP tid is always == gtid.
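//
// Illustrative example (values are made up): if the pool holds threads with
// gtids {2, 5, 9} and the thread with gtid 7 is freed, the scan below starts
// just after the remembered insert point (provided its gtid is <= 7;
// otherwise it restarts from the head), and the pool becomes {2, 5, 7, 9}
// with __kmp_thread_pool_insert_pt left pointing at the newly inserted thread.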
5578 void __kmp_free_thread(kmp_info_t *this_th) {
5579   int gtid;
5580   kmp_info_t **scan;
5581   kmp_root_t *root = this_th->th.th_root;
5582 
5583   KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5584                 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5585 
5586   KMP_DEBUG_ASSERT(this_th);
5587 
5588   // When moving the thread to the pool, switch it to wait on its own b_go
5589   // flag, and reset its team pointer to uninitialized (NULL team).
5590   int b;
5591   kmp_balign_t *balign = this_th->th.th_bar;
5592   for (b = 0; b < bs_last_barrier; ++b) {
5593     if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5594       balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5595     balign[b].bb.team = NULL;
5596     balign[b].bb.leaf_kids = 0;
5597   }
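  // Reset the task state and mark the thread as safe to reap;
  // __kmp_free_team spins on th_reap_state before it deletes a team's task
  // teams (see the wait loop in __kmp_free_team above).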
5598   this_th->th.th_task_state = 0;
5599   this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5600 
5601   /* put thread back on the free pool */
5602   TCW_PTR(this_th->th.th_team, NULL);
5603   TCW_PTR(this_th->th.th_root, NULL);
5604   TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5605 
5606   /* If the implicit task assigned to this thread can be used by other
5607    * threads, multiple threads may share the task data and try to free the
5608    * task in __kmp_reap_thread at exit. This duplicate use of the task data
5609    * is more likely when the hot team is disabled, but it can occur even
5610    * when the hot team is enabled. */
5611   __kmp_free_implicit_task(this_th);
5612   this_th->th.th_current_task = NULL;
5613 
5614   // If the __kmp_thread_pool_insert_pt is already past the new insert
5615   // point, then we need to re-scan the entire list.
5616   gtid = this_th->th.th_info.ds.ds_gtid;
5617   if (__kmp_thread_pool_insert_pt != NULL) {
5618     KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5619     if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5620       __kmp_thread_pool_insert_pt = NULL;
5621     }
5622   }
5623 
5624   // Scan down the list to find the place to insert the thread.
5625   // scan is the address of a link in the list, possibly the address of
5626   // __kmp_thread_pool itself.
5627   //
5628   // In the absence of nested parallelism, the for loop will have 0 iterations.
5629   if (__kmp_thread_pool_insert_pt != NULL) {
5630     scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5631   } else {
5632     scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5633   }
5634   for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5635        scan = &((*scan)->th.th_next_pool))
5636     ;
5637 
5638   // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5639   // to its address.
5640   TCW_PTR(this_th->th.th_next_pool, *scan);
5641   __kmp_thread_pool_insert_pt = *scan = this_th;
5642   KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5643                    (this_th->th.th_info.ds.ds_gtid <
5644                     this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5645   TCW_4(this_th->th.th_in_pool, TRUE);
5646   __kmp_thread_pool_nth++;
5647 
5648   TCW_4(__kmp_nth, __kmp_nth - 1);
5649   root->r.r_cg_nthreads--;
5650 
5651 #ifdef KMP_ADJUST_BLOCKTIME
5652   /* Adjust blocktime back to user setting or default if necessary */
5653   /* Middle initialization might never have occurred                */
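  /* In other words (as this logic reads): __kmp_zero_bt forces a zero
     blocktime while the process is oversubscribed (__kmp_nth greater than
     __kmp_avail_proc); once the thread count drops back to the number of
     available procs or below, the forced setting is lifted and the user or
     default blocktime applies again. */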
5654   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5655     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5656     if (__kmp_nth <= __kmp_avail_proc) {
5657       __kmp_zero_bt = FALSE;
5658     }
5659   }
5660 #endif /* KMP_ADJUST_BLOCKTIME */
5661 
5662   KMP_MB();
5663 }
5664 
5665 /* ------------------------------------------------------------------------ */
5666 
5667 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5668   int gtid = this_thr->th.th_info.ds.ds_gtid;
5669   /*    void                 *stack_data;*/
5670   kmp_team_t *(*volatile pteam);
5671 
5672   KMP_MB();
5673   KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5674 
5675   if (__kmp_env_consistency_check) {
5676     this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5677   }
5678 
5679 #if OMPT_SUPPORT
5680   ompt_data_t *thread_data;
5681   if (ompt_enabled.enabled) {
5682     thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5683     *thread_data = ompt_data_none;
5684 
5685     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5686     this_thr->th.ompt_thread_info.wait_id = 0;
5687     this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5688     if (ompt_enabled.ompt_callback_thread_begin) {
5689       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5690           ompt_thread_worker, thread_data);
5691     }
5692   }
5693 #endif
5694 
5695 #if OMPT_SUPPORT
5696   if (ompt_enabled.enabled) {
5697     this_thr->th.ompt_thread_info.state = ompt_state_idle;
5698   }
5699 #endif
5700   /* This is the place where threads wait for work */
5701   while (!TCR_4(__kmp_global.g.g_done)) {
5702     KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5703     KMP_MB();
5704 
5705     /* wait for work to do */
5706     KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5707 
5708     /* No tid yet since not part of a team */
5709     __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5710 
5711 #if OMPT_SUPPORT
5712     if (ompt_enabled.enabled) {
5713       this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5714     }
5715 #endif
5716 
5717     pteam = (kmp_team_t * (*))(&this_thr->th.th_team);
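    // pteam points at this thread's th_team field; the master stores the new
    // team there before releasing workers from the fork barrier, so the
    // dereference below picks up the current team after each wakeup.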
5718 
5719     /* have we been allocated? */
5720     if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5721       /* we were just woken up, so run our new task */
5722       if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5723         int rc;
5724         KA_TRACE(20,
5725                  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5726                   gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5727                   (*pteam)->t.t_pkfn));
5728 
5729         updateHWFPControl(*pteam);
5730 
5731 #if OMPT_SUPPORT
5732         if (ompt_enabled.enabled) {
5733           this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5734         }
5735 #endif
5736 
5737         rc = (*pteam)->t.t_invoke(gtid);
5738         KMP_ASSERT(rc);
5739 
5740         KMP_MB();
5741         KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5742                       gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5743                       (*pteam)->t.t_pkfn));
5744       }
5745 #if OMPT_SUPPORT
5746       if (ompt_enabled.enabled) {
5747         /* no frame set while outside task */
5748         __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
5749 
5750         this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5751       }
5752 #endif
5753       /* join barrier after parallel region */
5754       __kmp_join_barrier(gtid);
5755     }
5756   }
5757   TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5758 
5759 #if OMPT_SUPPORT
5760   if (ompt_enabled.ompt_callback_thread_end) {
5761     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5762   }
5763 #endif
5764 
5765   this_thr->th.th_task_team = NULL;
5766   /* run the destructors for the threadprivate data for this thread */
5767   __kmp_common_destroy_gtid(gtid);
5768 
5769   KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5770   KMP_MB();
5771   return this_thr;
5772 }
5773 
5774 /* ------------------------------------------------------------------------ */
5775 
5776 void __kmp_internal_end_dest(void *specific_gtid) {
5777 #if KMP_COMPILER_ICC
5778 #pragma warning(push)
5779 #pragma warning(disable : 810) // conversion from "void *" to "int" may lose
5780 // significant bits
5781 #endif
5782   // Make sure no significant bits are lost
5783   int gtid = (kmp_intptr_t)specific_gtid - 1;
5784 #if KMP_COMPILER_ICC
5785 #pragma warning(pop)
5786 #endif
5787 
5788   KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5789   /* NOTE: the gtid is stored as gtid+1 in the thread-local storage;
5790    * this is because 0 is reserved for the nothing-stored case */
5791 
5792   /* josh: One reason for setting the gtid specific data even when it is being
5793      destroyed by pthread is to allow gtid lookup through thread specific data
5794      (__kmp_gtid_get_specific).  Some of the code, especially stat code,
5795      that gets executed in the call to __kmp_internal_end_thread, actually
5796      gets the gtid through the thread specific data.  Setting it here seems
5797      rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread
5798      to run smoothly.
5799      todo: get rid of this after we remove the dependence on
5800      __kmp_gtid_get_specific  */
5801   if (gtid >= 0 && KMP_UBER_GTID(gtid))
5802     __kmp_gtid_set_specific(gtid);
5803 #ifdef KMP_TDATA_GTID
5804   __kmp_gtid = gtid;
5805 #endif
5806   __kmp_internal_end_thread(gtid);
5807 }
5808 
5809 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5810 
5811 // 2009-09-08 (lev): It looks like the destructor does not work. In simple test
5812 // cases destructors work perfectly, but in real libomp.so I have no evidence
5813 // it is ever called. However, -fini linker option in makefile.mk works fine.
5814 
5815 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5816   __kmp_internal_end_atexit();
5817 }
5818 
5819 void __kmp_internal_end_fini(void) { __kmp_internal_end_atexit(); }
5820 
5821 #endif
5822 
5823 /* [Windows] josh: when the atexit handler is called, there may still be more
5824    than one thread alive */
5825 void __kmp_internal_end_atexit(void) {
5826   KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5827   /* [Windows]
5828      josh: ideally, we want to completely shutdown the library in this atexit
5829      handler, but stat code that depends on thread specific data for gtid fails
5830      because that data becomes unavailable at some point during the shutdown, so
5831      we call __kmp_internal_end_thread instead. We should eventually remove the
5832      dependency on __kmp_get_specific_gtid in the stat code and use
5833      __kmp_internal_end_library to cleanly shutdown the library.
5834 
5835      // TODO: Can some of this comment about GVS be removed?
5836      I suspect that the offending stat code is executed when the calling thread
5837      tries to clean up a dead root thread's data structures, resulting in GVS
5838      code trying to close the GVS structures for that thread, but since the stat
5839      code uses __kmp_get_specific_gtid to get the gtid with the assumption that
5840      the calling thread is cleaning up itself instead of another thread, it
5841      gets confused. This happens because allowing a thread to unregister and
5842      clean up another thread is a recent modification for addressing an issue.
5843      Based on the current design (20050722), a thread may end up
5844      trying to unregister another thread only if thread death does not trigger
5845      the calling of __kmp_internal_end_thread.  For Linux* OS, there is the
5846      thread specific data destructor function to detect thread death. For
5847      Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
5848      is nothing.  Thus, the workaround is applicable only for Windows static
5849      stat library. */
5850   __kmp_internal_end_library(-1);
5851 #if KMP_OS_WINDOWS
5852   __kmp_close_console();
5853 #endif
5854 }
5855 
5856 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5857   // It is assumed __kmp_forkjoin_lock is acquired.
5858 
5859   int gtid;
5860 
5861   KMP_DEBUG_ASSERT(thread != NULL);
5862 
5863   gtid = thread->th.th_info.ds.ds_gtid;
5864 
5865   if (!is_root) {
5866     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5867       /* Assume the threads are at the fork barrier here */
5868       KA_TRACE(
5869           20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5870                gtid));
5871       /* Need release fence here to prevent seg faults for tree forkjoin barrier
5872        * (GEH) */
5873       ANNOTATE_HAPPENS_BEFORE(thread);
5874       kmp_flag_64 flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
5875       __kmp_release_64(&flag);
5876     }
5877 
5878     // Terminate OS thread.
5879     __kmp_reap_worker(thread);
5880 
5881     // The thread was killed asynchronously.  If it was actively
5882     // spinning in the thread pool, decrement the global count.
5883     //
5884     // There is a small timing hole here - if the worker thread was just waking
5885     // up after sleeping in the pool, had reset its th_active_in_pool flag but
5886     // not decremented the global counter __kmp_thread_pool_active_nth yet, then
5887     // the global counter might not get updated.
5888     //
5889     // Currently, this can only happen as the library is unloaded,
5890     // so there are no harmful side effects.
5891     if (thread->th.th_active_in_pool) {
5892       thread->th.th_active_in_pool = FALSE;
5893       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
5894       KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
5895     }
5896 
5897     // Decrement # of [worker] threads in the pool.
5898     KMP_DEBUG_ASSERT(__kmp_thread_pool_nth > 0);
5899     --__kmp_thread_pool_nth;
5900   }
5901 
5902   __kmp_free_implicit_task(thread);
5903 
5904 // Free the fast memory for tasking
5905 #if USE_FAST_MEMORY
5906   __kmp_free_fast_memory(thread);
5907 #endif /* USE_FAST_MEMORY */
5908 
5909   __kmp_suspend_uninitialize_thread(thread);
5910 
5911   KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
5912   TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5913 
5914   --__kmp_all_nth;
5915 // __kmp_nth was decremented when the thread was added to the pool.
5916 
5917 #ifdef KMP_ADJUST_BLOCKTIME
5918   /* Adjust blocktime back to user setting or default if necessary */
5919   /* Middle initialization might never have occurred                */
5920   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5921     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5922     if (__kmp_nth <= __kmp_avail_proc) {
5923       __kmp_zero_bt = FALSE;
5924     }
5925   }
5926 #endif /* KMP_ADJUST_BLOCKTIME */
5927 
5928   /* free the memory being used */
5929   if (__kmp_env_consistency_check) {
5930     if (thread->th.th_cons) {
5931       __kmp_free_cons_stack(thread->th.th_cons);
5932       thread->th.th_cons = NULL;
5933     }
5934   }
5935 
5936   if (thread->th.th_pri_common != NULL) {
5937     __kmp_free(thread->th.th_pri_common);
5938     thread->th.th_pri_common = NULL;
5939   }
5940 
5941   if (thread->th.th_task_state_memo_stack != NULL) {
5942     __kmp_free(thread->th.th_task_state_memo_stack);
5943     thread->th.th_task_state_memo_stack = NULL;
5944   }
5945 
5946 #if KMP_USE_BGET
5947   if (thread->th.th_local.bget_data != NULL) {
5948     __kmp_finalize_bget(thread);
5949   }
5950 #endif
5951 
5952 #if KMP_AFFINITY_SUPPORTED
5953   if (thread->th.th_affin_mask != NULL) {
5954     KMP_CPU_FREE(thread->th.th_affin_mask);
5955     thread->th.th_affin_mask = NULL;
5956   }
5957 #endif /* KMP_AFFINITY_SUPPORTED */
5958 
5959 #if KMP_USE_HIER_SCHED
5960   if (thread->th.th_hier_bar_data != NULL) {
5961     __kmp_free(thread->th.th_hier_bar_data);
5962     thread->th.th_hier_bar_data = NULL;
5963   }
5964 #endif
5965 
5966   __kmp_reap_team(thread->th.th_serial_team);
5967   thread->th.th_serial_team = NULL;
5968   __kmp_free(thread);
5969 
5970   KMP_MB();
5971 
5972 } // __kmp_reap_thread
5973 
5974 static void __kmp_internal_end(void) {
5975   int i;
5976 
5977   /* First, unregister the library */
5978   __kmp_unregister_library();
5979 
5980 #if KMP_OS_WINDOWS
5981   /* In Win static library, we can't tell when a root actually dies, so we
5982      reclaim the data structures for any root threads that have died but not
5983      unregistered themselves, in order to shut down cleanly.
5984      In Win dynamic library we also can't tell when a thread dies.  */
5985   __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
5986 // dead roots
5987 #endif
5988 
5989   for (i = 0; i < __kmp_threads_capacity; i++)
5990     if (__kmp_root[i])
5991       if (__kmp_root[i]->r.r_active)
5992         break;
5993   KMP_MB(); /* Flush all pending memory write invalidates.  */
5994   TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
5995 
5996   if (i < __kmp_threads_capacity) {
5997 #if KMP_USE_MONITOR
5998     // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
5999     KMP_MB(); /* Flush all pending memory write invalidates.  */
6000 
6001     // Need to check that monitor was initialized before reaping it. If we are
6002     // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6003     // __kmp_monitor will appear to contain valid data, but it is only valid in
6004     // the parent process, not the child.
6005     // New behavior (201008): instead of keying off of the flag
6006     // __kmp_init_parallel, the monitor thread creation is keyed off
6007     // of the new flag __kmp_init_monitor.
6008     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6009     if (TCR_4(__kmp_init_monitor)) {
6010       __kmp_reap_monitor(&__kmp_monitor);
6011       TCW_4(__kmp_init_monitor, 0);
6012     }
6013     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6014     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6015 #endif // KMP_USE_MONITOR
6016   } else {
6017 /* TODO move this to cleanup code */
6018 #ifdef KMP_DEBUG
6019     /* make sure that everything has properly ended */
6020     for (i = 0; i < __kmp_threads_capacity; i++) {
6021       if (__kmp_root[i]) {
6022         //                    KMP_ASSERT( ! KMP_UBER_GTID( i ) );         // AC:
6023         //                    there can be uber threads alive here
6024         KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6025       }
6026     }
6027 #endif
6028 
6029     KMP_MB();
6030 
6031     // Reap the worker threads.
6032     // This is valid for now, but be careful if threads are reaped sooner.
6033     while (__kmp_thread_pool != NULL) { // Loop thru all threads in the pool.
6034       // Get the next thread from the pool.
6035       kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6036       __kmp_thread_pool = thread->th.th_next_pool;
6037       // Reap it.
6038       KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6039       thread->th.th_next_pool = NULL;
6040       thread->th.th_in_pool = FALSE;
6041       __kmp_reap_thread(thread, 0);
6042     }
6043     __kmp_thread_pool_insert_pt = NULL;
6044 
6045     // Reap teams.
6046     while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
6047       // Get the next team from the pool.
6048       kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6049       __kmp_team_pool = team->t.t_next_pool;
6050       // Reap it.
6051       team->t.t_next_pool = NULL;
6052       __kmp_reap_team(team);
6053     }
6054 
6055     __kmp_reap_task_teams();
6056 
6057 #if KMP_OS_UNIX
6058     // Threads that are not reaped should not access any resources since they
6059     // are going to be deallocated soon, so the shutdown sequence should wait
6060     // until all threads either exit the final spin-waiting loop or begin
6061     // sleeping after the given blocktime.
6062     for (i = 0; i < __kmp_threads_capacity; i++) {
6063       kmp_info_t *thr = __kmp_threads[i];
6064       while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6065         KMP_CPU_PAUSE();
6066     }
6067 #endif
6068 
6069     for (i = 0; i < __kmp_threads_capacity; ++i) {
6070       // TBD: Add some checking...
6071       // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6072     }
6073 
6074     /* Make sure all threadprivate destructors get run by joining with all
6075        worker threads before resetting this flag */
6076     TCW_SYNC_4(__kmp_init_common, FALSE);
6077 
6078     KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6079     KMP_MB();
6080 
6081 #if KMP_USE_MONITOR
6082     // See note above: One of the possible fixes for CQ138434 / CQ140126
6083     //
6084     // FIXME: push both code fragments down and CSE them?
6085     // push them into __kmp_cleanup() ?
6086     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6087     if (TCR_4(__kmp_init_monitor)) {
6088       __kmp_reap_monitor(&__kmp_monitor);
6089       TCW_4(__kmp_init_monitor, 0);
6090     }
6091     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6092     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6093 #endif
6094   } /* else !__kmp_global.t_active */
6095   TCW_4(__kmp_init_gtid, FALSE);
6096   KMP_MB(); /* Flush all pending memory write invalidates.  */
6097 
6098   __kmp_cleanup();
6099 #if OMPT_SUPPORT
6100   ompt_fini();
6101 #endif
6102 }
6103 
6104 void __kmp_internal_end_library(int gtid_req) {
6105   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6106   /* this shouldn't be a race condition because __kmp_internal_end() is the
6107      only place to clear __kmp_serial_init */
6108   /* we'll check this later too, after we get the lock */
6109   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6110   // redundant, because the next check will work in any case.
6111   if (__kmp_global.g.g_abort) {
6112     KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6113     /* TODO abort? */
6114     return;
6115   }
6116   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6117     KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6118     return;
6119   }
6120 
6121   KMP_MB(); /* Flush all pending memory write invalidates.  */
6122 
6123   /* find out who we are and what we should do */
6124   {
6125     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6126     KA_TRACE(
6127         10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req));
6128     if (gtid == KMP_GTID_SHUTDOWN) {
6129       KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6130                     "already shutdown\n"));
6131       return;
6132     } else if (gtid == KMP_GTID_MONITOR) {
6133       KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6134                     "registered, or system shutdown\n"));
6135       return;
6136     } else if (gtid == KMP_GTID_DNE) {
6137       KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6138                     "shutdown\n"));
6139       /* we don't know who we are, but we may still shutdown the library */
6140     } else if (KMP_UBER_GTID(gtid)) {
6141       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6142       if (__kmp_root[gtid]->r.r_active) {
6143         __kmp_global.g.g_abort = -1;
6144         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6145         KA_TRACE(10,
6146                  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6147                   gtid));
6148         return;
6149       } else {
6150         KA_TRACE(
6151             10,
6152             ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6153         __kmp_unregister_root_current_thread(gtid);
6154       }
6155     } else {
6156 /* worker threads may call this function through the atexit handler, if they
6157  * call exit() */
6158 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6159    TODO: do a thorough shutdown instead */
6160 #ifdef DUMP_DEBUG_ON_EXIT
6161       if (__kmp_debug_buf)
6162         __kmp_dump_debug_buffer();
6163 #endif
6164       return;
6165     }
6166   }
6167   /* synchronize the termination process */
6168   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6169 
6170   /* have we already finished */
6171   if (__kmp_global.g.g_abort) {
6172     KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6173     /* TODO abort? */
6174     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6175     return;
6176   }
6177   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6178     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6179     return;
6180   }
6181 
6182   /* We need this lock to enforce mutex between this reading of
6183      __kmp_threads_capacity and the writing by __kmp_register_root.
6184      Alternatively, we can use a counter of roots that is atomically updated by
6185      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6186      __kmp_internal_end_*.  */
6187   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6188 
6189   /* now we can safely conduct the actual termination */
6190   __kmp_internal_end();
6191 
6192   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6193   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6194 
6195   KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6196 
6197 #ifdef DUMP_DEBUG_ON_EXIT
6198   if (__kmp_debug_buf)
6199     __kmp_dump_debug_buffer();
6200 #endif
6201 
6202 #if KMP_OS_WINDOWS
6203   __kmp_close_console();
6204 #endif
6205 
6206   __kmp_fini_allocator();
6207 
6208 } // __kmp_internal_end_library
6209 
6210 void __kmp_internal_end_thread(int gtid_req) {
6211   int i;
6212 
6213   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6214   /* this shouldn't be a race condition because __kmp_internal_end() is the
6215    * only place to clear __kmp_serial_init */
6216   /* we'll check this later too, after we get the lock */
6217   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6218   // redundant, because the next check will work in any case.
6219   if (__kmp_global.g.g_abort) {
6220     KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6221     /* TODO abort? */
6222     return;
6223   }
6224   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6225     KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6226     return;
6227   }
6228 
6229   KMP_MB(); /* Flush all pending memory write invalidates.  */
6230 
6231   /* find out who we are and what we should do */
6232   {
6233     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6234     KA_TRACE(10,
6235              ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req));
6236     if (gtid == KMP_GTID_SHUTDOWN) {
6237       KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6238                     "already shutdown\n"));
6239       return;
6240     } else if (gtid == KMP_GTID_MONITOR) {
6241       KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6242                     "registered, or system shutdown\n"));
6243       return;
6244     } else if (gtid == KMP_GTID_DNE) {
6245       KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6246                     "shutdown\n"));
6247       return;
6248       /* we don't know who we are */
6249     } else if (KMP_UBER_GTID(gtid)) {
6250       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6251       if (__kmp_root[gtid]->r.r_active) {
6252         __kmp_global.g.g_abort = -1;
6253         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6254         KA_TRACE(10,
6255                  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6256                   gtid));
6257         return;
6258       } else {
6259         KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6260                       gtid));
6261         __kmp_unregister_root_current_thread(gtid);
6262       }
6263     } else {
6264       /* just a worker thread, let's leave */
6265       KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6266 
6267       if (gtid >= 0) {
6268         __kmp_threads[gtid]->th.th_task_team = NULL;
6269       }
6270 
6271       KA_TRACE(10,
6272                ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6273                 gtid));
6274       return;
6275     }
6276   }
6277 #if KMP_DYNAMIC_LIB
6278   // AC: let's not shut down the Linux* OS dynamic library at the exit of an
6279   // uber thread, because it is better to shut down later, in the library
6280   // destructor. The reason for this change is a performance problem seen when
6281   // a non-OpenMP thread forks and joins many OpenMP threads in a loop. We can
6282   // save a lot of time by keeping worker threads alive until program shutdown.
6283   // OM: Removed Linux* OS restriction to fix the crash on OS X* (DPD200239966)
6284   // and Windows(DPD200287443) that occurs when using critical sections from
6285   // foreign threads.
6286   if (__kmp_pause_status != kmp_hard_paused) {
6287     KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6288     return;
6289   }
6290 #endif
6291   /* synchronize the termination process */
6292   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6293 
6294   /* have we already finished */
6295   if (__kmp_global.g.g_abort) {
6296     KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6297     /* TODO abort? */
6298     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6299     return;
6300   }
6301   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6302     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6303     return;
6304   }
6305 
6306   /* We need this lock to enforce mutex between this reading of
6307      __kmp_threads_capacity and the writing by __kmp_register_root.
6308      Alternatively, we can use a counter of roots that is atomically updated by
6309      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6310      __kmp_internal_end_*.  */
6311 
6312   /* should we finish the run-time?  are all siblings done? */
6313   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6314 
6315   for (i = 0; i < __kmp_threads_capacity; ++i) {
6316     if (KMP_UBER_GTID(i)) {
6317       KA_TRACE(
6318           10,
6319           ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6320       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6321       __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6322       return;
6323     }
6324   }
6325 
6326   /* now we can safely conduct the actual termination */
6327 
6328   __kmp_internal_end();
6329 
6330   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6331   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6332 
6333   KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6334 
6335 #ifdef DUMP_DEBUG_ON_EXIT
6336   if (__kmp_debug_buf)
6337     __kmp_dump_debug_buffer();
6338 #endif
6339 } // __kmp_internal_end_thread
6340 
6341 // -----------------------------------------------------------------------------
6342 // Library registration stuff.
6343 
6344 static long __kmp_registration_flag = 0;
6345 // Random value used to indicate library initialization.
6346 static char *__kmp_registration_str = NULL;
6347 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6348 
6349 static inline char *__kmp_reg_status_name() {
6350   /* On RHEL 3u5 if linked statically, getpid() returns different values in
6351      each thread. If registration and unregistration go in different threads
6352      (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env
6353      env var cannot be found, because the name will contain a different pid.
6354   return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6355 } // __kmp_reg_status_name
6356 
6357 void __kmp_register_library_startup(void) {
6358 
6359   char *name = __kmp_reg_status_name(); // Name of the environment variable.
6360   int done = 0;
6361   union {
6362     double dtime;
6363     long ltime;
6364   } time;
6365 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6366   __kmp_initialize_system_tick();
6367 #endif
6368   __kmp_read_system_time(&time.dtime);
6369   __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6370   __kmp_registration_str =
6371       __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6372                        __kmp_registration_flag, KMP_LIBRARY_FILE);
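  // The registered value has the form "<flag address>-<flag value in hex>-
  // <library file>", e.g. "0x7f12e4a0-cafe1234-libomp.so" (the address and
  // file name here are only illustrative). The parsing code below splits the
  // value on '-' in the same order to decide whether a previously registered
  // copy of the runtime is still alive.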
6373 
6374   KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6375                 __kmp_registration_str));
6376 
6377   while (!done) {
6378 
6379     char *value = NULL; // Actual value of the environment variable.
6380 
6381     // Set the environment variable, but do not overwrite an existing value.
6382     __kmp_env_set(name, __kmp_registration_str, 0);
6383     // Check that the variable was actually written.
6384     value = __kmp_env_get(name);
6385     if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6386 
6387       done = 1; // Ok, environment variable set successfully, exit the loop.
6388 
6389     } else {
6390 
6391       // Oops. Write failed. Another copy of the OpenMP RTL is in memory.
6392       // Check whether it is alive or dead.
6393       int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6394       char *tail = value;
6395       char *flag_addr_str = NULL;
6396       char *flag_val_str = NULL;
6397       char const *file_name = NULL;
6398       __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6399       __kmp_str_split(tail, '-', &flag_val_str, &tail);
6400       file_name = tail;
6401       if (tail != NULL) {
6402         long *flag_addr = 0;
6403         long flag_val = 0;
6404         KMP_SSCANF(flag_addr_str, "%p", RCAST(void**, &flag_addr));
6405         KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6406         if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6407           // First, check whether environment-encoded address is mapped into
6408           // addr space.
6409           // If so, dereference it to see if it still has the right value.
6410           if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6411             neighbor = 1;
6412           } else {
6413             // If not, then we know the other copy of the library is no longer
6414             // running.
6415             neighbor = 2;
6416           }
6417         }
6418       }
6419       switch (neighbor) {
6420       case 0: // Cannot parse environment variable -- neighbor status unknown.
6421         // Assume it is an incompatible format from a future version of the
6422         // library. Assume the other library is alive.
6423         // WARN( ... ); // TODO: Issue a warning.
6424         file_name = "unknown library";
6425       // Attention! Falling through to the next case. That's intentional.
6426       case 1: { // Neighbor is alive.
6427         // Check whether having a duplicate library is allowed.
6428         char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6429         if (!__kmp_str_match_true(duplicate_ok)) {
6430           // That's not allowed. Issue fatal error.
6431           __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6432                       KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6433         }
6434         KMP_INTERNAL_FREE(duplicate_ok);
6435         __kmp_duplicate_library_ok = 1;
6436         done = 1; // Exit the loop.
6437       } break;
6438       case 2: { // Neighbor is dead.
6439         // Clear the variable and try to register library again.
6440         __kmp_env_unset(name);
6441       } break;
6442       default: { KMP_DEBUG_ASSERT(0); } break;
6443       }
6444     }
6445     KMP_INTERNAL_FREE((void *)value);
6446   }
6447   KMP_INTERNAL_FREE((void *)name);
6448 
6449 } // func __kmp_register_library_startup
6450 
6451 void __kmp_unregister_library(void) {
6452 
6453   char *name = __kmp_reg_status_name();
6454   char *value = __kmp_env_get(name);
6455 
6456   KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6457   KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6458   if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6459     // Ok, this is our variable. Delete it.
6460     __kmp_env_unset(name);
6461   }
6462 
6463   KMP_INTERNAL_FREE(__kmp_registration_str);
6464   KMP_INTERNAL_FREE(value);
6465   KMP_INTERNAL_FREE(name);
6466 
6467   __kmp_registration_flag = 0;
6468   __kmp_registration_str = NULL;
6469 
6470 } // __kmp_unregister_library
6471 
6472 // End of Library registration stuff.
6473 // -----------------------------------------------------------------------------
6474 
6475 #if KMP_MIC_SUPPORTED
6476 
6477 static void __kmp_check_mic_type() {
6478   kmp_cpuid_t cpuid_state = {0};
6479   kmp_cpuid_t *cs_p = &cpuid_state;
6480   __kmp_x86_cpuid(1, 0, cs_p);
6481   // We don't support mic1 at the moment
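  // CPUID leaf 1 returns stepping/model/family in EAX: bits 3:0 stepping,
  // bits 7:4 model, bits 11:8 family, bits 19:16 extended model. Family 0x0B
  // matches Knights Corner (mic2); family 6 with display model 0x57 (the
  // 0x50670 pattern) matches Knights Landing (mic3).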
6482   if ((cs_p->eax & 0xff0) == 0xB10) {
6483     __kmp_mic_type = mic2;
6484   } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6485     __kmp_mic_type = mic3;
6486   } else {
6487     __kmp_mic_type = non_mic;
6488   }
6489 }
6490 
6491 #endif /* KMP_MIC_SUPPORTED */
6492 
6493 static void __kmp_do_serial_initialize(void) {
6494   int i, gtid;
6495   int size;
6496 
6497   KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6498 
6499   KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6500   KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6501   KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6502   KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6503   KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6504 
6505 #if OMPT_SUPPORT
6506   ompt_pre_init();
6507 #endif
6508 
6509   __kmp_validate_locks();
6510 
6511   /* Initialize internal memory allocator */
6512   __kmp_init_allocator();
6513 
6514   /* Register the library startup via an environment variable and check to see
6515      whether another copy of the library is already registered. */
6516 
6517   __kmp_register_library_startup();
6518 
6519   /* TODO reinitialization of library */
6520   if (TCR_4(__kmp_global.g.g_done)) {
6521     KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6522   }
6523 
6524   __kmp_global.g.g_abort = 0;
6525   TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6526 
6527 /* initialize the locks */
6528 #if KMP_USE_ADAPTIVE_LOCKS
6529 #if KMP_DEBUG_ADAPTIVE_LOCKS
6530   __kmp_init_speculative_stats();
6531 #endif
6532 #endif
6533 #if KMP_STATS_ENABLED
6534   __kmp_stats_init();
6535 #endif
6536   __kmp_init_lock(&__kmp_global_lock);
6537   __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6538   __kmp_init_lock(&__kmp_debug_lock);
6539   __kmp_init_atomic_lock(&__kmp_atomic_lock);
6540   __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6541   __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6542   __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6543   __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6544   __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6545   __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6546   __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6547   __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6548   __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6549   __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6550   __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6551   __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6552   __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6553   __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6554 #if KMP_USE_MONITOR
6555   __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6556 #endif
6557   __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6558 
6559   /* conduct initialization and initial setup of configuration */
6560 
6561   __kmp_runtime_initialize();
6562 
6563 #if KMP_MIC_SUPPORTED
6564   __kmp_check_mic_type();
6565 #endif
6566 
6567 // Some global variable initialization moved here from kmp_env_initialize()
6568 #ifdef KMP_DEBUG
6569   kmp_diag = 0;
6570 #endif
6571   __kmp_abort_delay = 0;
6572 
6573   // From __kmp_init_dflt_team_nth()
6574   /* assume the entire machine will be used */
6575   __kmp_dflt_team_nth_ub = __kmp_xproc;
6576   if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6577     __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6578   }
6579   if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6580     __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6581   }
6582   __kmp_max_nth = __kmp_sys_max_nth;
6583   __kmp_cg_max_nth = __kmp_sys_max_nth;
6584   __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6585   if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6586     __kmp_teams_max_nth = __kmp_sys_max_nth;
6587   }
6588 
6589   // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6590   // part
6591   __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6592 #if KMP_USE_MONITOR
6593   __kmp_monitor_wakeups =
6594       KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6595   __kmp_bt_intervals =
6596       KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6597 #endif
6598   // From "KMP_LIBRARY" part of __kmp_env_initialize()
6599   __kmp_library = library_throughput;
6600   // From KMP_SCHEDULE initialization
6601   __kmp_static = kmp_sch_static_balanced;
6602 // AC: do not use analytical here, because it is non-monotonic
6603 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6604 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6605 // need to repeat assignment
6606 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6607 // bit control and barrier method control parts
6608 #if KMP_FAST_REDUCTION_BARRIER
6609 #define kmp_reduction_barrier_gather_bb ((int)1)
6610 #define kmp_reduction_barrier_release_bb ((int)1)
6611 #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6612 #define kmp_reduction_barrier_release_pat bp_hyper_bar
6613 #endif // KMP_FAST_REDUCTION_BARRIER
6614   for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6615     __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6616     __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6617     __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6618     __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6619 #if KMP_FAST_REDUCTION_BARRIER
6620     if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
6621       // lin_64 ): hyper,1
6622       __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6623       __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6624       __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6625       __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6626     }
6627 #endif // KMP_FAST_REDUCTION_BARRIER
6628   }
6629 #if KMP_FAST_REDUCTION_BARRIER
6630 #undef kmp_reduction_barrier_release_pat
6631 #undef kmp_reduction_barrier_gather_pat
6632 #undef kmp_reduction_barrier_release_bb
6633 #undef kmp_reduction_barrier_gather_bb
6634 #endif // KMP_FAST_REDUCTION_BARRIER
6635 #if KMP_MIC_SUPPORTED
6636   if (__kmp_mic_type == mic2) { // KNC
6637     // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6638     __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6639     __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6640         1; // forkjoin release
6641     __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6642     __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6643   }
6644 #if KMP_FAST_REDUCTION_BARRIER
6645   if (__kmp_mic_type == mic2) { // KNC
6646     __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6647     __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6648   }
6649 #endif // KMP_FAST_REDUCTION_BARRIER
6650 #endif // KMP_MIC_SUPPORTED
6651 
6652 // From KMP_CHECKS initialization
6653 #ifdef KMP_DEBUG
6654   __kmp_env_checks = TRUE; /* development versions have the extra checks */
6655 #else
6656   __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6657 #endif
6658 
6659   // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6660   __kmp_foreign_tp = TRUE;
6661 
6662   __kmp_global.g.g_dynamic = FALSE;
6663   __kmp_global.g.g_dynamic_mode = dynamic_default;
6664 
6665   __kmp_env_initialize(NULL);
6666 
6667 // Print all messages in message catalog for testing purposes.
6668 #ifdef KMP_DEBUG
6669   char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6670   if (__kmp_str_match_true(val)) {
6671     kmp_str_buf_t buffer;
6672     __kmp_str_buf_init(&buffer);
6673     __kmp_i18n_dump_catalog(&buffer);
6674     __kmp_printf("%s", buffer.str);
6675     __kmp_str_buf_free(&buffer);
6676   }
6677   __kmp_env_free(&val);
6678 #endif
6679 
6680   __kmp_threads_capacity =
6681       __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6682   // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6683   __kmp_tp_capacity = __kmp_default_tp_capacity(
6684       __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6685 
6686   // If the library is shut down properly, both pools must be NULL. Just in
6687   // case, set them to NULL -- some memory may leak, but subsequent code will
6688   // work even if pools are not freed.
6689   KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6690   KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6691   KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6692   __kmp_thread_pool = NULL;
6693   __kmp_thread_pool_insert_pt = NULL;
6694   __kmp_team_pool = NULL;
6695 
6696   /* Allocate all of the variable sized records */
6697   /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6698    * expandable */
6699   /* Since allocation is cache-aligned, just add extra padding at the end */
6700   size =
6701       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6702       CACHE_LINE;
6703   __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6704   __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6705                                sizeof(kmp_info_t *) * __kmp_threads_capacity);
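  // A single cache-aligned allocation backs both arrays: the first
  // __kmp_threads_capacity slots hold the kmp_info_t* entries (__kmp_threads)
  // and the next __kmp_threads_capacity slots hold the kmp_root_t* entries
  // (__kmp_root), with the extra CACHE_LINE bytes serving as padding.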
6706 
6707   /* init thread counts */
6708   KMP_DEBUG_ASSERT(__kmp_all_nth ==
6709                    0); // Asserts fail if the library is reinitializing and
6710   KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
6711   __kmp_all_nth = 0;
6712   __kmp_nth = 0;
6713 
6714   /* setup the uber master thread and hierarchy */
6715   gtid = __kmp_register_root(TRUE);
6716   KA_TRACE(10, ("__kmp_do_serial_initialize  T#%d\n", gtid));
6717   KMP_ASSERT(KMP_UBER_GTID(gtid));
6718   KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6719 
6720   KMP_MB(); /* Flush all pending memory write invalidates.  */
6721 
6722   __kmp_common_initialize();
6723 
6724 #if KMP_OS_UNIX
6725   /* invoke the child fork handler */
6726   __kmp_register_atfork();
6727 #endif
6728 
6729 #if !KMP_DYNAMIC_LIB
6730   {
6731     /* Invoke the exit handler when the program finishes, only for static
6732        library. For dynamic library, we already have _fini and DllMain. */
6733     int rc = atexit(__kmp_internal_end_atexit);
6734     if (rc != 0) {
6735       __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6736                   __kmp_msg_null);
6737     }
6738   }
6739 #endif
6740 
6741 #if KMP_HANDLE_SIGNALS
6742 #if KMP_OS_UNIX
6743   /* NOTE: make sure that this is called before the user installs their own
6744      signal handlers so that the user handlers are called first. This way they
6745      can return false, not call our handler, avoid terminating the library, and
6746      continue execution where they left off. */
6747   __kmp_install_signals(FALSE);
6748 #endif /* KMP_OS_UNIX */
6749 #if KMP_OS_WINDOWS
6750   __kmp_install_signals(TRUE);
6751 #endif /* KMP_OS_WINDOWS */
6752 #endif
6753 
6754   /* we have finished the serial initialization */
6755   __kmp_init_counter++;
6756 
6757   __kmp_init_serial = TRUE;
6758 
6759   if (__kmp_settings) {
6760     __kmp_env_print();
6761   }
6762 
6763 #if OMP_40_ENABLED
6764   if (__kmp_display_env || __kmp_display_env_verbose) {
6765     __kmp_env_print_2();
6766   }
6767 #endif // OMP_40_ENABLED
6768 
6769 #if OMPT_SUPPORT
6770   ompt_post_init();
6771 #endif
6772 
6773   KMP_MB();
6774 
6775   KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
6776 }
6777 
6778 void __kmp_serial_initialize(void) {
6779   if (__kmp_init_serial) {
6780     return;
6781   }
6782   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
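  // Double-checked initialization: re-test __kmp_init_serial under the lock,
  // since another thread may have completed serial initialization while this
  // one was waiting to acquire __kmp_initz_lock.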
6783   if (__kmp_init_serial) {
6784     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6785     return;
6786   }
6787   __kmp_do_serial_initialize();
6788   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6789 }
6790 
6791 static void __kmp_do_middle_initialize(void) {
6792   int i, j;
6793   int prev_dflt_team_nth;
6794 
6795   if (!__kmp_init_serial) {
6796     __kmp_do_serial_initialize();
6797   }
6798 
6799   KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
6800 
6801   // Save the previous value for the __kmp_dflt_team_nth so that
6802   // we can avoid some reinitialization if it hasn't changed.
6803   prev_dflt_team_nth = __kmp_dflt_team_nth;
6804 
6805 #if KMP_AFFINITY_SUPPORTED
6806   // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6807   // number of cores on the machine.
6808   __kmp_affinity_initialize();
6809 
6810   // Run through the __kmp_threads array and set the affinity mask
6811   // for each root thread that is currently registered with the RTL.
6812   for (i = 0; i < __kmp_threads_capacity; i++) {
6813     if (TCR_PTR(__kmp_threads[i]) != NULL) {
6814       __kmp_affinity_set_init_mask(i, TRUE);
6815     }
6816   }
6817 #endif /* KMP_AFFINITY_SUPPORTED */
6818 
6819   KMP_ASSERT(__kmp_xproc > 0);
6820   if (__kmp_avail_proc == 0) {
6821     __kmp_avail_proc = __kmp_xproc;
6822   }
6823 
6824   // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
6825   // correct them now
6826   j = 0;
6827   while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
6828     __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
6829         __kmp_avail_proc;
6830     j++;
6831   }
6832 
6833   if (__kmp_dflt_team_nth == 0) {
6834 #ifdef KMP_DFLT_NTH_CORES
6835     // Default #threads = #cores
6836     __kmp_dflt_team_nth = __kmp_ncores;
6837     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6838                   "__kmp_ncores (%d)\n",
6839                   __kmp_dflt_team_nth));
6840 #else
6841     // Default #threads = #available OS procs
6842     __kmp_dflt_team_nth = __kmp_avail_proc;
6843     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6844                   "__kmp_avail_proc(%d)\n",
6845                   __kmp_dflt_team_nth));
6846 #endif /* KMP_DFLT_NTH_CORES */
6847   }
6848 
6849   if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
6850     __kmp_dflt_team_nth = KMP_MIN_NTH;
6851   }
6852   if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
6853     __kmp_dflt_team_nth = __kmp_sys_max_nth;
6854   }
6855 
6856   // There's no harm in continuing if the following check fails,
6857   // but it indicates an error in the previous logic.
6858   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
6859 
6860   if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
6861     // Run through the __kmp_threads array and set the num threads icv for each
6862     // root thread that is currently registered with the RTL (which has not
6863     // already explicitly set its nthreads-var with a call to
6864     // omp_set_num_threads()).
6865     for (i = 0; i < __kmp_threads_capacity; i++) {
6866       kmp_info_t *thread = __kmp_threads[i];
6867       if (thread == NULL)
6868         continue;
6869       if (thread->th.th_current_task->td_icvs.nproc != 0)
6870         continue;
6871 
6872       set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
6873     }
6874   }
6875   KA_TRACE(
6876       20,
6877       ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
6878        __kmp_dflt_team_nth));
6879 
6880 #ifdef KMP_ADJUST_BLOCKTIME
6881   /* Adjust blocktime to zero if necessary  now that __kmp_avail_proc is set */
6882   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6883     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6884     if (__kmp_nth > __kmp_avail_proc) {
6885       __kmp_zero_bt = TRUE;
6886     }
6887   }
6888 #endif /* KMP_ADJUST_BLOCKTIME */
6889 
6890   /* we have finished middle initialization */
6891   TCW_SYNC_4(__kmp_init_middle, TRUE);
6892 
6893   KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
6894 }
6895 
6896 void __kmp_middle_initialize(void) {
6897   if (__kmp_init_middle) {
6898     return;
6899   }
6900   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6901   if (__kmp_init_middle) {
6902     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6903     return;
6904   }
6905   __kmp_do_middle_initialize();
6906   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6907 }
6908 
6909 void __kmp_parallel_initialize(void) {
6910   int gtid = __kmp_entry_gtid(); // this might be a new root
6911 
6912   /* synchronize parallel initialization (for sibling) */
6913   if (TCR_4(__kmp_init_parallel))
6914     return;
6915   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6916   if (TCR_4(__kmp_init_parallel)) {
6917     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6918     return;
6919   }
6920 
6921   /* TODO reinitialization after we have already shut down */
6922   if (TCR_4(__kmp_global.g.g_done)) {
6923     KA_TRACE(
6924         10,
6925         ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
6926     __kmp_infinite_loop();
6927   }
6928 
6929   /* jc: The lock __kmp_initz_lock is already held, so calling
6930      __kmp_serial_initialize would cause a deadlock.  So we call
6931      __kmp_do_serial_initialize directly. */
6932   if (!__kmp_init_middle) {
6933     __kmp_do_middle_initialize();
6934   }
6935 
6936 #if OMP_50_ENABLED
6937   __kmp_resume_if_hard_paused();
6938 #endif
6939 
6940   /* begin initialization */
6941   KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
6942   KMP_ASSERT(KMP_UBER_GTID(gtid));
6943 
6944 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6945   // Save the FP control regs.
6946   // Worker threads will set theirs to these values at thread startup.
6947   __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
6948   __kmp_store_mxcsr(&__kmp_init_mxcsr);
6949   __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
6950 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
6951 
6952 #if KMP_OS_UNIX
6953 #if KMP_HANDLE_SIGNALS
6954   /*  must be after __kmp_serial_initialize  */
6955   __kmp_install_signals(TRUE);
6956 #endif
6957 #endif
6958 
6959   __kmp_suspend_initialize();
6960 
6961 #if defined(USE_LOAD_BALANCE)
6962   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6963     __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
6964   }
6965 #else
6966   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6967     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
6968   }
6969 #endif
6970 
6971   if (__kmp_version) {
6972     __kmp_print_version_2();
6973   }
6974 
6975   /* we have finished parallel initialization */
6976   TCW_SYNC_4(__kmp_init_parallel, TRUE);
6977 
6978   KMP_MB();
6979   KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
6980 
6981   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6982 }
6983 
6984 /* ------------------------------------------------------------------------ */
6985 
6986 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
6987                                    kmp_team_t *team) {
6988   kmp_disp_t *dispatch;
6989 
6990   KMP_MB();
6991 
6992   /* none of the threads have encountered any constructs, yet. */
6993   this_thr->th.th_local.this_construct = 0;
6994 #if KMP_CACHE_MANAGE
6995   KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
6996 #endif /* KMP_CACHE_MANAGE */
6997   dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
6998   KMP_DEBUG_ASSERT(dispatch);
6999   KMP_DEBUG_ASSERT(team->t.t_dispatch);
7000   // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7001   // this_thr->th.th_info.ds.ds_tid ] );
7002 
7003   dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7004 #if OMP_45_ENABLED
7005   dispatch->th_doacross_buf_idx =
7006       0; /* reset the doacross dispatch buffer counter */
7007 #endif
7008   if (__kmp_env_consistency_check)
7009     __kmp_push_parallel(gtid, team->t.t_ident);
7010 
7011   KMP_MB(); /* Flush all pending memory write invalidates.  */
7012 }
7013 
7014 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7015                                   kmp_team_t *team) {
7016   if (__kmp_env_consistency_check)
7017     __kmp_pop_parallel(gtid, team->t.t_ident);
7018 
7019   __kmp_finish_implicit_task(this_thr);
7020 }
7021 
7022 int __kmp_invoke_task_func(int gtid) {
7023   int rc;
7024   int tid = __kmp_tid_from_gtid(gtid);
7025   kmp_info_t *this_thr = __kmp_threads[gtid];
7026   kmp_team_t *team = this_thr->th.th_team;
7027 
7028   __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7029 #if USE_ITT_BUILD
7030   if (__itt_stack_caller_create_ptr) {
7031     __kmp_itt_stack_callee_enter(
7032         (__itt_caller)
7033             team->t.t_stack_id); // inform ittnotify about entering user's code
7034   }
7035 #endif /* USE_ITT_BUILD */
7036 #if INCLUDE_SSC_MARKS
7037   SSC_MARK_INVOKING();
7038 #endif
7039 
7040 #if OMPT_SUPPORT
7041   void *dummy;
7042   void **exit_runtime_p;
7043   ompt_data_t *my_task_data;
7044   ompt_data_t *my_parallel_data;
7045   int ompt_team_size;
7046 
7047   if (ompt_enabled.enabled) {
7048     exit_runtime_p = &(
7049         team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_frame.ptr);
7050   } else {
7051     exit_runtime_p = &dummy;
7052   }
7053 
7054   my_task_data =
7055       &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7056   my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7057   if (ompt_enabled.ompt_callback_implicit_task) {
7058     ompt_team_size = team->t.t_nproc;
7059     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7060         ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7061         __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7062     OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7063   }
7064 #endif
7065 
7066   {
7067     KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
7068     KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
7069     rc =
7070         __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7071                                tid, (int)team->t.t_argc, (void **)team->t.t_argv
7072 #if OMPT_SUPPORT
7073                                ,
7074                                exit_runtime_p
7075 #endif
7076                                );
7077 #if OMPT_SUPPORT
7078     *exit_runtime_p = NULL;
7079 #endif
7080   }
7081 
7082 #if USE_ITT_BUILD
7083   if (__itt_stack_caller_create_ptr) {
7084     __kmp_itt_stack_callee_leave(
7085         (__itt_caller)
7086             team->t.t_stack_id); // inform ittnotify about leaving user's code
7087   }
7088 #endif /* USE_ITT_BUILD */
7089   __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7090 
7091   return rc;
7092 }
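/* For reference (illustrative, not part of the runtime): the microtask invoked
   above is the compiler-outlined body of the parallel region.  With clang the
   outlined entry typically has the shape sketched below; the name and the way
   captured variables are passed are compiler details, shown here only as an
   assumption:

     void omp_outlined(kmp_int32 *global_tid, kmp_int32 *bound_tid,
                       ... captured variables as trailing arguments ...);

   __kmp_invoke_microtask() forwards gtid, tid, and the t_argv array to that
   entry point and, when OMPT is enabled, records the exit frame pointer via
   exit_runtime_p as set up above. */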
7093 
7094 #if OMP_40_ENABLED
7095 void __kmp_teams_master(int gtid) {
7096   // This routine is called by all master threads in teams construct
7097   kmp_info_t *thr = __kmp_threads[gtid];
7098   kmp_team_t *team = thr->th.th_team;
7099   ident_t *loc = team->t.t_ident;
7100   thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7101   KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7102   KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7103   KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7104                 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
// Launch the league of teams now, but do not let the workers execute
// (they hang on the fork barrier until the next parallel region)
7107 #if INCLUDE_SSC_MARKS
7108   SSC_MARK_FORKING();
7109 #endif
7110   __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7111                   (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7112                   VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7113 #if INCLUDE_SSC_MARKS
7114   SSC_MARK_JOINING();
7115 #endif
7116 
  // AC: the last parameter "1" eliminates the join barrier, which would not
  // work here because the worker threads are still in the fork barrier,
  // waiting for more parallel regions
7119   __kmp_join_call(loc, gtid
7120 #if OMPT_SUPPORT
7121                   ,
7122                   fork_context_intel
7123 #endif
7124                   ,
7125                   1);
7126 }
7127 
7128 int __kmp_invoke_teams_master(int gtid) {
7129   kmp_info_t *this_thr = __kmp_threads[gtid];
7130   kmp_team_t *team = this_thr->th.th_team;
7131 #if KMP_DEBUG
7132   if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7133     KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7134                      (void *)__kmp_teams_master);
7135 #endif
7136   __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7137   __kmp_teams_master(gtid);
7138   __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7139   return 1;
7140 }
7141 #endif /* OMP_40_ENABLED */
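/* Illustrative usage (user code, not part of the runtime): in a teams
   construct each master thread of the league runs __kmp_teams_master(), which
   forks the user's teams-region body.  A sketch, assuming a compiler/runtime
   combination that supports host teams (OpenMP 5.0); do_work() is a
   hypothetical function:

     #pragma omp teams num_teams(2) thread_limit(4)
     {
       // executed once per team by its initial thread
       #pragma omp parallel
       do_work(omp_get_team_num(), omp_get_thread_num());
     }
*/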
7142 
/* This sets the requested number of threads for the next parallel region
   encountered by this team. Since this should be enclosed in the fork/join
   critical section, it should avoid race conditions with asymmetrical nested
   parallelism. */
7147 
7148 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7149   kmp_info_t *thr = __kmp_threads[gtid];
7150 
7151   if (num_threads > 0)
7152     thr->th.th_set_nproc = num_threads;
7153 }
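/* Illustrative usage (user code, not part of the runtime): a num_threads
   clause is typically lowered by the compiler to a __kmpc_push_num_threads()
   call that reaches this routine before the fork, e.g.

     #pragma omp parallel num_threads(4)   // requests th_set_nproc = 4
     do_work();                            // do_work() is hypothetical

   Non-positive requests are ignored above, leaving the nthreads-var ICV in
   effect. */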
7154 
7155 #if OMP_40_ENABLED
7156 
7157 /* this sets the requested number of teams for the teams region and/or
7158    the number of threads for the next parallel region encountered  */
7159 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7160                           int num_threads) {
7161   kmp_info_t *thr = __kmp_threads[gtid];
7162   KMP_DEBUG_ASSERT(num_teams >= 0);
7163   KMP_DEBUG_ASSERT(num_threads >= 0);
7164 
7165   if (num_teams == 0)
7166     num_teams = 1; // default number of teams is 1.
7167   if (num_teams > __kmp_teams_max_nth) { // if too many teams requested?
7168     if (!__kmp_reserve_warn) {
7169       __kmp_reserve_warn = 1;
7170       __kmp_msg(kmp_ms_warning,
7171                 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7172                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7173     }
7174     num_teams = __kmp_teams_max_nth;
7175   }
7176   // Set number of teams (number of threads in the outer "parallel" of the
7177   // teams)
7178   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7179 
7180   // Remember the number of threads for inner parallel regions
7181   if (num_threads == 0) {
7182     if (!TCR_4(__kmp_init_middle))
7183       __kmp_middle_initialize(); // get __kmp_avail_proc calculated
7184     num_threads = __kmp_avail_proc / num_teams;
7185     if (num_teams * num_threads > __kmp_teams_max_nth) {
7186       // adjust num_threads w/o warning as it is not user setting
7187       num_threads = __kmp_teams_max_nth / num_teams;
7188     }
7189   } else {
7190     if (num_teams * num_threads > __kmp_teams_max_nth) {
7191       int new_threads = __kmp_teams_max_nth / num_teams;
7192       if (!__kmp_reserve_warn) { // user asked for too many threads
7193         __kmp_reserve_warn = 1; // that conflicts with KMP_TEAMS_THREAD_LIMIT
7194         __kmp_msg(kmp_ms_warning,
7195                   KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7196                   KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7197       }
7198       num_threads = new_threads;
7199     }
7200   }
7201   thr->th.th_teams_size.nth = num_threads;
7202 }
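/* Worked example (illustrative; the numeric values are assumptions): with
   __kmp_avail_proc = 16 and __kmp_teams_max_nth = 16, a request of
   num_teams = 4, num_threads = 0 defaults to 16 / 4 = 4 threads per team.
   A request of num_teams = 4, num_threads = 8 would need 32 threads, so
   num_threads is reduced to 16 / 4 = 4 and the one-time CantFormThrTeam
   warning is issued. */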
7203 
7204 // Set the proc_bind var to use in the following parallel region.
7205 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7206   kmp_info_t *thr = __kmp_threads[gtid];
7207   thr->th.th_set_proc_bind = proc_bind;
7208 }
7209 
7210 #endif /* OMP_40_ENABLED */
7211 
7212 /* Launch the worker threads into the microtask. */
7213 
7214 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7215   kmp_info_t *this_thr = __kmp_threads[gtid];
7216 
7217 #ifdef KMP_DEBUG
7218   int f;
7219 #endif /* KMP_DEBUG */
7220 
7221   KMP_DEBUG_ASSERT(team);
7222   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7223   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7224   KMP_MB(); /* Flush all pending memory write invalidates.  */
7225 
7226   team->t.t_construct = 0; /* no single directives seen yet */
7227   team->t.t_ordered.dt.t_value =
7228       0; /* thread 0 enters the ordered section first */
7229 
7230   /* Reset the identifiers on the dispatch buffer */
7231   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7232   if (team->t.t_max_nproc > 1) {
7233     int i;
7234     for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7235       team->t.t_disp_buffer[i].buffer_index = i;
7236 #if OMP_45_ENABLED
7237       team->t.t_disp_buffer[i].doacross_buf_idx = i;
7238 #endif
7239     }
7240   } else {
7241     team->t.t_disp_buffer[0].buffer_index = 0;
7242 #if OMP_45_ENABLED
7243     team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7244 #endif
7245   }
7246 
7247   KMP_MB(); /* Flush all pending memory write invalidates.  */
7248   KMP_ASSERT(this_thr->th.th_team == team);
7249 
7250 #ifdef KMP_DEBUG
7251   for (f = 0; f < team->t.t_nproc; f++) {
7252     KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7253                      team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7254   }
7255 #endif /* KMP_DEBUG */
7256 
7257   /* release the worker threads so they may begin working */
7258   __kmp_fork_barrier(gtid, 0);
7259 }
7260 
7261 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7262   kmp_info_t *this_thr = __kmp_threads[gtid];
7263 
7264   KMP_DEBUG_ASSERT(team);
7265   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7266   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7267   KMP_MB(); /* Flush all pending memory write invalidates.  */
7268 
7269 /* Join barrier after fork */
7270 
7271 #ifdef KMP_DEBUG
7272   if (__kmp_threads[gtid] &&
7273       __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7274     __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7275                  __kmp_threads[gtid]);
7276     __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7277                  "team->t.t_nproc=%d\n",
7278                  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7279                  team->t.t_nproc);
7280     __kmp_print_structure();
7281   }
7282   KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7283                    __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7284 #endif /* KMP_DEBUG */
7285 
7286   __kmp_join_barrier(gtid); /* wait for everyone */
7287 #if OMPT_SUPPORT
7288   if (ompt_enabled.enabled &&
7289       this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
7290     int ds_tid = this_thr->th.th_info.ds.ds_tid;
7291     ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7292     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
7293 #if OMPT_OPTIONAL
7294     void *codeptr = NULL;
7295     if (KMP_MASTER_TID(ds_tid) &&
7296         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7297          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7298       codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7299 
7300     if (ompt_enabled.ompt_callback_sync_region_wait) {
7301       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7302           ompt_sync_region_barrier, ompt_scope_end, NULL, task_data, codeptr);
7303     }
7304     if (ompt_enabled.ompt_callback_sync_region) {
7305       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7306           ompt_sync_region_barrier, ompt_scope_end, NULL, task_data, codeptr);
7307     }
7308 #endif
7309     if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7310       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7311           ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7312     }
7313   }
7314 #endif
7315 
7316   KMP_MB(); /* Flush all pending memory write invalidates.  */
7317   KMP_ASSERT(this_thr->th.th_team == team);
7318 }
7319 
7320 /* ------------------------------------------------------------------------ */
7321 
7322 #ifdef USE_LOAD_BALANCE
7323 
// Return the number of worker threads actively spinning in the hot team, if
// we are at the outermost level of parallelism.  Otherwise, return 0.
7326 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7327   int i;
7328   int retval;
7329   kmp_team_t *hot_team;
7330 
7331   if (root->r.r_active) {
7332     return 0;
7333   }
7334   hot_team = root->r.r_hot_team;
7335   if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7336     return hot_team->t.t_nproc - 1; // Don't count master thread
7337   }
7338 
7339   // Skip the master thread - it is accounted for elsewhere.
7340   retval = 0;
7341   for (i = 1; i < hot_team->t.t_nproc; i++) {
7342     if (hot_team->t.t_threads[i]->th.th_active) {
7343       retval++;
7344     }
7345   }
7346   return retval;
7347 }
7348 
7349 // Perform an automatic adjustment to the number of
7350 // threads used by the next parallel region.
7351 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7352   int retval;
7353   int pool_active;
7354   int hot_team_active;
7355   int team_curr_active;
7356   int system_active;
7357 
7358   KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7359                 set_nproc));
7360   KMP_DEBUG_ASSERT(root);
7361   KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7362                        ->th.th_current_task->td_icvs.dynamic == TRUE);
7363   KMP_DEBUG_ASSERT(set_nproc > 1);
7364 
7365   if (set_nproc == 1) {
7366     KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7367     return 1;
7368   }
7369 
7370   // Threads that are active in the thread pool, active in the hot team for this
7371   // particular root (if we are at the outer par level), and the currently
7372   // executing thread (to become the master) are available to add to the new
7373   // team, but are currently contributing to the system load, and must be
7374   // accounted for.
7375   pool_active = __kmp_thread_pool_active_nth;
7376   hot_team_active = __kmp_active_hot_team_nproc(root);
7377   team_curr_active = pool_active + hot_team_active + 1;
7378 
7379   // Check the system load.
7380   system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7381   KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7382                 "hot team active = %d\n",
7383                 system_active, pool_active, hot_team_active));
7384 
7385   if (system_active < 0) {
7386     // There was an error reading the necessary info from /proc, so use the
7387     // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7388     // = dynamic_thread_limit, we shouldn't wind up getting back here.
7389     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7390     KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7391 
7392     // Make this call behave like the thread limit algorithm.
7393     retval = __kmp_avail_proc - __kmp_nth +
7394              (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7395     if (retval > set_nproc) {
7396       retval = set_nproc;
7397     }
7398     if (retval < KMP_MIN_NTH) {
7399       retval = KMP_MIN_NTH;
7400     }
7401 
7402     KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7403                   retval));
7404     return retval;
7405   }
7406 
  // There is a slight delay in the load balance algorithm in detecting newly
  // running procs. The real system load at this instant should be at least as
  // large as the number of active OMP threads available to add to the team.
7410   if (system_active < team_curr_active) {
7411     system_active = team_curr_active;
7412   }
7413   retval = __kmp_avail_proc - system_active + team_curr_active;
7414   if (retval > set_nproc) {
7415     retval = set_nproc;
7416   }
7417   if (retval < KMP_MIN_NTH) {
7418     retval = KMP_MIN_NTH;
7419   }
7420 
7421   KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7422   return retval;
7423 } // __kmp_load_balance_nproc()
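/* Worked example (illustrative; the numeric values are assumptions): with
   __kmp_avail_proc = 8, set_nproc = 8, 2 threads active in the pool and 1
   worker spinning in the hot team, team_curr_active = 2 + 1 + 1 = 4.  If
   __kmp_get_load_balance() reports 6 running threads system-wide, then
   retval = 8 - 6 + 4 = 6, which already lies in [KMP_MIN_NTH, set_nproc],
   so the next parallel region gets 6 threads instead of the requested 8. */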
7424 
7425 #endif /* USE_LOAD_BALANCE */
7426 
7427 /* ------------------------------------------------------------------------ */
7428 
7429 /* NOTE: this is called with the __kmp_init_lock held */
7430 void __kmp_cleanup(void) {
7431   int f;
7432 
7433   KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7434 
7435   if (TCR_4(__kmp_init_parallel)) {
7436 #if KMP_HANDLE_SIGNALS
7437     __kmp_remove_signals();
7438 #endif
7439     TCW_4(__kmp_init_parallel, FALSE);
7440   }
7441 
7442   if (TCR_4(__kmp_init_middle)) {
7443 #if KMP_AFFINITY_SUPPORTED
7444     __kmp_affinity_uninitialize();
7445 #endif /* KMP_AFFINITY_SUPPORTED */
7446     __kmp_cleanup_hierarchy();
7447     TCW_4(__kmp_init_middle, FALSE);
7448   }
7449 
7450   KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7451 
7452   if (__kmp_init_serial) {
7453     __kmp_runtime_destroy();
7454     __kmp_init_serial = FALSE;
7455   }
7456 
7457   __kmp_cleanup_threadprivate_caches();
7458 
7459   for (f = 0; f < __kmp_threads_capacity; f++) {
7460     if (__kmp_root[f] != NULL) {
7461       __kmp_free(__kmp_root[f]);
7462       __kmp_root[f] = NULL;
7463     }
7464   }
7465   __kmp_free(__kmp_threads);
  // __kmp_threads and __kmp_root were allocated as a single block, so there
  // is no need to free __kmp_root separately.
7468   __kmp_threads = NULL;
7469   __kmp_root = NULL;
7470   __kmp_threads_capacity = 0;
7471 
7472 #if KMP_USE_DYNAMIC_LOCK
7473   __kmp_cleanup_indirect_user_locks();
7474 #else
7475   __kmp_cleanup_user_locks();
7476 #endif
7477 
7478 #if KMP_AFFINITY_SUPPORTED
7479   KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7480   __kmp_cpuinfo_file = NULL;
7481 #endif /* KMP_AFFINITY_SUPPORTED */
7482 
7483 #if KMP_USE_ADAPTIVE_LOCKS
7484 #if KMP_DEBUG_ADAPTIVE_LOCKS
7485   __kmp_print_speculative_stats();
7486 #endif
7487 #endif
7488   KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7489   __kmp_nested_nth.nth = NULL;
7490   __kmp_nested_nth.size = 0;
7491   __kmp_nested_nth.used = 0;
7492   KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7493   __kmp_nested_proc_bind.bind_types = NULL;
7494   __kmp_nested_proc_bind.size = 0;
7495   __kmp_nested_proc_bind.used = 0;
7496 #if OMP_50_ENABLED
7497   if (__kmp_affinity_format) {
7498     KMP_INTERNAL_FREE(__kmp_affinity_format);
7499     __kmp_affinity_format = NULL;
7500   }
7501 #endif
7502 
7503   __kmp_i18n_catclose();
7504 
7505 #if KMP_USE_HIER_SCHED
7506   __kmp_hier_scheds.deallocate();
7507 #endif
7508 
7509 #if KMP_STATS_ENABLED
7510   __kmp_stats_fini();
7511 #endif
7512 
7513   KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7514 }
7515 
7516 /* ------------------------------------------------------------------------ */
7517 
7518 int __kmp_ignore_mppbeg(void) {
7519   char *env;
7520 
7521   if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7522     if (__kmp_str_match_false(env))
7523       return FALSE;
7524   }
  // By default, __kmpc_begin() is a no-op.
7526   return TRUE;
7527 }
7528 
7529 int __kmp_ignore_mppend(void) {
7530   char *env;
7531 
7532   if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7533     if (__kmp_str_match_false(env))
7534       return FALSE;
7535   }
  // By default, __kmpc_end() is a no-op.
7537   return TRUE;
7538 }
7539 
7540 void __kmp_internal_begin(void) {
7541   int gtid;
7542   kmp_root_t *root;
7543 
  /* This is a very important step, as it registers new sibling threads and
     assigns each new uber thread a gtid. */
7546   gtid = __kmp_entry_gtid();
7547   root = __kmp_threads[gtid]->th.th_root;
7548   KMP_ASSERT(KMP_UBER_GTID(gtid));
7549 
7550   if (root->r.r_begin)
7551     return;
7552   __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7553   if (root->r.r_begin) {
7554     __kmp_release_lock(&root->r.r_begin_lock, gtid);
7555     return;
7556   }
7557 
7558   root->r.r_begin = TRUE;
7559 
7560   __kmp_release_lock(&root->r.r_begin_lock, gtid);
7561 }
7562 
7563 /* ------------------------------------------------------------------------ */
7564 
7565 void __kmp_user_set_library(enum library_type arg) {
7566   int gtid;
7567   kmp_root_t *root;
7568   kmp_info_t *thread;
7569 
7570   /* first, make sure we are initialized so we can get our gtid */
7571 
7572   gtid = __kmp_entry_gtid();
7573   thread = __kmp_threads[gtid];
7574 
7575   root = thread->th.th_root;
7576 
7577   KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7578                 library_serial));
7579   if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7580                                   thread */
7581     KMP_WARNING(SetLibraryIncorrectCall);
7582     return;
7583   }
7584 
7585   switch (arg) {
7586   case library_serial:
7587     thread->th.th_set_nproc = 0;
7588     set__nproc(thread, 1);
7589     break;
7590   case library_turnaround:
7591     thread->th.th_set_nproc = 0;
7592     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7593                                            : __kmp_dflt_team_nth_ub);
7594     break;
7595   case library_throughput:
7596     thread->th.th_set_nproc = 0;
7597     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7598                                            : __kmp_dflt_team_nth_ub);
7599     break;
7600   default:
7601     KMP_FATAL(UnknownLibraryType, arg);
7602   }
7603 
7604   __kmp_aux_set_library(arg);
7605 }
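/* Illustrative usage (not part of the runtime): this routine sits behind the
   KMP_LIBRARY environment variable and the kmp_set_library* extension entry
   points; a sketch, assuming the kmp_* declarations shipped in this runtime's
   omp.h:

     #include <omp.h>
     ...
     kmp_set_library_throughput();   // same effect as KMP_LIBRARY=throughput

   or, from the shell:  KMP_LIBRARY=turnaround ./a.out
*/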
7606 
7607 void __kmp_aux_set_stacksize(size_t arg) {
7608   if (!__kmp_init_serial)
7609     __kmp_serial_initialize();
7610 
7611 #if KMP_OS_DARWIN
7612   if (arg & (0x1000 - 1)) {
7613     arg &= ~(0x1000 - 1);
7614     if (arg + 0x1000) /* check for overflow if we round up */
7615       arg += 0x1000;
7616   }
7617 #endif
7618   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7619 
7620   /* only change the default stacksize before the first parallel region */
7621   if (!TCR_4(__kmp_init_parallel)) {
7622     size_t value = arg; /* argument is in bytes */
7623 
7624     if (value < __kmp_sys_min_stksize)
7625       value = __kmp_sys_min_stksize;
7626     else if (value > KMP_MAX_STKSIZE)
7627       value = KMP_MAX_STKSIZE;
7628 
7629     __kmp_stksize = value;
7630 
7631     __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
7632   }
7633 
7634   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7635 }
7636 
7637 /* set the behaviour of the runtime library */
7638 /* TODO this can cause some odd behaviour with sibling parallelism... */
7639 void __kmp_aux_set_library(enum library_type arg) {
7640   __kmp_library = arg;
7641 
7642   switch (__kmp_library) {
7643   case library_serial: {
7644     KMP_INFORM(LibraryIsSerial);
7645     (void)__kmp_change_library(TRUE);
7646   } break;
7647   case library_turnaround:
7648     (void)__kmp_change_library(TRUE);
7649     break;
7650   case library_throughput:
7651     (void)__kmp_change_library(FALSE);
7652     break;
7653   default:
7654     KMP_FATAL(UnknownLibraryType, arg);
7655   }
7656 }
7657 
/* Get team information common to all teams-construct API routines. */
// Returns NULL if not in a teams construct
7660 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
7661   kmp_info_t *thr = __kmp_entry_thread();
7662   teams_serialized = 0;
7663   if (thr->th.th_teams_microtask) {
7664     kmp_team_t *team = thr->th.th_team;
7665     int tlevel = thr->th.th_teams_level; // the level of the teams construct
7666     int ii = team->t.t_level;
7667     teams_serialized = team->t.t_serialized;
7668     int level = tlevel + 1;
7669     KMP_DEBUG_ASSERT(ii >= tlevel);
7670     while (ii > level) {
7671       for (teams_serialized = team->t.t_serialized;
7672            (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
7673       }
7674       if (team->t.t_serialized && (!teams_serialized)) {
7675         team = team->t.t_parent;
7676         continue;
7677       }
7678       if (ii > level) {
7679         team = team->t.t_parent;
7680         ii--;
7681       }
7682     }
7683     return team;
7684   }
7685   return NULL;
7686 }
7687 
7688 int __kmp_aux_get_team_num() {
7689   int serialized;
7690   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7691   if (team) {
7692     if (serialized > 1) {
7693       return 0; // teams region is serialized ( 1 team of 1 thread ).
7694     } else {
7695       return team->t.t_master_tid;
7696     }
7697   }
7698   return 0;
7699 }
7700 
7701 int __kmp_aux_get_num_teams() {
7702   int serialized;
7703   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7704   if (team) {
7705     if (serialized > 1) {
7706       return 1;
7707     } else {
7708       return team->t.t_parent->t.t_nproc;
7709     }
7710   }
7711   return 1;
7712 }
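/* Illustrative usage (user code, not part of the runtime): these two helpers
   back omp_get_team_num() and omp_get_num_teams() via the entry-point layer
   (an assumption about the call path).  In a serialized teams region they
   report team 0 of 1, matching the fallbacks above.

     #pragma omp teams num_teams(3)
     printf("team %d of %d\n", omp_get_team_num(), omp_get_num_teams());
*/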
7713 
7714 /* ------------------------------------------------------------------------ */
7715 
7716 #if OMP_50_ENABLED
7717 /*
7718  * Affinity Format Parser
7719  *
7720  * Field is in form of: %[[[0].]size]type
7721  * % and type are required (%% means print a literal '%')
7722  * type is either single char or long name surrounded by {},
7723  * e.g., N or {num_threads}
7724  * 0 => leading zeros
7725  * . => right justified when size is specified
7726  * by default output is left justified
7727  * size is the *minimum* field length
7728  * All other characters are printed as is
7729  *
 * Available field types:
 * t {team_num}          - omp_get_team_num()
 * T {num_teams}         - omp_get_num_teams()
 * L {nesting_level}     - omp_get_level()
 * n {thread_num}        - omp_get_thread_num()
 * N {num_threads}       - omp_get_num_threads()
 * a {ancestor_tnum}     - omp_get_ancestor_thread_num(omp_get_level()-1)
 * H {host}              - name of host machine
 * P {process_id}        - process id (integer)
 * i {native_thread_id}  - native thread identifier (integer)
 * A {thread_affinity}   - comma separated list of integers or integer ranges
 *                         (values of affinity mask)
7740  *
7741  * Implementation-specific field types can be added
7742  * If a type is unknown, print "undefined"
7743 */
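/* Illustrative example (not part of the runtime): given the field types above,
   a format string such as

     "OMP: host=%H pid=%P thread=%0.4n affinity=%{thread_affinity}"

   might expand to

     "OMP: host=node42 pid=12345 thread=0003 affinity=0-3"

   where the host name, pid, and affinity mask values are invented for
   illustration. */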
7744 
7745 // Structure holding the short name, long name, and corresponding data type
7746 // for snprintf.  A table of these will represent the entire valid keyword
7747 // field types.
7748 typedef struct kmp_affinity_format_field_t {
  char short_name; // from the spec, e.g., L -> nesting level
  const char *long_name; // from the spec, e.g., nesting_level -> nesting level
7751   char field_format; // data type for snprintf (typically 'd' or 's'
7752   // for integer or string)
7753 } kmp_affinity_format_field_t;
7754 
7755 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
7756 #if KMP_AFFINITY_SUPPORTED
7757     {'A', "thread_affinity", 's'},
7758 #endif
7759     {'t', "team_num", 'd'},
7760     {'T', "num_teams", 'd'},
7761     {'L', "nesting_level", 'd'},
7762     {'n', "thread_num", 'd'},
7763     {'N', "num_threads", 'd'},
7764     {'a', "ancestor_tnum", 'd'},
7765     {'H', "host", 's'},
7766     {'P', "process_id", 'd'},
7767     {'i', "native_thread_id", 'd'}};
7768 
// Return the number of characters printed for the field
7770 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
7771                                             const char **ptr,
7772                                             kmp_str_buf_t *field_buffer) {
7773   int rc, format_index, field_value;
7774   const char *width_left, *width_right;
7775   bool pad_zeros, right_justify, parse_long_name, found_valid_name;
7776   static const int FORMAT_SIZE = 20;
7777   char format[FORMAT_SIZE] = {0};
7778   char absolute_short_name = 0;
7779 
7780   KMP_DEBUG_ASSERT(gtid >= 0);
7781   KMP_DEBUG_ASSERT(th);
7782   KMP_DEBUG_ASSERT(**ptr == '%');
7783   KMP_DEBUG_ASSERT(field_buffer);
7784 
7785   __kmp_str_buf_clear(field_buffer);
7786 
7787   // Skip the initial %
7788   (*ptr)++;
7789 
7790   // Check for %% first
7791   if (**ptr == '%') {
7792     __kmp_str_buf_cat(field_buffer, "%", 1);
7793     (*ptr)++; // skip over the second %
7794     return 1;
7795   }
7796 
7797   // Parse field modifiers if they are present
7798   pad_zeros = false;
7799   if (**ptr == '0') {
7800     pad_zeros = true;
7801     (*ptr)++; // skip over 0
7802   }
7803   right_justify = false;
7804   if (**ptr == '.') {
7805     right_justify = true;
7806     (*ptr)++; // skip over .
7807   }
7808   // Parse width of field: [width_left, width_right)
7809   width_left = width_right = NULL;
7810   if (**ptr >= '0' && **ptr <= '9') {
7811     width_left = *ptr;
7812     SKIP_DIGITS(*ptr);
7813     width_right = *ptr;
7814   }
7815 
7816   // Create the format for KMP_SNPRINTF based on flags parsed above
7817   format_index = 0;
7818   format[format_index++] = '%';
7819   if (!right_justify)
7820     format[format_index++] = '-';
7821   if (pad_zeros)
7822     format[format_index++] = '0';
7823   if (width_left && width_right) {
7824     int i = 0;
    // Only allow widths of up to 8 digits.
    // This also prevents overflowing the format buffer.
7827     while (i < 8 && width_left < width_right) {
7828       format[format_index++] = *width_left;
7829       width_left++;
7830       i++;
7831     }
7832   }
7833 
7834   // Parse a name (long or short)
7835   // Canonicalize the name into absolute_short_name
7836   found_valid_name = false;
7837   parse_long_name = (**ptr == '{');
7838   if (parse_long_name)
7839     (*ptr)++; // skip initial left brace
7840   for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
7841                              sizeof(__kmp_affinity_format_table[0]);
7842        ++i) {
7843     char short_name = __kmp_affinity_format_table[i].short_name;
7844     const char *long_name = __kmp_affinity_format_table[i].long_name;
7845     char field_format = __kmp_affinity_format_table[i].field_format;
7846     if (parse_long_name) {
7847       int length = KMP_STRLEN(long_name);
7848       if (strncmp(*ptr, long_name, length) == 0) {
7849         found_valid_name = true;
7850         (*ptr) += length; // skip the long name
7851       }
7852     } else if (**ptr == short_name) {
7853       found_valid_name = true;
7854       (*ptr)++; // skip the short name
7855     }
7856     if (found_valid_name) {
7857       format[format_index++] = field_format;
7858       format[format_index++] = '\0';
7859       absolute_short_name = short_name;
7860       break;
7861     }
7862   }
7863   if (parse_long_name) {
7864     if (**ptr != '}') {
7865       absolute_short_name = 0;
7866     } else {
7867       (*ptr)++; // skip over the right brace
7868     }
7869   }
7870 
7871   // Attempt to fill the buffer with the requested
7872   // value using snprintf within __kmp_str_buf_print()
7873   switch (absolute_short_name) {
7874   case 't':
7875     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
7876     break;
7877   case 'T':
7878     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
7879     break;
7880   case 'L':
7881     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
7882     break;
7883   case 'n':
7884     rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
7885     break;
7886   case 'H': {
7887     static const int BUFFER_SIZE = 256;
7888     char buf[BUFFER_SIZE];
7889     __kmp_expand_host_name(buf, BUFFER_SIZE);
7890     rc = __kmp_str_buf_print(field_buffer, format, buf);
7891   } break;
7892   case 'P':
7893     rc = __kmp_str_buf_print(field_buffer, format, getpid());
7894     break;
7895   case 'i':
7896     rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
7897     break;
7898   case 'N':
7899     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
7900     break;
7901   case 'a':
7902     field_value =
7903         __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
7904     rc = __kmp_str_buf_print(field_buffer, format, field_value);
7905     break;
7906 #if KMP_AFFINITY_SUPPORTED
7907   case 'A': {
7908     kmp_str_buf_t buf;
7909     __kmp_str_buf_init(&buf);
7910     __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
7911     rc = __kmp_str_buf_print(field_buffer, format, buf.str);
7912     __kmp_str_buf_free(&buf);
7913   } break;
7914 #endif
7915   default:
    // According to the spec, if an implementation does not have info for a
    // field type, then "undefined" is printed
7918     rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
7919     // Skip the field
7920     if (parse_long_name) {
7921       SKIP_TOKEN(*ptr);
7922       if (**ptr == '}')
7923         (*ptr)++;
7924     } else {
7925       (*ptr)++;
7926     }
7927   }
7928 
7929   KMP_ASSERT(format_index <= FORMAT_SIZE);
7930   return rc;
7931 }
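/* Worked example (illustrative): for the field "%0.5n" the parser consumes
   '0' (pad_zeros), '.' (right_justify), and the width "5", matches the short
   name 'n' (data type 'd'), and builds the snprintf format "%05d"; thread
   number 3 prints as "00003".  For "%5n" neither flag is set, so the format
   becomes "%-5d" and the value is left-justified: "3    ". */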
7932 
7933 /*
 * Return the number of characters needed to hold the affinity string
 * (not including the terminating null byte).
 * The resulting string is printed to buffer, which the caller can then
 * handle afterwards.
7938 */
7939 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
7940                                   kmp_str_buf_t *buffer) {
7941   const char *parse_ptr;
7942   size_t retval;
7943   const kmp_info_t *th;
7944   kmp_str_buf_t field;
7945 
7946   KMP_DEBUG_ASSERT(buffer);
7947   KMP_DEBUG_ASSERT(gtid >= 0);
7948 
7949   __kmp_str_buf_init(&field);
7950   __kmp_str_buf_clear(buffer);
7951 
7952   th = __kmp_threads[gtid];
7953   retval = 0;
7954 
7955   // If format is NULL or zero-length string, then we use
7956   // affinity-format-var ICV
7957   parse_ptr = format;
7958   if (parse_ptr == NULL || *parse_ptr == '\0') {
7959     parse_ptr = __kmp_affinity_format;
7960   }
7961   KMP_DEBUG_ASSERT(parse_ptr);
7962 
7963   while (*parse_ptr != '\0') {
7964     // Parse a field
7965     if (*parse_ptr == '%') {
7966       // Put field in the buffer
7967       int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
7968       __kmp_str_buf_catbuf(buffer, &field);
7969       retval += rc;
7970     } else {
7971       // Put literal character in buffer
7972       __kmp_str_buf_cat(buffer, parse_ptr, 1);
7973       retval++;
7974       parse_ptr++;
7975     }
7976   }
7977   __kmp_str_buf_free(&field);
7978   return retval;
7979 }
7980 
7981 // Displays the affinity string to stdout
7982 void __kmp_aux_display_affinity(int gtid, const char *format) {
7983   kmp_str_buf_t buf;
7984   __kmp_str_buf_init(&buf);
7985   __kmp_aux_capture_affinity(gtid, format, &buf);
7986   __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
7987   __kmp_str_buf_free(&buf);
7988 }
7989 #endif // OMP_50_ENABLED
7990 
7991 /* ------------------------------------------------------------------------ */
7992 
7993 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
7994   int blocktime = arg; /* argument is in milliseconds */
7995 #if KMP_USE_MONITOR
7996   int bt_intervals;
7997 #endif
7998   int bt_set;
7999 
8000   __kmp_save_internal_controls(thread);
8001 
8002   /* Normalize and set blocktime for the teams */
8003   if (blocktime < KMP_MIN_BLOCKTIME)
8004     blocktime = KMP_MIN_BLOCKTIME;
8005   else if (blocktime > KMP_MAX_BLOCKTIME)
8006     blocktime = KMP_MAX_BLOCKTIME;
8007 
8008   set__blocktime_team(thread->th.th_team, tid, blocktime);
8009   set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8010 
8011 #if KMP_USE_MONITOR
8012   /* Calculate and set blocktime intervals for the teams */
8013   bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8014 
8015   set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8016   set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8017 #endif
8018 
8019   /* Set whether blocktime has been set to "TRUE" */
8020   bt_set = TRUE;
8021 
8022   set__bt_set_team(thread->th.th_team, tid, bt_set);
8023   set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8024 #if KMP_USE_MONITOR
8025   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8026                 "bt_intervals=%d, monitor_updates=%d\n",
8027                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8028                 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8029                 __kmp_monitor_wakeups));
8030 #else
8031   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8032                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8033                 thread->th.th_team->t.t_id, tid, blocktime));
8034 #endif
8035 }
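/* Illustrative usage (not part of the runtime): this routine implements the
   per-thread blocktime setting behind the kmp_set_blocktime() extension; a
   sketch, assuming the kmp_set_blocktime declaration shipped in this
   runtime's omp.h:

     #include <omp.h>
     ...
     kmp_set_blocktime(0);   // idle workers sleep immediately after a region

   The equivalent environment setting is KMP_BLOCKTIME=0; values are clamped
   to [KMP_MIN_BLOCKTIME, KMP_MAX_BLOCKTIME] above. */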
8036 
8037 void __kmp_aux_set_defaults(char const *str, int len) {
8038   if (!__kmp_init_serial) {
8039     __kmp_serial_initialize();
8040   }
8041   __kmp_env_initialize(str);
8042 
8043   if (__kmp_settings
8044 #if OMP_40_ENABLED
8045       || __kmp_display_env || __kmp_display_env_verbose
8046 #endif // OMP_40_ENABLED
8047       ) {
8048     __kmp_env_print();
8049   }
8050 } // __kmp_aux_set_defaults
8051 
8052 /* ------------------------------------------------------------------------ */
8053 /* internal fast reduction routines */
8054 
8055 PACKED_REDUCTION_METHOD_T
8056 __kmp_determine_reduction_method(
8057     ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8058     void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8059     kmp_critical_name *lck) {
8060 
  // Default reduction method: critical construct (lck != NULL, as in current
  // PAROPT).
  // If (reduce_data != NULL && reduce_func != NULL): the tree-reduction method
  // can be selected by the RTL.
  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
  // can be selected by the RTL.
  // Finally, it is up to the OpenMP RTL to decide which of the methods
  // generated by PAROPT to select.
8069 
8070   PACKED_REDUCTION_METHOD_T retval;
8071 
8072   int team_size;
8073 
8074   KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
8075   KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8076 
8077 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
8078   ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
8079 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8080 
8081   retval = critical_reduce_block;
8082 
  // another way of getting the team size (one dynamic dereference) is slower
8084   team_size = __kmp_get_team_num_threads(global_tid);
8085   if (team_size == 1) {
8086 
8087     retval = empty_reduce_block;
8088 
8089   } else {
8090 
8091     int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8092 
8093 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64
8094 
8095 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
8096     KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8097 
8098     int teamsize_cutoff = 4;
8099 
8100 #if KMP_MIC_SUPPORTED
8101     if (__kmp_mic_type != non_mic) {
8102       teamsize_cutoff = 8;
8103     }
8104 #endif
8105     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8106     if (tree_available) {
8107       if (team_size <= teamsize_cutoff) {
8108         if (atomic_available) {
8109           retval = atomic_reduce_block;
8110         }
8111       } else {
8112         retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8113       }
8114     } else if (atomic_available) {
8115       retval = atomic_reduce_block;
8116     }
8117 #else
8118 #error "Unknown or unsupported OS"
8119 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8120        // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8121 
8122 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8123 
8124 #if KMP_OS_LINUX || KMP_OS_WINDOWS || KMP_OS_HURD
8125 
8126     // basic tuning
8127 
8128     if (atomic_available) {
8129       if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8130         retval = atomic_reduce_block;
8131       }
8132     } // otherwise: use critical section
8133 
8134 #elif KMP_OS_DARWIN
8135 
8136     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8137     if (atomic_available && (num_vars <= 3)) {
8138       retval = atomic_reduce_block;
8139     } else if (tree_available) {
8140       if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8141           (reduce_size < (2000 * sizeof(kmp_real64)))) {
8142         retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8143       }
8144     } // otherwise: use critical section
8145 
8146 #else
8147 #error "Unknown or unsupported OS"
8148 #endif
8149 
8150 #else
8151 #error "Unknown or unsupported architecture"
8152 #endif
8153   }
8154 
8155   // KMP_FORCE_REDUCTION
8156 
8157   // If the team is serialized (team_size == 1), ignore the forced reduction
8158   // method and stay with the unsynchronized method (empty_reduce_block)
8159   if (__kmp_force_reduction_method != reduction_method_not_defined &&
8160       team_size != 1) {
8161 
8162     PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8163 
8164     int atomic_available, tree_available;
8165 
8166     switch ((forced_retval = __kmp_force_reduction_method)) {
8167     case critical_reduce_block:
8168       KMP_ASSERT(lck); // lck should be != 0
8169       break;
8170 
8171     case atomic_reduce_block:
8172       atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8173       if (!atomic_available) {
8174         KMP_WARNING(RedMethodNotSupported, "atomic");
8175         forced_retval = critical_reduce_block;
8176       }
8177       break;
8178 
8179     case tree_reduce_block:
8180       tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8181       if (!tree_available) {
8182         KMP_WARNING(RedMethodNotSupported, "tree");
8183         forced_retval = critical_reduce_block;
8184       } else {
8185 #if KMP_FAST_REDUCTION_BARRIER
8186         forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8187 #endif
8188       }
8189       break;
8190 
8191     default:
8192       KMP_ASSERT(0); // "unsupported method specified"
8193     }
8194 
8195     retval = forced_retval;
8196   }
8197 
8198   KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8199 
8200 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
8201 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8202 
8203   return (retval);
8204 }
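/* Worked example (illustrative, for KMP_ARCH_X86_64 && KMP_OS_LINUX, no MIC,
   and no KMP_FORCE_REDUCTION override):
     team_size == 1                             -> empty_reduce_block
     team_size == 4, atomic and tree available  -> atomic_reduce_block
                                                   (4 <= teamsize_cutoff of 4)
     team_size == 16, tree available            -> TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER
   Here "atomic available" means loc->flags has KMP_IDENT_ATOMIC_REDUCE set,
   and "tree available" means the compiler passed non-NULL reduce_data and
   reduce_func. */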
8205 
8206 // this function is for testing set/get/determine reduce method
8207 kmp_int32 __kmp_get_reduce_method(void) {
8208   return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8209 }
8210 
8211 #if OMP_50_ENABLED
8212 
8213 // Soft pause sets up threads to ignore blocktime and just go to sleep.
8214 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
8215 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8216 
8217 // Hard pause shuts down the runtime completely.  Resume happens naturally when
8218 // OpenMP is used subsequently.
8219 void __kmp_hard_pause() {
8220   __kmp_pause_status = kmp_hard_paused;
8221   __kmp_internal_end_thread(-1);
8222 }
8223 
8224 // Soft resume sets __kmp_pause_status, and wakes up all threads.
8225 void __kmp_resume_if_soft_paused() {
8226   if (__kmp_pause_status == kmp_soft_paused) {
8227     __kmp_pause_status = kmp_not_paused;
8228 
8229     for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8230       kmp_info_t *thread = __kmp_threads[gtid];
8231       if (thread) { // Wake it if sleeping
8232         kmp_flag_64 fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
8233         if (fl.is_sleeping())
8234           fl.resume(gtid);
8235         else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8236           __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8237         } else { // thread holds the lock and may sleep soon
8238           do { // until either the thread sleeps, or we can get the lock
8239             if (fl.is_sleeping()) {
8240               fl.resume(gtid);
8241               break;
8242             } else if (__kmp_try_suspend_mx(thread)) {
8243               __kmp_unlock_suspend_mx(thread);
8244               break;
8245             }
8246           } while (1);
8247         }
8248       }
8249     }
8250   }
8251 }
8252 
8253 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
8254 // TODO: add warning messages
8255 int __kmp_pause_resource(kmp_pause_status_t level) {
8256   if (level == kmp_not_paused) { // requesting resume
8257     if (__kmp_pause_status == kmp_not_paused) {
8258       // error message about runtime not being paused, so can't resume
8259       return 1;
8260     } else {
8261       KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
8262                        __kmp_pause_status == kmp_hard_paused);
8263       __kmp_pause_status = kmp_not_paused;
8264       return 0;
8265     }
8266   } else if (level == kmp_soft_paused) { // requesting soft pause
8267     if (__kmp_pause_status != kmp_not_paused) {
8268       // error message about already being paused
8269       return 1;
8270     } else {
8271       __kmp_soft_pause();
8272       return 0;
8273     }
8274   } else if (level == kmp_hard_paused) { // requesting hard pause
8275     if (__kmp_pause_status != kmp_not_paused) {
8276       // error message about already being paused
8277       return 1;
8278     } else {
8279       __kmp_hard_pause();
8280       return 0;
8281     }
8282   } else {
8283     // error message about invalid level
8284     return 1;
8285   }
8286 }
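/* Illustrative usage (user code, not part of the runtime): __kmp_pause_resource
   is reached via __kmpc_pause_resource, which backs the OpenMP 5.0 pause API.
   A sketch, assuming <omp.h> declares omp_pause_resource_all() and the
   omp_pause_soft kind; do_phase_one()/do_phase_two() are hypothetical:

     #pragma omp parallel
     do_phase_one();
     if (omp_pause_resource_all(omp_pause_soft) == 0) {
       // long serial phase: workers sleep regardless of blocktime
     }
     #pragma omp parallel
     do_phase_two();   // the runtime resumes when OpenMP is used again
*/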
8287 
8288 #endif // OMP_50_ENABLED
8289