1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #if KMP_USE_HIER_SCHED
28 #include "kmp_dispatch_hier.h"
29 #endif
30 
31 #if OMPT_SUPPORT
32 #include "ompt-specific.h"
33 #endif
34 
35 /* these are temporary issues to be dealt with */
36 #define KMP_USE_PRCTL 0
37 
38 #if KMP_OS_WINDOWS
39 #include <process.h>
40 #endif
41 
42 #include "tsan_annotations.h"
43 
44 #if KMP_OS_WINDOWS
// Windows does not need these include files because it does not use shared
// memory.
46 #else
47 #include <sys/mman.h>
48 #include <sys/stat.h>
49 #include <fcntl.h>
50 #define SHM_SIZE 1024
51 #endif
52 
53 #if defined(KMP_GOMP_COMPAT)
54 char const __kmp_version_alt_comp[] =
55     KMP_VERSION_PREFIX "alternative compiler support: yes";
56 #endif /* defined(KMP_GOMP_COMPAT) */
57 
58 char const __kmp_version_omp_api[] =
59     KMP_VERSION_PREFIX "API version: 5.0 (201611)";
60 
61 #ifdef KMP_DEBUG
62 char const __kmp_version_lock[] =
63     KMP_VERSION_PREFIX "lock type: run time selectable";
64 #endif /* KMP_DEBUG */
65 
66 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
67 
68 /* ------------------------------------------------------------------------ */
69 
70 #if KMP_USE_MONITOR
71 kmp_info_t __kmp_monitor;
72 #endif
73 
74 /* Forward declarations */
75 
76 void __kmp_cleanup(void);
77 
78 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
79                                   int gtid);
80 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
81                                   kmp_internal_control_t *new_icvs,
82                                   ident_t *loc);
83 #if KMP_AFFINITY_SUPPORTED
84 static void __kmp_partition_places(kmp_team_t *team,
85                                    int update_master_only = 0);
86 #endif
87 static void __kmp_do_serial_initialize(void);
88 void __kmp_fork_barrier(int gtid, int tid);
89 void __kmp_join_barrier(int gtid);
90 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
91                           kmp_internal_control_t *new_icvs, ident_t *loc);
92 
93 #ifdef USE_LOAD_BALANCE
94 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
95 #endif
96 
97 static int __kmp_expand_threads(int nNeed);
98 #if KMP_OS_WINDOWS
99 static int __kmp_unregister_root_other_thread(int gtid);
100 #endif
101 static void __kmp_unregister_library(void); // called by __kmp_internal_end()
102 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
103 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
104 
105 /* Calculate the identifier of the current thread */
/* Fast (and somewhat portable) way to get a unique identifier for the
   executing thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
108 int __kmp_get_global_thread_id() {
109   int i;
110   kmp_info_t **other_threads;
111   size_t stack_data;
112   char *stack_addr;
113   size_t stack_size;
114   char *stack_base;
115 
116   KA_TRACE(
117       1000,
118       ("*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
119        __kmp_nth, __kmp_all_nth));
120 
  /* JPH - To handle the case where __kmpc_end(0) is called immediately prior
     to a parallel region, this returns KMP_GTID_DNE to force the caller to run
     serial_initialize. Callers had to handle KMP_GTID_DNE at all call sites, or
     else guarantee __kmp_init_gtid, for this to work. */
125 
126   if (!TCR_4(__kmp_init_gtid))
127     return KMP_GTID_DNE;
128 
129 #ifdef KMP_TDATA_GTID
130   if (TCR_4(__kmp_gtid_mode) >= 3) {
131     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
132     return __kmp_gtid;
133   }
134 #endif
135   if (TCR_4(__kmp_gtid_mode) >= 2) {
136     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
137     return __kmp_gtid_get_specific();
138   }
139   KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
140 
141   stack_addr = (char *)&stack_data;
142   other_threads = __kmp_threads;
143 
144   /* ATT: The code below is a source of potential bugs due to unsynchronized
145      access to __kmp_threads array. For example:
146      1. Current thread loads other_threads[i] to thr and checks it, it is
147         non-NULL.
148      2. Current thread is suspended by OS.
149      3. Another thread unregisters and finishes (debug versions of free()
150         may fill memory with something like 0xEF).
151      4. Current thread is resumed.
152      5. Current thread reads junk from *thr.
153      TODO: Fix it.  --ln  */
154 
155   for (i = 0; i < __kmp_threads_capacity; i++) {
156 
157     kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
158     if (!thr)
159       continue;
160 
161     stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
162     stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
163 
164     /* stack grows down -- search through all of the active threads */
165 
166     if (stack_addr <= stack_base) {
167       size_t stack_diff = stack_base - stack_addr;
168 
169       if (stack_diff <= stack_size) {
170         /* The only way we can be closer than the allocated */
171         /* stack size is if we are running on this thread. */
172         KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
173         return i;
174       }
175     }
176   }
177 
  /* use the thread-specific (TLS) value to try to determine our gtid */
179   KA_TRACE(1000,
180            ("*** __kmp_get_global_thread_id: internal alg. failed to find "
181             "thread, using TLS\n"));
182   i = __kmp_gtid_get_specific();
183 
184   /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */
185 
  /* if we haven't been assigned a gtid, return the error code */
187   if (i < 0)
188     return i;
189 
190   /* dynamically updated stack window for uber threads to avoid get_specific
191      call */
192   if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
193     KMP_FATAL(StackOverflow, i);
194   }
195 
196   stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
197   if (stack_addr > stack_base) {
198     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
199     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
200             other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
201                 stack_base);
202   } else {
203     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
204             stack_base - stack_addr);
205   }
206 
207   /* Reprint stack bounds for ubermaster since they have been refined */
208   if (__kmp_storage_map) {
209     char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
210     char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
211     __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
212                                  other_threads[i]->th.th_info.ds.ds_stacksize,
213                                  "th_%d stack (refinement)", i);
214   }
215   return i;
216 }
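
/* A minimal, self-contained sketch of the containment test used by the
   internal algorithm above: because stacks grow down, a probe address belongs
   to a thread iff it lies within [stack_base - stack_size, stack_base]. The
   helper name below is illustrative only and is not part of the runtime. */
#if 0 // illustrative sketch, not compiled into the runtime
#include <stddef.h>
static int probe_in_stack(const char *probe_addr, const char *stack_base,
                          size_t stack_size) {
  if (probe_addr > stack_base)
    return 0; // above the top of this thread's stack
  size_t stack_diff = (size_t)(stack_base - probe_addr);
  return stack_diff <= stack_size; // within the allocated stack window
}
#endif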
217 
218 int __kmp_get_global_thread_id_reg() {
219   int gtid;
220 
221   if (!__kmp_init_serial) {
222     gtid = KMP_GTID_DNE;
223   } else
224 #ifdef KMP_TDATA_GTID
225       if (TCR_4(__kmp_gtid_mode) >= 3) {
226     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
227     gtid = __kmp_gtid;
228   } else
229 #endif
230       if (TCR_4(__kmp_gtid_mode) >= 2) {
231     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
232     gtid = __kmp_gtid_get_specific();
233   } else {
234     KA_TRACE(1000,
235              ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
236     gtid = __kmp_get_global_thread_id();
237   }
238 
239   /* we must be a new uber master sibling thread */
240   if (gtid == KMP_GTID_DNE) {
241     KA_TRACE(10,
242              ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
243               "Registering a new gtid.\n"));
244     __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
245     if (!__kmp_init_serial) {
246       __kmp_do_serial_initialize();
247       gtid = __kmp_gtid_get_specific();
248     } else {
249       gtid = __kmp_register_root(FALSE);
250     }
251     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
252     /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
253   }
254 
255   KMP_DEBUG_ASSERT(gtid >= 0);
256 
257   return gtid;
258 }
259 
260 /* caller must hold forkjoin_lock */
261 void __kmp_check_stack_overlap(kmp_info_t *th) {
262   int f;
263   char *stack_beg = NULL;
264   char *stack_end = NULL;
265   int gtid;
266 
267   KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
268   if (__kmp_storage_map) {
269     stack_end = (char *)th->th.th_info.ds.ds_stackbase;
270     stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
271 
272     gtid = __kmp_gtid_from_thread(th);
273 
274     if (gtid == KMP_GTID_MONITOR) {
275       __kmp_print_storage_map_gtid(
276           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
277           "th_%s stack (%s)", "mon",
278           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
279     } else {
280       __kmp_print_storage_map_gtid(
281           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
282           "th_%d stack (%s)", gtid,
283           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
284     }
285   }
286 
287   /* No point in checking ubermaster threads since they use refinement and
288    * cannot overlap */
289   gtid = __kmp_gtid_from_thread(th);
290   if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
291     KA_TRACE(10,
292              ("__kmp_check_stack_overlap: performing extensive checking\n"));
293     if (stack_beg == NULL) {
294       stack_end = (char *)th->th.th_info.ds.ds_stackbase;
295       stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
296     }
297 
298     for (f = 0; f < __kmp_threads_capacity; f++) {
299       kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
300 
301       if (f_th && f_th != th) {
302         char *other_stack_end =
303             (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
304         char *other_stack_beg =
305             other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
306         if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
307             (stack_end > other_stack_beg && stack_end < other_stack_end)) {
308 
309           /* Print the other stack values before the abort */
310           if (__kmp_storage_map)
311             __kmp_print_storage_map_gtid(
312                 -1, other_stack_beg, other_stack_end,
313                 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
314                 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
315 
316           __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
317                       __kmp_msg_null);
318         }
319       }
320     }
321   }
322   KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
323 }
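
/* The overlap test above asks whether either endpoint of one stack range falls
   strictly inside the other range. A generic sketch of that predicate, with a
   hypothetical helper name and the same semantics as the check in
   __kmp_check_stack_overlap (both ranges assumed to satisfy beg < end): */
#if 0 // illustrative sketch, not compiled into the runtime
static int ranges_overlap(const char *beg_a, const char *end_a,
                          const char *beg_b, const char *end_b) {
  return (beg_a > beg_b && beg_a < end_b) || (end_a > beg_b && end_a < end_b);
}
#endif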
324 
325 /* ------------------------------------------------------------------------ */
326 
327 void __kmp_infinite_loop(void) {
328   static int done = FALSE;
329 
330   while (!done) {
331     KMP_YIELD(TRUE);
332   }
333 }
334 
335 #define MAX_MESSAGE 512
336 
337 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
338                                   char const *format, ...) {
339   char buffer[MAX_MESSAGE];
340   va_list ap;
341 
342   va_start(ap, format);
343   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
344                p2, (unsigned long)size, format);
345   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
346   __kmp_vprintf(kmp_err, buffer, ap);
347 #if KMP_PRINT_DATA_PLACEMENT
348   int node;
349   if (gtid >= 0) {
350     if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
351       if (__kmp_storage_map_verbose) {
352         node = __kmp_get_host_node(p1);
353         if (node < 0) /* doesn't work, so don't try this next time */
354           __kmp_storage_map_verbose = FALSE;
355         else {
356           char *last;
357           int lastNode;
358           int localProc = __kmp_get_cpu_from_gtid(gtid);
359 
360           const int page_size = KMP_GET_PAGE_SIZE();
361 
362           p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
363           p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
364           if (localProc >= 0)
365             __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
366                                  localProc >> 1);
367           else
368             __kmp_printf_no_lock("  GTID %d\n", gtid);
369 #if KMP_USE_PRCTL
370           /* The more elaborate format is disabled for now because of the prctl
371            * hanging bug. */
372           do {
373             last = p1;
374             lastNode = node;
375             /* This loop collates adjacent pages with the same host node. */
376             do {
377               (char *)p1 += page_size;
378             } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
379             __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
380                                  lastNode);
381           } while (p1 <= p2);
382 #else
383           __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
384                                (char *)p1 + (page_size - 1),
385                                __kmp_get_host_node(p1));
386           if (p1 < p2) {
387             __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
388                                  (char *)p2 + (page_size - 1),
389                                  __kmp_get_host_node(p2));
390           }
391 #endif
392         }
393       }
394     } else
395       __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
396   }
397 #endif /* KMP_PRINT_DATA_PLACEMENT */
398   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
399 }
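
/* The data-placement path above rounds both pointers down to a page boundary
   with the usual mask trick, which assumes the page size is a power of two.
   A stand-alone sketch (hypothetical helper name): */
#if 0 // illustrative sketch, not compiled into the runtime
#include <stddef.h>
#include <stdint.h>
static void *page_align_down(void *p, size_t page_size) {
  return (void *)((uintptr_t)p & ~((uintptr_t)page_size - 1));
}
#endif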
400 
401 void __kmp_warn(char const *format, ...) {
402   char buffer[MAX_MESSAGE];
403   va_list ap;
404 
405   if (__kmp_generate_warnings == kmp_warnings_off) {
406     return;
407   }
408 
409   va_start(ap, format);
410 
411   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
412   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
413   __kmp_vprintf(kmp_err, buffer, ap);
414   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
415 
416   va_end(ap);
417 }
418 
419 void __kmp_abort_process() {
420   // Later threads may stall here, but that's ok because abort() will kill them.
421   __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
422 
423   if (__kmp_debug_buf) {
424     __kmp_dump_debug_buffer();
425   }
426 
427   if (KMP_OS_WINDOWS) {
428     // Let other threads know of abnormal termination and prevent deadlock
429     // if abort happened during library initialization or shutdown
430     __kmp_global.g.g_abort = SIGABRT;
431 
    /* On Windows* OS, abort() by default raises a pop-up error box, which
       stalls nightly testing. Unfortunately, we cannot reliably suppress the
       pop-up: _set_abort_behavior() works well, but it is not available in VS7
       (not a problem for the DLL, but a problem for the static OpenMP RTL).
       SetErrorMode (and thus the timelimit utility) does not help, at least in
       some versions of the MS C RTL.

       The following sequence seems to be the only way to simulate abort() and
       avoid the pop-up error box. */
441     raise(SIGABRT);
442     _exit(3); // Just in case, if signal ignored, exit anyway.
443   } else {
444     __kmp_unregister_library();
445     abort();
446   }
447 
448   __kmp_infinite_loop();
449   __kmp_release_bootstrap_lock(&__kmp_exit_lock);
450 
451 } // __kmp_abort_process
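
/* The comment above mentions _set_abort_behavior(); where a sufficiently new
   MSVC CRT can be assumed, the pop-up can also be suppressed directly, as in
   the sketch below. This is illustration only, not what the runtime does. */
#if 0 // illustrative sketch, not compiled into the runtime
#include <stdlib.h>
static void abort_without_popup(void) {
  // MSVC-specific: disable the abort() message box and the debug report
  // before aborting, so test runs are not stalled by a dialog.
  _set_abort_behavior(0, _WRITE_ABORT_MSG | _ABORT_DEBUG_REPORT);
  abort();
}
#endif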
452 
453 void __kmp_abort_thread(void) {
454   // TODO: Eliminate g_abort global variable and this function.
455   // In case of abort just call abort(), it will kill all the threads.
456   __kmp_infinite_loop();
457 } // __kmp_abort_thread
458 
459 /* Print out the storage map for the major kmp_info_t thread data structures
460    that are allocated together. */
461 
462 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
463   __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
464                                gtid);
465 
466   __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
467                                sizeof(kmp_desc_t), "th_%d.th_info", gtid);
468 
469   __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
470                                sizeof(kmp_local_t), "th_%d.th_local", gtid);
471 
472   __kmp_print_storage_map_gtid(
473       gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
474       sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
475 
476   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
477                                &thr->th.th_bar[bs_plain_barrier + 1],
478                                sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
479                                gtid);
480 
481   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
482                                &thr->th.th_bar[bs_forkjoin_barrier + 1],
483                                sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
484                                gtid);
485 
486 #if KMP_FAST_REDUCTION_BARRIER
487   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
488                                &thr->th.th_bar[bs_reduction_barrier + 1],
489                                sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
490                                gtid);
491 #endif // KMP_FAST_REDUCTION_BARRIER
492 }
493 
494 /* Print out the storage map for the major kmp_team_t team data structures
495    that are allocated together. */
496 
497 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
498                                          int team_id, int num_thr) {
499   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
500   __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
501                                header, team_id);
502 
503   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
504                                &team->t.t_bar[bs_last_barrier],
505                                sizeof(kmp_balign_team_t) * bs_last_barrier,
506                                "%s_%d.t_bar", header, team_id);
507 
508   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
509                                &team->t.t_bar[bs_plain_barrier + 1],
510                                sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
511                                header, team_id);
512 
513   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
514                                &team->t.t_bar[bs_forkjoin_barrier + 1],
515                                sizeof(kmp_balign_team_t),
516                                "%s_%d.t_bar[forkjoin]", header, team_id);
517 
518 #if KMP_FAST_REDUCTION_BARRIER
519   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
520                                &team->t.t_bar[bs_reduction_barrier + 1],
521                                sizeof(kmp_balign_team_t),
522                                "%s_%d.t_bar[reduction]", header, team_id);
523 #endif // KMP_FAST_REDUCTION_BARRIER
524 
525   __kmp_print_storage_map_gtid(
526       -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
527       sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
528 
529   __kmp_print_storage_map_gtid(
530       -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
531       sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
532 
533   __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
534                                &team->t.t_disp_buffer[num_disp_buff],
535                                sizeof(dispatch_shared_info_t) * num_disp_buff,
536                                "%s_%d.t_disp_buffer", header, team_id);
537 }
538 
539 static void __kmp_init_allocator() { __kmp_init_memkind(); }
540 static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
541 
542 /* ------------------------------------------------------------------------ */
543 
544 #if KMP_DYNAMIC_LIB
545 #if KMP_OS_WINDOWS
546 
547 static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
548   // TODO: Change to __kmp_break_bootstrap_lock().
549   __kmp_init_bootstrap_lock(lck); // make the lock released
550 }
551 
552 static void __kmp_reset_locks_on_process_detach(int gtid_req) {
553   int i;
554   int thread_count;
555 
  // PROCESS_DETACH is expected to be called by a thread that executes
  // ProcessExit() or FreeLibrary(); the OS terminates all other threads (except
  // the one calling ProcessExit or FreeLibrary). So it might seem safe to access
  // __kmp_threads[] without taking the forkjoin_lock. In fact, some threads can
  // still be alive here, although they are about to be terminated; the entries
  // with ds_thread==0 are the most suspicious. So it may not actually be safe
  // to access __kmp_threads[].
563 
564   // TODO: does it make sense to check __kmp_roots[] ?
565 
566   // Let's check that there are no other alive threads registered with the OMP
567   // lib.
568   while (1) {
569     thread_count = 0;
570     for (i = 0; i < __kmp_threads_capacity; ++i) {
571       if (!__kmp_threads)
572         continue;
573       kmp_info_t *th = __kmp_threads[i];
574       if (th == NULL)
575         continue;
576       int gtid = th->th.th_info.ds.ds_gtid;
577       if (gtid == gtid_req)
578         continue;
579       if (gtid < 0)
580         continue;
581       DWORD exit_val;
582       int alive = __kmp_is_thread_alive(th, &exit_val);
583       if (alive) {
584         ++thread_count;
585       }
586     }
587     if (thread_count == 0)
588       break; // success
589   }
590 
591   // Assume that I'm alone. Now it might be safe to check and reset locks.
592   // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
593   __kmp_reset_lock(&__kmp_forkjoin_lock);
594 #ifdef KMP_DEBUG
595   __kmp_reset_lock(&__kmp_stdio_lock);
596 #endif // KMP_DEBUG
597 }
598 
599 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
600   //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
601 
602   switch (fdwReason) {
603 
604   case DLL_PROCESS_ATTACH:
605     KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
606 
607     return TRUE;
608 
609   case DLL_PROCESS_DETACH:
610     KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
611 
612     if (lpReserved != NULL) {
613       // lpReserved is used for telling the difference:
614       //   lpReserved == NULL when FreeLibrary() was called,
615       //   lpReserved != NULL when the process terminates.
616       // When FreeLibrary() is called, worker threads remain alive. So they will
617       // release the forkjoin lock by themselves. When the process terminates,
618       // worker threads disappear triggering the problem of unreleased forkjoin
619       // lock as described below.
620 
621       // A worker thread can take the forkjoin lock. The problem comes up if
622       // that worker thread becomes dead before it releases the forkjoin lock.
623       // The forkjoin lock remains taken, while the thread executing
624       // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try
625       // to take the forkjoin lock and will always fail, so that the application
626       // will never finish [normally]. This scenario is possible if
627       // __kmpc_end() has not been executed. It looks like it's not a corner
628       // case, but common cases:
629       // - the main function was compiled by an alternative compiler;
630       // - the main function was compiled by icl but without /Qopenmp
631       //   (application with plugins);
632       // - application terminates by calling C exit(), Fortran CALL EXIT() or
633       //   Fortran STOP.
634       // - alive foreign thread prevented __kmpc_end from doing cleanup.
635       //
636       // This is a hack to work around the problem.
637       // TODO: !!! figure out something better.
638       __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
639     }
640 
641     __kmp_internal_end_library(__kmp_gtid_get_specific());
642 
643     return TRUE;
644 
645   case DLL_THREAD_ATTACH:
646     KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
647 
    /* if we wanted to register new sibling threads every time, we would call
     * __kmp_get_gtid() here */
650     return TRUE;
651 
652   case DLL_THREAD_DETACH:
653     KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
654 
655     __kmp_internal_end_thread(__kmp_gtid_get_specific());
656     return TRUE;
657   }
658 
659   return TRUE;
660 }
661 
662 #endif /* KMP_OS_WINDOWS */
663 #endif /* KMP_DYNAMIC_LIB */
664 
665 /* __kmp_parallel_deo -- Wait until it's our turn. */
666 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
667   int gtid = *gtid_ref;
668 #ifdef BUILD_PARALLEL_ORDERED
669   kmp_team_t *team = __kmp_team_from_gtid(gtid);
670 #endif /* BUILD_PARALLEL_ORDERED */
671 
672   if (__kmp_env_consistency_check) {
673     if (__kmp_threads[gtid]->th.th_root->r.r_active)
674 #if KMP_USE_DYNAMIC_LOCK
675       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
676 #else
677       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
678 #endif
679   }
680 #ifdef BUILD_PARALLEL_ORDERED
681   if (!team->t.t_serialized) {
682     KMP_MB();
683     KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
684              NULL);
685     KMP_MB();
686   }
687 #endif /* BUILD_PARALLEL_ORDERED */
688 }
689 
690 /* __kmp_parallel_dxo -- Signal the next task. */
691 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
692   int gtid = *gtid_ref;
693 #ifdef BUILD_PARALLEL_ORDERED
694   int tid = __kmp_tid_from_gtid(gtid);
695   kmp_team_t *team = __kmp_team_from_gtid(gtid);
696 #endif /* BUILD_PARALLEL_ORDERED */
697 
698   if (__kmp_env_consistency_check) {
699     if (__kmp_threads[gtid]->th.th_root->r.r_active)
700       __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
701   }
702 #ifdef BUILD_PARALLEL_ORDERED
703   if (!team->t.t_serialized) {
704     KMP_MB(); /* Flush all pending memory write invalidates.  */
705 
706     /* use the tid of the next thread in this team */
707     /* TODO replace with general release procedure */
708     team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
709 
710     KMP_MB(); /* Flush all pending memory write invalidates.  */
711   }
712 #endif /* BUILD_PARALLEL_ORDERED */
713 }
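
/* Taken together, __kmp_parallel_deo/__kmp_parallel_dxo implement a simple
   ticket scheme: thread tid waits until the shared counter equals tid, runs
   its ordered chunk, then hands the turn to (tid + 1) % nproc. A minimal
   sketch with standard atomics (hypothetical type and names; the runtime uses
   KMP_WAIT with yielding/back-off rather than a raw spin): */
#if 0 // illustrative sketch, not compiled into the runtime
#include <atomic>
struct ordered_ticket {
  std::atomic<int> turn{0};
  void enter(int tid) {
    while (turn.load(std::memory_order_acquire) != tid) {
      // spin until it is this thread's turn
    }
  }
  void exit(int tid, int nproc) {
    turn.store((tid + 1) % nproc, std::memory_order_release);
  }
};
#endif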
714 
715 /* ------------------------------------------------------------------------ */
716 /* The BARRIER for a SINGLE process section is always explicit   */
717 
718 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
719   int status;
720   kmp_info_t *th;
721   kmp_team_t *team;
722 
723   if (!TCR_4(__kmp_init_parallel))
724     __kmp_parallel_initialize();
725   __kmp_resume_if_soft_paused();
726 
727   th = __kmp_threads[gtid];
728   team = th->th.th_team;
729   status = 0;
730 
731   th->th.th_ident = id_ref;
732 
733   if (team->t.t_serialized) {
734     status = 1;
735   } else {
736     kmp_int32 old_this = th->th.th_local.this_construct;
737 
738     ++th->th.th_local.this_construct;
739     /* try to set team count to thread count--success means thread got the
740        single block */
741     /* TODO: Should this be acquire or release? */
742     if (team->t.t_construct == old_this) {
743       status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
744                                               th->th.th_local.this_construct);
745     }
746 #if USE_ITT_BUILD
747     if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
748         KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
749         team->t.t_active_level ==
750             1) { // Only report metadata by master of active team at level 1
751       __kmp_itt_metadata_single(id_ref);
752     }
753 #endif /* USE_ITT_BUILD */
754   }
755 
756   if (__kmp_env_consistency_check) {
757     if (status && push_ws) {
758       __kmp_push_workshare(gtid, ct_psingle, id_ref);
759     } else {
760       __kmp_check_workshare(gtid, ct_psingle, id_ref);
761     }
762   }
763 #if USE_ITT_BUILD
764   if (status) {
765     __kmp_itt_single_start(gtid);
766   }
767 #endif /* USE_ITT_BUILD */
768   return status;
769 }
770 
771 void __kmp_exit_single(int gtid) {
772 #if USE_ITT_BUILD
773   __kmp_itt_single_end(gtid);
774 #endif /* USE_ITT_BUILD */
775   if (__kmp_env_consistency_check)
776     __kmp_pop_workshare(gtid, ct_psingle, NULL);
777 }
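
/* __kmp_enter_single elects exactly one winner per 'single' instance: every
   thread advances a private construct counter, and the first thread to CAS the
   shared team counter from the old value to the new one executes the block.
   A reduced sketch of that election with standard atomics (hypothetical names;
   the runtime also pre-checks the team counter before attempting the CAS): */
#if 0 // illustrative sketch, not compiled into the runtime
#include <atomic>
static bool enter_single_sketch(std::atomic<int> &team_construct,
                                int &my_construct) {
  int old_val = my_construct++; // private count of constructs seen so far
  int expected = old_val;
  // Only the thread that still observes the un-advanced team value wins.
  return team_construct.compare_exchange_strong(expected, my_construct,
                                                std::memory_order_acquire);
}
#endif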
778 
/* Determine whether we can go parallel or must use a serialized parallel
 * region, and how many threads we can use.
 * set_nthreads is the number of threads requested for the team.
 * Returns 1 if we should serialize or use only one thread,
 * otherwise the number of threads to use.
 * The forkjoin lock is held by the caller. */
785 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
786                                  int master_tid, int set_nthreads,
787                                  int enter_teams) {
788   int capacity;
789   int new_nthreads;
790   KMP_DEBUG_ASSERT(__kmp_init_serial);
791   KMP_DEBUG_ASSERT(root && parent_team);
792   kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
793 
794   // If dyn-var is set, dynamically adjust the number of desired threads,
795   // according to the method specified by dynamic_mode.
796   new_nthreads = set_nthreads;
797   if (!get__dynamic_2(parent_team, master_tid)) {
798     ;
799   }
800 #ifdef USE_LOAD_BALANCE
801   else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
802     new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
803     if (new_nthreads == 1) {
804       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
805                     "reservation to 1 thread\n",
806                     master_tid));
807       return 1;
808     }
809     if (new_nthreads < set_nthreads) {
810       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
811                     "reservation to %d threads\n",
812                     master_tid, new_nthreads));
813     }
814   }
815 #endif /* USE_LOAD_BALANCE */
816   else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
817     new_nthreads = __kmp_avail_proc - __kmp_nth +
818                    (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
819     if (new_nthreads <= 1) {
820       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
821                     "reservation to 1 thread\n",
822                     master_tid));
823       return 1;
824     }
825     if (new_nthreads < set_nthreads) {
826       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
827                     "reservation to %d threads\n",
828                     master_tid, new_nthreads));
829     } else {
830       new_nthreads = set_nthreads;
831     }
832   } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
833     if (set_nthreads > 2) {
834       new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
835       new_nthreads = (new_nthreads % set_nthreads) + 1;
836       if (new_nthreads == 1) {
837         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
838                       "reservation to 1 thread\n",
839                       master_tid));
840         return 1;
841       }
842       if (new_nthreads < set_nthreads) {
843         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
844                       "reservation to %d threads\n",
845                       master_tid, new_nthreads));
846       }
847     }
848   } else {
849     KMP_ASSERT(0);
850   }
851 
852   // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
853   if (__kmp_nth + new_nthreads -
854           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
855       __kmp_max_nth) {
856     int tl_nthreads = __kmp_max_nth - __kmp_nth +
857                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
858     if (tl_nthreads <= 0) {
859       tl_nthreads = 1;
860     }
861 
862     // If dyn-var is false, emit a 1-time warning.
863     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
864       __kmp_reserve_warn = 1;
865       __kmp_msg(kmp_ms_warning,
866                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
867                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
868     }
869     if (tl_nthreads == 1) {
870       KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
871                     "reduced reservation to 1 thread\n",
872                     master_tid));
873       return 1;
874     }
875     KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
876                   "reservation to %d threads\n",
877                   master_tid, tl_nthreads));
878     new_nthreads = tl_nthreads;
879   }
880 
881   // Respect OMP_THREAD_LIMIT
882   int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
883   int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
884   if (cg_nthreads + new_nthreads -
885           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
886       max_cg_threads) {
887     int tl_nthreads = max_cg_threads - cg_nthreads +
888                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
889     if (tl_nthreads <= 0) {
890       tl_nthreads = 1;
891     }
892 
893     // If dyn-var is false, emit a 1-time warning.
894     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
895       __kmp_reserve_warn = 1;
896       __kmp_msg(kmp_ms_warning,
897                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
898                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
899     }
900     if (tl_nthreads == 1) {
901       KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
902                     "reduced reservation to 1 thread\n",
903                     master_tid));
904       return 1;
905     }
906     KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
907                   "reservation to %d threads\n",
908                   master_tid, tl_nthreads));
909     new_nthreads = tl_nthreads;
910   }
911 
912   // Check if the threads array is large enough, or needs expanding.
913   // See comment in __kmp_register_root() about the adjustment if
914   // __kmp_threads[0] == NULL.
915   capacity = __kmp_threads_capacity;
916   if (TCR_PTR(__kmp_threads[0]) == NULL) {
917     --capacity;
918   }
919   if (__kmp_nth + new_nthreads -
920           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
921       capacity) {
922     // Expand the threads array.
923     int slotsRequired = __kmp_nth + new_nthreads -
924                         (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
925                         capacity;
926     int slotsAdded = __kmp_expand_threads(slotsRequired);
927     if (slotsAdded < slotsRequired) {
928       // The threads array was not expanded enough.
929       new_nthreads -= (slotsRequired - slotsAdded);
930       KMP_ASSERT(new_nthreads >= 1);
931 
932       // If dyn-var is false, emit a 1-time warning.
933       if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
934         __kmp_reserve_warn = 1;
935         if (__kmp_tp_cached) {
936           __kmp_msg(kmp_ms_warning,
937                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
938                     KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
939                     KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
940         } else {
941           __kmp_msg(kmp_ms_warning,
942                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
943                     KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
944         }
945       }
946     }
947   }
948 
949 #ifdef KMP_DEBUG
950   if (new_nthreads == 1) {
951     KC_TRACE(10,
952              ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
953               "dead roots and rechecking; requested %d threads\n",
954               __kmp_get_gtid(), set_nthreads));
955   } else {
956     KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
957                   " %d threads\n",
958                   __kmp_get_gtid(), new_nthreads, set_nthreads));
959   }
960 #endif // KMP_DEBUG
961   return new_nthreads;
962 }
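
/* Each limit check in __kmp_reserve_threads uses the same arithmetic: the
   number of threads this fork actually adds is new_nthreads minus the threads
   that can be reused (1 if the root is active, otherwise the whole hot team),
   and the request is capped so that existing + new stays within the limit.
   A condensed sketch of that capping (hypothetical helper name): */
#if 0 // illustrative sketch, not compiled into the runtime
static int cap_to_limit(int requested, int existing, int reusable, int limit) {
  if (existing + requested - reusable <= limit)
    return requested; // fits as-is
  int capped = limit - existing + reusable;
  return capped > 0 ? capped : 1; // never report fewer than 1 thread
}
#endif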
963 
/* Allocate threads from the thread pool and assign them to the new team. We are
   assured that there are enough threads available, because we checked for that
   earlier while holding the forkjoin lock. */
967 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
968                                     kmp_info_t *master_th, int master_gtid) {
969   int i;
970   int use_hot_team;
971 
972   KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
973   KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
974   KMP_MB();
975 
976   /* first, let's setup the master thread */
977   master_th->th.th_info.ds.ds_tid = 0;
978   master_th->th.th_team = team;
979   master_th->th.th_team_nproc = team->t.t_nproc;
980   master_th->th.th_team_master = master_th;
981   master_th->th.th_team_serialized = FALSE;
982   master_th->th.th_dispatch = &team->t.t_dispatch[0];
983 
984 /* make sure we are not the optimized hot team */
985 #if KMP_NESTED_HOT_TEAMS
986   use_hot_team = 0;
987   kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
988   if (hot_teams) { // hot teams array is not allocated if
989     // KMP_HOT_TEAMS_MAX_LEVEL=0
990     int level = team->t.t_active_level - 1; // index in array of hot teams
991     if (master_th->th.th_teams_microtask) { // are we inside the teams?
992       if (master_th->th.th_teams_size.nteams > 1) {
993         ++level; // level was not increased in teams construct for
994         // team_of_masters
995       }
996       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
997           master_th->th.th_teams_level == team->t.t_level) {
998         ++level; // level was not increased in teams construct for
999         // team_of_workers before the parallel
1000       } // team->t.t_level will be increased inside parallel
1001     }
1002     if (level < __kmp_hot_teams_max_level) {
1003       if (hot_teams[level].hot_team) {
1004         // hot team has already been allocated for given level
1005         KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
1006         use_hot_team = 1; // the team is ready to use
1007       } else {
1008         use_hot_team = 0; // AC: threads are not allocated yet
1009         hot_teams[level].hot_team = team; // remember new hot team
1010         hot_teams[level].hot_team_nth = team->t.t_nproc;
1011       }
1012     } else {
1013       use_hot_team = 0;
1014     }
1015   }
1016 #else
1017   use_hot_team = team == root->r.r_hot_team;
1018 #endif
1019   if (!use_hot_team) {
1020 
1021     /* install the master thread */
1022     team->t.t_threads[0] = master_th;
1023     __kmp_initialize_info(master_th, team, 0, master_gtid);
1024 
1025     /* now, install the worker threads */
1026     for (i = 1; i < team->t.t_nproc; i++) {
1027 
1028       /* fork or reallocate a new thread and install it in team */
1029       kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1030       team->t.t_threads[i] = thr;
1031       KMP_DEBUG_ASSERT(thr);
1032       KMP_DEBUG_ASSERT(thr->th.th_team == team);
1033       /* align team and thread arrived states */
1034       KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1035                     "T#%d(%d:%d) join =%llu, plain=%llu\n",
1036                     __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1037                     __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1038                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1039                     team->t.t_bar[bs_plain_barrier].b_arrived));
1040       thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1041       thr->th.th_teams_level = master_th->th.th_teams_level;
1042       thr->th.th_teams_size = master_th->th.th_teams_size;
1043       { // Initialize threads' barrier data.
1044         int b;
1045         kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1046         for (b = 0; b < bs_last_barrier; ++b) {
1047           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1048           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1049 #if USE_DEBUGGER
1050           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1051 #endif
1052         }
1053       }
1054     }
1055 
1056 #if KMP_AFFINITY_SUPPORTED
1057     __kmp_partition_places(team);
1058 #endif
1059   }
1060 
1061   if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1062     for (i = 0; i < team->t.t_nproc; i++) {
1063       kmp_info_t *thr = team->t.t_threads[i];
1064       if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1065           thr->th.th_prev_level != team->t.t_level) {
1066         team->t.t_display_affinity = 1;
1067         break;
1068       }
1069     }
1070   }
1071 
1072   KMP_MB();
1073 }
1074 
1075 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
// Propagate any changes to the floating point control registers out to the
// team. We try to avoid unnecessary writes to the relevant cache line in the
// team structure, so we don't make changes unless they are needed.
1079 inline static void propagateFPControl(kmp_team_t *team) {
1080   if (__kmp_inherit_fp_control) {
1081     kmp_int16 x87_fpu_control_word;
1082     kmp_uint32 mxcsr;
1083 
1084     // Get master values of FPU control flags (both X87 and vector)
1085     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1086     __kmp_store_mxcsr(&mxcsr);
1087     mxcsr &= KMP_X86_MXCSR_MASK;
1088 
1089     // There is no point looking at t_fp_control_saved here.
1090     // If it is TRUE, we still have to update the values if they are different
1091     // from those we now have. If it is FALSE we didn't save anything yet, but
1092     // our objective is the same. We have to ensure that the values in the team
1093     // are the same as those we have.
1094     // So, this code achieves what we need whether or not t_fp_control_saved is
1095     // true. By checking whether the value needs updating we avoid unnecessary
1096     // writes that would put the cache-line into a written state, causing all
1097     // threads in the team to have to read it again.
1098     KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1099     KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1100     // Although we don't use this value, other code in the runtime wants to know
1101     // whether it should restore them. So we must ensure it is correct.
1102     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1103   } else {
1104     // Similarly here. Don't write to this cache-line in the team structure
1105     // unless we have to.
1106     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1107   }
1108 }
1109 
1110 // Do the opposite, setting the hardware registers to the updated values from
1111 // the team.
1112 inline static void updateHWFPControl(kmp_team_t *team) {
1113   if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
    // Only reset the fp control regs if they have been changed in the team by
    // the parallel region that we are exiting.
1116     kmp_int16 x87_fpu_control_word;
1117     kmp_uint32 mxcsr;
1118     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1119     __kmp_store_mxcsr(&mxcsr);
1120     mxcsr &= KMP_X86_MXCSR_MASK;
1121 
1122     if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1123       __kmp_clear_x87_fpu_status_word();
1124       __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1125     }
1126 
1127     if (team->t.t_mxcsr != mxcsr) {
1128       __kmp_load_mxcsr(&team->t.t_mxcsr);
1129     }
1130   }
1131 }
1132 #else
1133 #define propagateFPControl(x) ((void)0)
1134 #define updateHWFPControl(x) ((void)0)
1135 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
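
/* For reference, the SSE half of the save/compare/restore dance above can be
   written with the standard MXCSR intrinsics; the x87 control word needs the
   fnstcw/fldcw wrappers the runtime already provides. Sketch only, assuming an
   x86 target with <xmmintrin.h>: */
#if 0 // illustrative sketch, not compiled into the runtime
#include <xmmintrin.h>
static unsigned int save_mxcsr(void) { return _mm_getcsr(); }
static void restore_mxcsr_if_changed(unsigned int saved) {
  if (_mm_getcsr() != saved)
    _mm_setcsr(saved); // only touch the register when it actually differs
}
#endif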
1136 
1137 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1138                                      int realloc); // forward declaration
1139 
/* Run a parallel region that has been serialized, so it runs only in a team of
   the single master thread. */
1142 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1143   kmp_info_t *this_thr;
1144   kmp_team_t *serial_team;
1145 
1146   KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1147 
1148   /* Skip all this code for autopar serialized loops since it results in
1149      unacceptable overhead */
1150   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1151     return;
1152 
1153   if (!TCR_4(__kmp_init_parallel))
1154     __kmp_parallel_initialize();
1155   __kmp_resume_if_soft_paused();
1156 
1157   this_thr = __kmp_threads[global_tid];
1158   serial_team = this_thr->th.th_serial_team;
1159 
1160   /* utilize the serialized team held by this thread */
1161   KMP_DEBUG_ASSERT(serial_team);
1162   KMP_MB();
1163 
1164   if (__kmp_tasking_mode != tskm_immediate_exec) {
1165     KMP_DEBUG_ASSERT(
1166         this_thr->th.th_task_team ==
1167         this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1168     KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1169                      NULL);
1170     KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1171                   "team %p, new task_team = NULL\n",
1172                   global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1173     this_thr->th.th_task_team = NULL;
1174   }
1175 
1176   kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1177   if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1178     proc_bind = proc_bind_false;
1179   } else if (proc_bind == proc_bind_default) {
1180     // No proc_bind clause was specified, so use the current value
1181     // of proc-bind-var for this parallel region.
1182     proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1183   }
1184   // Reset for next parallel region
1185   this_thr->th.th_set_proc_bind = proc_bind_default;
1186 
1187 #if OMPT_SUPPORT
1188   ompt_data_t ompt_parallel_data = ompt_data_none;
1189   ompt_data_t *implicit_task_data;
1190   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1191   if (ompt_enabled.enabled &&
1192       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1193 
1194     ompt_task_info_t *parent_task_info;
1195     parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1196 
1197     parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1198     if (ompt_enabled.ompt_callback_parallel_begin) {
1199       int team_size = 1;
1200 
1201       ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1202           &(parent_task_info->task_data), &(parent_task_info->frame),
1203           &ompt_parallel_data, team_size,
1204           ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1205     }
1206   }
1207 #endif // OMPT_SUPPORT
1208 
1209   if (this_thr->th.th_team != serial_team) {
1210     // Nested level will be an index in the nested nthreads array
1211     int level = this_thr->th.th_team->t.t_level;
1212 
1213     if (serial_team->t.t_serialized) {
      /* this serial team was already used
         TODO: improve performance by making these locks more specific */
1216       kmp_team_t *new_team;
1217 
1218       __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1219 
1220       new_team =
1221           __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1222 #if OMPT_SUPPORT
1223                               ompt_parallel_data,
1224 #endif
1225                               proc_bind, &this_thr->th.th_current_task->td_icvs,
1226                               0 USE_NESTED_HOT_ARG(NULL));
1227       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1228       KMP_ASSERT(new_team);
1229 
1230       /* setup new serialized team and install it */
1231       new_team->t.t_threads[0] = this_thr;
1232       new_team->t.t_parent = this_thr->th.th_team;
1233       serial_team = new_team;
1234       this_thr->th.th_serial_team = serial_team;
1235 
1236       KF_TRACE(
1237           10,
1238           ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1239            global_tid, serial_team));
1240 
      /* TODO: the above breaks the requirement that, if we run out of
         resources, we can still guarantee that serialized teams are OK, since
         we may need to allocate a new one */
1244     } else {
1245       KF_TRACE(
1246           10,
1247           ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1248            global_tid, serial_team));
1249     }
1250 
1251     /* we have to initialize this serial team */
1252     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1253     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1254     KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1255     serial_team->t.t_ident = loc;
1256     serial_team->t.t_serialized = 1;
1257     serial_team->t.t_nproc = 1;
1258     serial_team->t.t_parent = this_thr->th.th_team;
1259     serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1260     this_thr->th.th_team = serial_team;
1261     serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1262 
    KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1264                   this_thr->th.th_current_task));
1265     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1266     this_thr->th.th_current_task->td_flags.executing = 0;
1267 
1268     __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1269 
1270     /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1271        implicit task for each serialized task represented by
1272        team->t.t_serialized? */
1273     copy_icvs(&this_thr->th.th_current_task->td_icvs,
1274               &this_thr->th.th_current_task->td_parent->td_icvs);
1275 
1276     // Thread value exists in the nested nthreads array for the next nested
1277     // level
1278     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1279       this_thr->th.th_current_task->td_icvs.nproc =
1280           __kmp_nested_nth.nth[level + 1];
1281     }
1282 
1283     if (__kmp_nested_proc_bind.used &&
1284         (level + 1 < __kmp_nested_proc_bind.used)) {
1285       this_thr->th.th_current_task->td_icvs.proc_bind =
1286           __kmp_nested_proc_bind.bind_types[level + 1];
1287     }
1288 
1289 #if USE_DEBUGGER
1290     serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1291 #endif
1292     this_thr->th.th_info.ds.ds_tid = 0;
1293 
1294     /* set thread cache values */
1295     this_thr->th.th_team_nproc = 1;
1296     this_thr->th.th_team_master = this_thr;
1297     this_thr->th.th_team_serialized = 1;
1298 
1299     serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1300     serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1301     serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1302 
1303     propagateFPControl(serial_team);
1304 
1305     /* check if we need to allocate dispatch buffers stack */
1306     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1307     if (!serial_team->t.t_dispatch->th_disp_buffer) {
1308       serial_team->t.t_dispatch->th_disp_buffer =
1309           (dispatch_private_info_t *)__kmp_allocate(
1310               sizeof(dispatch_private_info_t));
1311     }
1312     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1313 
1314     KMP_MB();
1315 
1316   } else {
1317     /* this serialized team is already being used,
1318      * that's fine, just add another nested level */
1319     KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1320     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1321     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1322     ++serial_team->t.t_serialized;
1323     this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1324 
1325     // Nested level will be an index in the nested nthreads array
1326     int level = this_thr->th.th_team->t.t_level;
1327     // Thread value exists in the nested nthreads array for the next nested
1328     // level
1329     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1330       this_thr->th.th_current_task->td_icvs.nproc =
1331           __kmp_nested_nth.nth[level + 1];
1332     }
1333     serial_team->t.t_level++;
1334     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1335                   "of serial team %p to %d\n",
1336                   global_tid, serial_team, serial_team->t.t_level));
1337 
1338     /* allocate/push dispatch buffers stack */
1339     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1340     {
1341       dispatch_private_info_t *disp_buffer =
1342           (dispatch_private_info_t *)__kmp_allocate(
1343               sizeof(dispatch_private_info_t));
1344       disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1345       serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1346     }
1347     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1348 
1349     KMP_MB();
1350   }
1351   KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1352 
1353   // Perform the display affinity functionality for
1354   // serialized parallel regions
1355   if (__kmp_display_affinity) {
1356     if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1357         this_thr->th.th_prev_num_threads != 1) {
1358       // NULL means use the affinity-format-var ICV
1359       __kmp_aux_display_affinity(global_tid, NULL);
1360       this_thr->th.th_prev_level = serial_team->t.t_level;
1361       this_thr->th.th_prev_num_threads = 1;
1362     }
1363   }
1364 
1365   if (__kmp_env_consistency_check)
1366     __kmp_push_parallel(global_tid, NULL);
1367 #if OMPT_SUPPORT
1368   serial_team->t.ompt_team_info.master_return_address = codeptr;
1369   if (ompt_enabled.enabled &&
1370       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1371     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1372 
1373     ompt_lw_taskteam_t lw_taskteam;
1374     __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1375                             &ompt_parallel_data, codeptr);
1376 
1377     __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
    // don't use lw_taskteam after linking; its content was swapped
1379 
1380     /* OMPT implicit task begin */
1381     implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
1382     if (ompt_enabled.ompt_callback_implicit_task) {
1383       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1384           ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1385           OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1386       OMPT_CUR_TASK_INFO(this_thr)
1387           ->thread_num = __kmp_tid_from_gtid(global_tid);
1388     }
1389 
1390     /* OMPT state */
1391     this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1392     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1393   }
1394 #endif
1395 }
1396 
1397 /* most of the work for a fork */
1398 /* return true if we really went parallel, false if serialized */
1399 int __kmp_fork_call(ident_t *loc, int gtid,
1400                     enum fork_context_e call_context, // Intel, GNU, ...
1401                     kmp_int32 argc, microtask_t microtask, launch_t invoker,
1402                     kmp_va_list ap) {
1403   void **argv;
1404   int i;
1405   int master_tid;
1406   int master_this_cons;
1407   kmp_team_t *team;
1408   kmp_team_t *parent_team;
1409   kmp_info_t *master_th;
1410   kmp_root_t *root;
1411   int nthreads;
1412   int master_active;
1413   int master_set_numthreads;
1414   int level;
1415   int active_level;
1416   int teams_level;
1417 #if KMP_NESTED_HOT_TEAMS
1418   kmp_hot_team_ptr_t **p_hot_teams;
1419 #endif
1420   { // KMP_TIME_BLOCK
1421     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1422     KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1423 
1424     KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1425     if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1426       /* Some systems prefer the stack for the root thread(s) to start with */
1427       /* some gap from the parent stack to prevent false sharing. */
1428       void *dummy = KMP_ALLOCA(__kmp_stkpadding);
      /* The conditional update below keeps the alloca above from being
         optimized away. */
1430       if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1431         __kmp_stkpadding += (short)((kmp_int64)dummy);
1432     }
1433 
1434     /* initialize if needed */
1435     KMP_DEBUG_ASSERT(
1436         __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1437     if (!TCR_4(__kmp_init_parallel))
1438       __kmp_parallel_initialize();
1439     __kmp_resume_if_soft_paused();
1440 
1441     /* setup current data */
1442     master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1443     // shutdown
1444     parent_team = master_th->th.th_team;
1445     master_tid = master_th->th.th_info.ds.ds_tid;
1446     master_this_cons = master_th->th.th_local.this_construct;
1447     root = master_th->th.th_root;
1448     master_active = root->r.r_active;
1449     master_set_numthreads = master_th->th.th_set_nproc;
1450 
1451 #if OMPT_SUPPORT
1452     ompt_data_t ompt_parallel_data = ompt_data_none;
1453     ompt_data_t *parent_task_data;
1454     ompt_frame_t *ompt_frame;
1455     ompt_data_t *implicit_task_data;
1456     void *return_address = NULL;
1457 
1458     if (ompt_enabled.enabled) {
1459       __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1460                                     NULL, NULL);
1461       return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1462     }
1463 #endif
1464 
1465     // Nested level will be an index in the nested nthreads array
1466     level = parent_team->t.t_level;
1467     // used to launch non-serial teams even if nested is not allowed
1468     active_level = parent_team->t.t_active_level;
1469     // needed to check nesting inside the teams
1470     teams_level = master_th->th.th_teams_level;
1471 #if KMP_NESTED_HOT_TEAMS
1472     p_hot_teams = &master_th->th.th_hot_teams;
1473     if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1474       *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1475           sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1476       (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
      // it is either the actual hot team or not needed (when active_level > 0)
1478       (*p_hot_teams)[0].hot_team_nth = 1;
1479     }
1480 #endif
1481 
1482 #if OMPT_SUPPORT
1483     if (ompt_enabled.enabled) {
1484       if (ompt_enabled.ompt_callback_parallel_begin) {
1485         int team_size = master_set_numthreads
1486                             ? master_set_numthreads
1487                             : get__nproc_2(parent_team, master_tid);
1488         int flags = OMPT_INVOKER(call_context) |
1489                     ((microtask == (microtask_t)__kmp_teams_master)
1490                          ? ompt_parallel_league
1491                          : ompt_parallel_team);
1492         ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1493             parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1494             return_address);
1495       }
1496       master_th->th.ompt_thread_info.state = ompt_state_overhead;
1497     }
1498 #endif
1499 
1500     master_th->th.th_ident = loc;
1501 
1502     if (master_th->th.th_teams_microtask && ap &&
1503         microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
      // AC: This is the start of a parallel region nested inside a teams
      // construct. The team is actual (hot); all workers are ready at the fork
      // barrier. No lock is needed to do the small amount of team
      // initialization and then release the workers.
1507       parent_team->t.t_ident = loc;
1508       __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1509       parent_team->t.t_argc = argc;
1510       argv = (void **)parent_team->t.t_argv;
1511       for (i = argc - 1; i >= 0; --i)
1512         *argv++ = va_arg(kmp_va_deref(ap), void *);
      // Increment our nested depth level, but do not increase serialization
1514       if (parent_team == master_th->th.th_serial_team) {
1515         // AC: we are in serialized parallel
1516         __kmpc_serialized_parallel(loc, gtid);
1517         KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1518 
1519         if (call_context == fork_context_gnu) {
1520           // AC: need to decrement t_serialized for enquiry functions to work
1521           // correctly, will restore at join time
1522           parent_team->t.t_serialized--;
1523           return TRUE;
1524         }
1525 
1526 #if OMPT_SUPPORT
1527         void *dummy;
1528         void **exit_frame_p;
1529 
1530         ompt_lw_taskteam_t lw_taskteam;
1531 
1532         if (ompt_enabled.enabled) {
1533           __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1534                                   &ompt_parallel_data, return_address);
1535           exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1536 
1537           __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
          // don't use lw_taskteam after linking. Content was swapped.
1539 
1540           /* OMPT implicit task begin */
1541           implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1542           if (ompt_enabled.ompt_callback_implicit_task) {
1543             OMPT_CUR_TASK_INFO(master_th)
1544                 ->thread_num = __kmp_tid_from_gtid(gtid);
1545             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1546                 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1547                 implicit_task_data, 1,
1548                 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1549           }
1550 
1551           /* OMPT state */
1552           master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1553         } else {
1554           exit_frame_p = &dummy;
1555         }
1556 #endif
1557         // AC: need to decrement t_serialized for enquiry functions to work
1558         // correctly, will restore at join time
1559         parent_team->t.t_serialized--;
1560 
1561         {
1562           KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1563           KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1564           __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1565 #if OMPT_SUPPORT
1566                                  ,
1567                                  exit_frame_p
1568 #endif
1569                                  );
1570         }
1571 
1572 #if OMPT_SUPPORT
1573         if (ompt_enabled.enabled) {
1574           *exit_frame_p = NULL;
1575           OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1576           if (ompt_enabled.ompt_callback_implicit_task) {
1577             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1578                 ompt_scope_end, NULL, implicit_task_data, 1,
1579                 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1580           }
1581           ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1582           __ompt_lw_taskteam_unlink(master_th);
1583           if (ompt_enabled.ompt_callback_parallel_end) {
1584             ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1585                 &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1586                 OMPT_INVOKER(call_context) | ompt_parallel_team,
1587                 return_address);
1588           }
1589           master_th->th.ompt_thread_info.state = ompt_state_overhead;
1590         }
1591 #endif
1592         return TRUE;
1593       }
1594 
1595       parent_team->t.t_pkfn = microtask;
1596       parent_team->t.t_invoke = invoker;
1597       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1598       parent_team->t.t_active_level++;
1599       parent_team->t.t_level++;
1600       parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1601 
1602 #if OMPT_SUPPORT
1603       if (ompt_enabled.enabled) {
1604         ompt_lw_taskteam_t lw_taskteam;
1605         __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1606                                 &ompt_parallel_data, return_address);
1607         __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1608       }
1609 #endif
1610 
1611       /* Change number of threads in the team if requested */
1612       if (master_set_numthreads) { // The parallel has num_threads clause
1613         if (master_set_numthreads < master_th->th.th_teams_size.nth) {
          // AC: we can only reduce the number of threads dynamically; we
          // cannot increase it
1615           kmp_info_t **other_threads = parent_team->t.t_threads;
1616           parent_team->t.t_nproc = master_set_numthreads;
1617           for (i = 0; i < master_set_numthreads; ++i) {
1618             other_threads[i]->th.th_team_nproc = master_set_numthreads;
1619           }
          // Keep the extra threads hot in the team for possible subsequent
          // parallel regions
1621         }
1622         master_th->th.th_set_nproc = 0;
1623       }
1624 
1625 #if USE_DEBUGGER
1626       if (__kmp_debugging) { // Let debugger override number of threads.
1627         int nth = __kmp_omp_num_threads(loc);
1628         if (nth > 0) { // 0 means debugger doesn't want to change num threads
1629           master_set_numthreads = nth;
1630         }
1631       }
1632 #endif
1633 
1634 #if USE_ITT_BUILD
1635       if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1636            KMP_ITT_DEBUG) &&
1637           __kmp_forkjoin_frames_mode == 3 &&
1638           parent_team->t.t_active_level == 1 // only report frames at level 1
1639           && master_th->th.th_teams_size.nteams == 1) {
1640         kmp_uint64 tmp_time = __itt_get_timestamp();
1641         master_th->th.th_frame_time = tmp_time;
1642         parent_team->t.t_region_time = tmp_time;
1643       }
1644       if (__itt_stack_caller_create_ptr) {
1645         // create new stack stitching id before entering fork barrier
1646         parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1647       }
1648 #endif /* USE_ITT_BUILD */
1649 
1650       KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1651                     "master_th=%p, gtid=%d\n",
1652                     root, parent_team, master_th, gtid));
1653       __kmp_internal_fork(loc, gtid, parent_team);
1654       KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1655                     "master_th=%p, gtid=%d\n",
1656                     root, parent_team, master_th, gtid));
1657 
1658       if (call_context == fork_context_gnu)
1659         return TRUE;
1660 
1661       /* Invoke microtask for MASTER thread */
1662       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1663                     parent_team->t.t_id, parent_team->t.t_pkfn));
1664 
1665       if (!parent_team->t.t_invoke(gtid)) {
1666         KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
1667       }
1668       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1669                     parent_team->t.t_id, parent_team->t.t_pkfn));
1670       KMP_MB(); /* Flush all pending memory write invalidates.  */
1671 
1672       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1673 
1674       return TRUE;
1675     } // Parallel closely nested in teams construct
1676 
1677 #if KMP_DEBUG
1678     if (__kmp_tasking_mode != tskm_immediate_exec) {
1679       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1680                        parent_team->t.t_task_team[master_th->th.th_task_state]);
1681     }
1682 #endif
1683 
1684     if (parent_team->t.t_active_level >=
1685         master_th->th.th_current_task->td_icvs.max_active_levels) {
1686       nthreads = 1;
1687     } else {
1688       int enter_teams = ((ap == NULL && active_level == 0) ||
1689                          (ap && teams_level > 0 && teams_level == level));
1690       nthreads =
1691           master_set_numthreads
1692               ? master_set_numthreads
1693               : get__nproc_2(
1694                     parent_team,
1695                     master_tid); // TODO: get nproc directly from current task
1696 
      // Check whether we need to take the forkjoin lock (no need for a
      // serialized parallel outside of a teams construct). This code was moved
      // here from __kmp_reserve_threads() to speed up nested serialized
      // parallels.
1700       if (nthreads > 1) {
1701         if ((get__max_active_levels(master_th) == 1 &&
1702              (root->r.r_in_parallel && !enter_teams)) ||
1703             (__kmp_library == library_serial)) {
1704           KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1705                         " threads\n",
1706                         gtid, nthreads));
1707           nthreads = 1;
1708         }
1709       }
1710       if (nthreads > 1) {
1711         /* determine how many new threads we can use */
1712         __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
        /* AC: If we execute teams from a parallel region (on the host), then
           the teams should be created, but each can have only 1 thread if
           nesting is disabled. If teams is called from a serial region, then
           the teams and their threads should be created regardless of the
           nesting setting. */
1717         nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
1718                                          nthreads, enter_teams);
1719         if (nthreads == 1) {
          // Release the lock for single-threaded execution here; for
          // multi-threaded execution it will be released later, after the team
          // of threads has been created and initialized
1723           __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1724         }
1725       }
1726     }
1727     KMP_DEBUG_ASSERT(nthreads > 0);
1728 
1729     // If we temporarily changed the set number of threads then restore it now
1730     master_th->th.th_set_nproc = 0;
1731 
1732     /* create a serialized parallel region? */
1733     if (nthreads == 1) {
1734 /* josh todo: hypothetical question: what do we do for OS X*? */
1735 #if KMP_OS_LINUX &&                                                            \
1736     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1737       void *args[argc];
1738 #else
1739       void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1740 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1741           KMP_ARCH_AARCH64) */
1742 
1743       KA_TRACE(20,
1744                ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1745 
1746       __kmpc_serialized_parallel(loc, gtid);
1747 
1748       if (call_context == fork_context_intel) {
1749         /* TODO this sucks, use the compiler itself to pass args! :) */
1750         master_th->th.th_serial_team->t.t_ident = loc;
1751         if (!ap) {
1752           // revert change made in __kmpc_serialized_parallel()
1753           master_th->th.th_serial_team->t.t_level--;
1754 // Get args from parent team for teams construct
1755 
1756 #if OMPT_SUPPORT
1757           void *dummy;
1758           void **exit_frame_p;
1759           ompt_task_info_t *task_info;
1760 
1761           ompt_lw_taskteam_t lw_taskteam;
1762 
1763           if (ompt_enabled.enabled) {
1764             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1765                                     &ompt_parallel_data, return_address);
1766 
1767             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
            // don't use lw_taskteam after linking. Content was swapped.
1769 
1770             task_info = OMPT_CUR_TASK_INFO(master_th);
1771             exit_frame_p = &(task_info->frame.exit_frame.ptr);
1772             if (ompt_enabled.ompt_callback_implicit_task) {
1773               OMPT_CUR_TASK_INFO(master_th)
1774                   ->thread_num = __kmp_tid_from_gtid(gtid);
1775               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1776                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1777                   &(task_info->task_data), 1,
1778                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1779                   ompt_task_implicit);
1780             }
1781 
1782             /* OMPT state */
1783             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1784           } else {
1785             exit_frame_p = &dummy;
1786           }
1787 #endif
1788 
1789           {
1790             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1791             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1792             __kmp_invoke_microtask(microtask, gtid, 0, argc,
1793                                    parent_team->t.t_argv
1794 #if OMPT_SUPPORT
1795                                    ,
1796                                    exit_frame_p
1797 #endif
1798                                    );
1799           }
1800 
1801 #if OMPT_SUPPORT
1802           if (ompt_enabled.enabled) {
1803             *exit_frame_p = NULL;
1804             if (ompt_enabled.ompt_callback_implicit_task) {
1805               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1806                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1807                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1808                   ompt_task_implicit);
1809             }
1810             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1811             __ompt_lw_taskteam_unlink(master_th);
1812             if (ompt_enabled.ompt_callback_parallel_end) {
1813               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1814                   &ompt_parallel_data, parent_task_data,
1815                   OMPT_INVOKER(call_context) | ompt_parallel_team,
1816                   return_address);
1817             }
1818             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1819           }
1820 #endif
1821         } else if (microtask == (microtask_t)__kmp_teams_master) {
1822           KMP_DEBUG_ASSERT(master_th->th.th_team ==
1823                            master_th->th.th_serial_team);
1824           team = master_th->th.th_team;
1825           // team->t.t_pkfn = microtask;
1826           team->t.t_invoke = invoker;
1827           __kmp_alloc_argv_entries(argc, team, TRUE);
1828           team->t.t_argc = argc;
1829           argv = (void **)team->t.t_argv;
1830           if (ap) {
1831             for (i = argc - 1; i >= 0; --i)
1832               *argv++ = va_arg(kmp_va_deref(ap), void *);
1833           } else {
1834             for (i = 0; i < argc; ++i)
1835               // Get args from parent team for teams construct
1836               argv[i] = parent_team->t.t_argv[i];
1837           }
1838           // AC: revert change made in __kmpc_serialized_parallel()
1839           //     because initial code in teams should have level=0
1840           team->t.t_level--;
1841           // AC: call special invoker for outer "parallel" of teams construct
1842           invoker(gtid);
1843 #if OMPT_SUPPORT
1844           if (ompt_enabled.enabled) {
1845             ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1846             if (ompt_enabled.ompt_callback_implicit_task) {
1847               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1848                   ompt_scope_end, NULL, &(task_info->task_data), 0,
1849                   OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1850             }
1851             if (ompt_enabled.ompt_callback_parallel_end) {
1852               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1853                   &ompt_parallel_data, parent_task_data,
1854                   OMPT_INVOKER(call_context) | ompt_parallel_league,
1855                   return_address);
1856             }
1857             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1858           }
1859 #endif
1860         } else {
1861           argv = args;
1862           for (i = argc - 1; i >= 0; --i)
1863             *argv++ = va_arg(kmp_va_deref(ap), void *);
1864           KMP_MB();
1865 
1866 #if OMPT_SUPPORT
1867           void *dummy;
1868           void **exit_frame_p;
1869           ompt_task_info_t *task_info;
1870 
1871           ompt_lw_taskteam_t lw_taskteam;
1872 
1873           if (ompt_enabled.enabled) {
1874             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1875                                     &ompt_parallel_data, return_address);
1876             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
            // don't use lw_taskteam after linking. Content was swapped.
1878             task_info = OMPT_CUR_TASK_INFO(master_th);
1879             exit_frame_p = &(task_info->frame.exit_frame.ptr);
1880 
1881             /* OMPT implicit task begin */
1882             implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1883             if (ompt_enabled.ompt_callback_implicit_task) {
1884               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1885                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1886                   implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1887                   ompt_task_implicit);
1888               OMPT_CUR_TASK_INFO(master_th)
1889                   ->thread_num = __kmp_tid_from_gtid(gtid);
1890             }
1891 
1892             /* OMPT state */
1893             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1894           } else {
1895             exit_frame_p = &dummy;
1896           }
1897 #endif
1898 
1899           {
1900             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1901             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1902             __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1903 #if OMPT_SUPPORT
1904                                    ,
1905                                    exit_frame_p
1906 #endif
1907                                    );
1908           }
1909 
1910 #if OMPT_SUPPORT
1911           if (ompt_enabled.enabled) {
1912             *exit_frame_p = NULL;
1913             if (ompt_enabled.ompt_callback_implicit_task) {
1914               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1915                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1916                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1917                   ompt_task_implicit);
1918             }
1919 
1920             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1921             __ompt_lw_taskteam_unlink(master_th);
1922             if (ompt_enabled.ompt_callback_parallel_end) {
1923               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1924                   &ompt_parallel_data, parent_task_data,
1925                   OMPT_INVOKER(call_context) | ompt_parallel_team,
1926                   return_address);
1927             }
1928             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1929           }
1930 #endif
1931         }
1932       } else if (call_context == fork_context_gnu) {
1933 #if OMPT_SUPPORT
1934         ompt_lw_taskteam_t lwt;
1935         __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1936                                 return_address);
1937 
1938         lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1939         __ompt_lw_taskteam_link(&lwt, master_th, 1);
// don't use lw_taskteam after linking. Content was swapped.
1941 #endif
1942 
1943         // we were called from GNU native code
1944         KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1945         return FALSE;
1946       } else {
1947         KMP_ASSERT2(call_context < fork_context_last,
1948                     "__kmp_fork_call: unknown fork_context parameter");
1949       }
1950 
1951       KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1952       KMP_MB();
1953       return FALSE;
1954     } // if (nthreads == 1)
1955 
1956     // GEH: only modify the executing flag in the case when not serialized
1957     //      serialized case is handled in kmpc_serialized_parallel
1958     KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1959                   "curtask=%p, curtask_max_aclevel=%d\n",
1960                   parent_team->t.t_active_level, master_th,
1961                   master_th->th.th_current_task,
1962                   master_th->th.th_current_task->td_icvs.max_active_levels));
1963     // TODO: GEH - cannot do this assertion because root thread not set up as
1964     // executing
1965     // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1966     master_th->th.th_current_task->td_flags.executing = 0;
1967 
1968     if (!master_th->th.th_teams_microtask || level > teams_level) {
1969       /* Increment our nested depth level */
1970       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1971     }
1972 
1973     // See if we need to make a copy of the ICVs.
1974     int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1975     if ((level + 1 < __kmp_nested_nth.used) &&
1976         (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1977       nthreads_icv = __kmp_nested_nth.nth[level + 1];
1978     } else {
1979       nthreads_icv = 0; // don't update
1980     }
1981 
1982     // Figure out the proc_bind_policy for the new team.
1983     kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1984     kmp_proc_bind_t proc_bind_icv =
1985         proc_bind_default; // proc_bind_default means don't update
1986     if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1987       proc_bind = proc_bind_false;
1988     } else {
1989       if (proc_bind == proc_bind_default) {
1990         // No proc_bind clause specified; use current proc-bind-var for this
1991         // parallel region
1992         proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1993       }
1994       /* else: The proc_bind policy was specified explicitly on parallel clause.
1995          This overrides proc-bind-var for this parallel region, but does not
1996          change proc-bind-var. */
1997       // Figure the value of proc-bind-var for the child threads.
1998       if ((level + 1 < __kmp_nested_proc_bind.used) &&
1999           (__kmp_nested_proc_bind.bind_types[level + 1] !=
2000            master_th->th.th_current_task->td_icvs.proc_bind)) {
2001         proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2002       }
2003     }
2004 
2005     // Reset for next parallel region
2006     master_th->th.th_set_proc_bind = proc_bind_default;
2007 
2008     if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
2009       kmp_internal_control_t new_icvs;
2010       copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2011       new_icvs.next = NULL;
2012       if (nthreads_icv > 0) {
2013         new_icvs.nproc = nthreads_icv;
2014       }
2015       if (proc_bind_icv != proc_bind_default) {
2016         new_icvs.proc_bind = proc_bind_icv;
2017       }
2018 
2019       /* allocate a new parallel team */
2020       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2021       team = __kmp_allocate_team(root, nthreads, nthreads,
2022 #if OMPT_SUPPORT
2023                                  ompt_parallel_data,
2024 #endif
2025                                  proc_bind, &new_icvs,
2026                                  argc USE_NESTED_HOT_ARG(master_th));
2027     } else {
2028       /* allocate a new parallel team */
2029       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2030       team = __kmp_allocate_team(root, nthreads, nthreads,
2031 #if OMPT_SUPPORT
2032                                  ompt_parallel_data,
2033 #endif
2034                                  proc_bind,
2035                                  &master_th->th.th_current_task->td_icvs,
2036                                  argc USE_NESTED_HOT_ARG(master_th));
2037     }
2038     KF_TRACE(
2039         10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2040 
2041     /* setup the new team */
2042     KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2043     KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2044     KMP_CHECK_UPDATE(team->t.t_ident, loc);
2045     KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2046     KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2047 #if OMPT_SUPPORT
2048     KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2049                           return_address);
2050 #endif
2051     KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2052     // TODO: parent_team->t.t_level == INT_MAX ???
2053     if (!master_th->th.th_teams_microtask || level > teams_level) {
2054       int new_level = parent_team->t.t_level + 1;
2055       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2056       new_level = parent_team->t.t_active_level + 1;
2057       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2058     } else {
2059       // AC: Do not increase parallel level at start of the teams construct
2060       int new_level = parent_team->t.t_level;
2061       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2062       new_level = parent_team->t.t_active_level;
2063       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2064     }
2065     kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2066     // set master's schedule as new run-time schedule
2067     KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2068 
2069     KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2070     KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2071 
2072     // Update the floating point rounding in the team if required.
2073     propagateFPControl(team);
2074 
2075     if (__kmp_tasking_mode != tskm_immediate_exec) {
2076       // Set master's task team to team's task team. Unless this is hot team, it
2077       // should be NULL.
2078       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2079                        parent_team->t.t_task_team[master_th->th.th_task_state]);
2080       KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
2081                     "%p, new task_team %p / team %p\n",
2082                     __kmp_gtid_from_thread(master_th),
2083                     master_th->th.th_task_team, parent_team,
2084                     team->t.t_task_team[master_th->th.th_task_state], team));
2085 
2086       if (active_level || master_th->th.th_task_team) {
2087         // Take a memo of master's task_state
2088         KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2089         if (master_th->th.th_task_state_top >=
2090             master_th->th.th_task_state_stack_sz) { // increase size
2091           kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2092           kmp_uint8 *old_stack, *new_stack;
2093           kmp_uint32 i;
2094           new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2095           for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2096             new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2097           }
2098           for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2099                ++i) { // zero-init rest of stack
2100             new_stack[i] = 0;
2101           }
2102           old_stack = master_th->th.th_task_state_memo_stack;
2103           master_th->th.th_task_state_memo_stack = new_stack;
2104           master_th->th.th_task_state_stack_sz = new_size;
2105           __kmp_free(old_stack);
2106         }
2107         // Store master's task_state on stack
2108         master_th->th
2109             .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2110             master_th->th.th_task_state;
2111         master_th->th.th_task_state_top++;
2112 #if KMP_NESTED_HOT_TEAMS
2113         if (master_th->th.th_hot_teams &&
2114             active_level < __kmp_hot_teams_max_level &&
2115             team == master_th->th.th_hot_teams[active_level].hot_team) {
2116           // Restore master's nested state if nested hot team
2117           master_th->th.th_task_state =
2118               master_th->th
2119                   .th_task_state_memo_stack[master_th->th.th_task_state_top];
2120         } else {
2121 #endif
2122           master_th->th.th_task_state = 0;
2123 #if KMP_NESTED_HOT_TEAMS
2124         }
2125 #endif
2126       }
2127 #if !KMP_NESTED_HOT_TEAMS
2128       KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2129                        (team == root->r.r_hot_team));
2130 #endif
2131     }
2132 
2133     KA_TRACE(
2134         20,
2135         ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2136          gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2137          team->t.t_nproc));
2138     KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2139                      (team->t.t_master_tid == 0 &&
2140                       (team->t.t_parent == root->r.r_root_team ||
2141                        team->t.t_parent->t.t_serialized)));
2142     KMP_MB();
2143 
2144     /* now, setup the arguments */
2145     argv = (void **)team->t.t_argv;
2146     if (ap) {
2147       for (i = argc - 1; i >= 0; --i) {
2148         void *new_argv = va_arg(kmp_va_deref(ap), void *);
2149         KMP_CHECK_UPDATE(*argv, new_argv);
2150         argv++;
2151       }
2152     } else {
2153       for (i = 0; i < argc; ++i) {
2154         // Get args from parent team for teams construct
2155         KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2156       }
2157     }
2158 
2159     /* now actually fork the threads */
2160     KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2161     if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2162       root->r.r_active = TRUE;
2163 
2164     __kmp_fork_team_threads(root, team, master_th, gtid);
2165     __kmp_setup_icv_copy(team, nthreads,
2166                          &master_th->th.th_current_task->td_icvs, loc);
2167 
2168 #if OMPT_SUPPORT
2169     master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2170 #endif
2171 
2172     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2173 
2174 #if USE_ITT_BUILD
2175     if (team->t.t_active_level == 1 // only report frames at level 1
2176         && !master_th->th.th_teams_microtask) { // not in teams construct
2177 #if USE_ITT_NOTIFY
2178       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2179           (__kmp_forkjoin_frames_mode == 3 ||
2180            __kmp_forkjoin_frames_mode == 1)) {
2181         kmp_uint64 tmp_time = 0;
2182         if (__itt_get_timestamp_ptr)
2183           tmp_time = __itt_get_timestamp();
2184         // Internal fork - report frame begin
2185         master_th->th.th_frame_time = tmp_time;
2186         if (__kmp_forkjoin_frames_mode == 3)
2187           team->t.t_region_time = tmp_time;
2188       } else
2189 // only one notification scheme (either "submit" or "forking/joined", not both)
2190 #endif /* USE_ITT_NOTIFY */
2191           if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2192               __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2193         // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2194         __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2195       }
2196     }
2197 #endif /* USE_ITT_BUILD */
2198 
2199     /* now go on and do the work */
2200     KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2201     KMP_MB();
2202     KF_TRACE(10,
2203              ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2204               root, team, master_th, gtid));
2205 
2206 #if USE_ITT_BUILD
2207     if (__itt_stack_caller_create_ptr) {
2208       team->t.t_stack_id =
2209           __kmp_itt_stack_caller_create(); // create new stack stitching id
2210       // before entering fork barrier
2211     }
2212 #endif /* USE_ITT_BUILD */
2213 
2214     // AC: skip __kmp_internal_fork at teams construct, let only master
2215     // threads execute
2216     if (ap) {
2217       __kmp_internal_fork(loc, gtid, team);
2218       KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2219                     "master_th=%p, gtid=%d\n",
2220                     root, team, master_th, gtid));
2221     }
2222 
2223     if (call_context == fork_context_gnu) {
2224       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2225       return TRUE;
2226     }
2227 
2228     /* Invoke microtask for MASTER thread */
2229     KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2230                   team->t.t_id, team->t.t_pkfn));
2231   } // END of timer KMP_fork_call block
2232 
2233 #if KMP_STATS_ENABLED
2234   // If beginning a teams construct, then change thread state
2235   stats_state_e previous_state = KMP_GET_THREAD_STATE();
2236   if (!ap) {
2237     KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2238   }
2239 #endif
2240 
2241   if (!team->t.t_invoke(gtid)) {
2242     KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
2243   }
2244 
2245 #if KMP_STATS_ENABLED
  // If this was the beginning of a teams construct, reset the thread state
2247   if (!ap) {
2248     KMP_SET_THREAD_STATE(previous_state);
2249   }
2250 #endif
2251 
2252   KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2253                 team->t.t_id, team->t.t_pkfn));
2254   KMP_MB(); /* Flush all pending memory write invalidates.  */
2255 
2256   KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2257 
2258 #if OMPT_SUPPORT
2259   if (ompt_enabled.enabled) {
2260     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2261   }
2262 #endif
2263 
2264   return TRUE;
2265 }
2266 
2267 #if OMPT_SUPPORT
2268 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2269                                             kmp_team_t *team) {
2270   // restore state outside the region
2271   thread->th.ompt_thread_info.state =
2272       ((team->t.t_serialized) ? ompt_state_work_serial
2273                               : ompt_state_work_parallel);
2274 }
2275 
2276 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2277                                    kmp_team_t *team, ompt_data_t *parallel_data,
2278                                    int flags, void *codeptr) {
2279   ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2280   if (ompt_enabled.ompt_callback_parallel_end) {
2281     ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2282         parallel_data, &(task_info->task_data), flags, codeptr);
2283   }
2284 
2285   task_info->frame.enter_frame = ompt_data_none;
2286   __kmp_join_restore_state(thread, team);
2287 }
2288 #endif
2289 
2290 void __kmp_join_call(ident_t *loc, int gtid
2291 #if OMPT_SUPPORT
2292                      ,
2293                      enum fork_context_e fork_context
2294 #endif
2295                      ,
2296                      int exit_teams) {
2297   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2298   kmp_team_t *team;
2299   kmp_team_t *parent_team;
2300   kmp_info_t *master_th;
2301   kmp_root_t *root;
2302   int master_active;
2303 
2304   KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2305 
2306   /* setup current data */
2307   master_th = __kmp_threads[gtid];
2308   root = master_th->th.th_root;
2309   team = master_th->th.th_team;
2310   parent_team = team->t.t_parent;
2311 
2312   master_th->th.th_ident = loc;
2313 
2314 #if OMPT_SUPPORT
2315   void *team_microtask = (void *)team->t.t_pkfn;
2316   // For GOMP interface with serialized parallel, need the
2317   // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task
2318   // and end-parallel events.
2319   if (ompt_enabled.enabled &&
2320       !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2321     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2322   }
2323 #endif
2324 
2325 #if KMP_DEBUG
2326   if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2327     KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2328                   "th_task_team = %p\n",
2329                   __kmp_gtid_from_thread(master_th), team,
2330                   team->t.t_task_team[master_th->th.th_task_state],
2331                   master_th->th.th_task_team));
2332     KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2333                      team->t.t_task_team[master_th->th.th_task_state]);
2334   }
2335 #endif
2336 
2337   if (team->t.t_serialized) {
2338     if (master_th->th.th_teams_microtask) {
2339       // We are in teams construct
2340       int level = team->t.t_level;
2341       int tlevel = master_th->th.th_teams_level;
2342       if (level == tlevel) {
2343         // AC: we haven't incremented it earlier at start of teams construct,
2344         //     so do it here - at the end of teams construct
2345         team->t.t_level++;
2346       } else if (level == tlevel + 1) {
2347         // AC: we are exiting parallel inside teams, need to increment
2348         // serialization in order to restore it in the next call to
2349         // __kmpc_end_serialized_parallel
2350         team->t.t_serialized++;
2351       }
2352     }
2353     __kmpc_end_serialized_parallel(loc, gtid);
2354 
2355 #if OMPT_SUPPORT
2356     if (ompt_enabled.enabled) {
2357       __kmp_join_restore_state(master_th, parent_team);
2358     }
2359 #endif
2360 
2361     return;
2362   }
2363 
2364   master_active = team->t.t_master_active;
2365 
2366   if (!exit_teams) {
2367     // AC: No barrier for internal teams at exit from teams construct.
2368     //     But there is barrier for external team (league).
2369     __kmp_internal_join(loc, gtid, team);
2370   } else {
2371     master_th->th.th_task_state =
2372         0; // AC: no tasking in teams (out of any parallel)
2373   }
2374 
2375   KMP_MB();
2376 
2377 #if OMPT_SUPPORT
2378   ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2379   void *codeptr = team->t.ompt_team_info.master_return_address;
2380 #endif
2381 
2382 #if USE_ITT_BUILD
2383   if (__itt_stack_caller_create_ptr) {
2384     // destroy the stack stitching id after join barrier
2385     __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2386   }
2387   // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2388   if (team->t.t_active_level == 1 &&
2389       (!master_th->th.th_teams_microtask || /* not in teams construct */
2390        master_th->th.th_teams_size.nteams == 1)) {
2391     master_th->th.th_ident = loc;
2392     // only one notification scheme (either "submit" or "forking/joined", not
2393     // both)
2394     if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2395         __kmp_forkjoin_frames_mode == 3)
2396       __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2397                              master_th->th.th_frame_time, 0, loc,
2398                              master_th->th.th_team_nproc, 1);
2399     else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2400              !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2401       __kmp_itt_region_joined(gtid);
2402   } // active_level == 1
2403 #endif /* USE_ITT_BUILD */
2404 
2405   if (master_th->th.th_teams_microtask && !exit_teams &&
2406       team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2407       team->t.t_level == master_th->th.th_teams_level + 1) {
2408 // AC: We need to leave the team structure intact at the end of parallel
2409 // inside the teams construct, so that at the next parallel same (hot) team
2410 // works, only adjust nesting levels
2411 #if OMPT_SUPPORT
2412     ompt_data_t ompt_parallel_data = ompt_data_none;
2413     if (ompt_enabled.enabled) {
2414       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2415       if (ompt_enabled.ompt_callback_implicit_task) {
2416         int ompt_team_size = team->t.t_nproc;
2417         ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2418             ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2419             OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2420       }
2421       task_info->frame.exit_frame = ompt_data_none;
2422       task_info->task_data = ompt_data_none;
2423       ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2424       __ompt_lw_taskteam_unlink(master_th);
2425     }
2426 #endif
2427     /* Decrement our nested depth level */
2428     team->t.t_level--;
2429     team->t.t_active_level--;
2430     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2431 
2432     // Restore number of threads in the team if needed. This code relies on
2433     // the proper adjustment of th_teams_size.nth after the fork in
2434     // __kmp_teams_master on each teams master in the case that
2435     // __kmp_reserve_threads reduced it.
2436     if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2437       int old_num = master_th->th.th_team_nproc;
2438       int new_num = master_th->th.th_teams_size.nth;
2439       kmp_info_t **other_threads = team->t.t_threads;
2440       team->t.t_nproc = new_num;
2441       for (int i = 0; i < old_num; ++i) {
2442         other_threads[i]->th.th_team_nproc = new_num;
2443       }
      // Adjust the state of the unused threads of the team
2445       for (int i = old_num; i < new_num; ++i) {
2446         // Re-initialize thread's barrier data.
2447         KMP_DEBUG_ASSERT(other_threads[i]);
2448         kmp_balign_t *balign = other_threads[i]->th.th_bar;
2449         for (int b = 0; b < bs_last_barrier; ++b) {
2450           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2451           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2452 #if USE_DEBUGGER
2453           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2454 #endif
2455         }
2456         if (__kmp_tasking_mode != tskm_immediate_exec) {
2457           // Synchronize thread's task state
2458           other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2459         }
2460       }
2461     }
2462 
2463 #if OMPT_SUPPORT
2464     if (ompt_enabled.enabled) {
2465       __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2466                       OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2467     }
2468 #endif
2469 
2470     return;
2471   }
2472 
2473   /* do cleanup and restore the parent team */
2474   master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2475   master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2476 
2477   master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2478 
2479   /* jc: The following lock has instructions with REL and ACQ semantics,
2480      separating the parallel user code called in this parallel region
2481      from the serial user code called after this function returns. */
2482   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2483 
2484   if (!master_th->th.th_teams_microtask ||
2485       team->t.t_level > master_th->th.th_teams_level) {
2486     /* Decrement our nested depth level */
2487     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2488   }
2489   KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2490 
2491 #if OMPT_SUPPORT
2492   if (ompt_enabled.enabled) {
2493     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2494     if (ompt_enabled.ompt_callback_implicit_task) {
2495       int flags = (team_microtask == (void *)__kmp_teams_master)
2496                       ? ompt_task_initial
2497                       : ompt_task_implicit;
2498       int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2499       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2500           ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2501           OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2502     }
2503     task_info->frame.exit_frame = ompt_data_none;
2504     task_info->task_data = ompt_data_none;
2505   }
2506 #endif
2507 
2508   KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2509                 master_th, team));
2510   __kmp_pop_current_task_from_thread(master_th);
2511 
2512 #if KMP_AFFINITY_SUPPORTED
2513   // Restore master thread's partition.
2514   master_th->th.th_first_place = team->t.t_first_place;
2515   master_th->th.th_last_place = team->t.t_last_place;
2516 #endif // KMP_AFFINITY_SUPPORTED
2517   master_th->th.th_def_allocator = team->t.t_def_allocator;
2518 
2519   updateHWFPControl(team);
2520 
2521   if (root->r.r_active != master_active)
2522     root->r.r_active = master_active;
2523 
2524   __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2525                             master_th)); // this will free worker threads
2526 
  /* This race was fun to find. Make sure the following is inside the critical
     region; otherwise assertions may fail occasionally because the old team
     may be reallocated and the hierarchy appears inconsistent. It is actually
     safe to run and won't cause any bugs, but it does trigger those assertion
     failures. It's only one dereference and assignment, so we might as well
     keep it in the critical region. */
2532   master_th->th.th_team = parent_team;
2533   master_th->th.th_team_nproc = parent_team->t.t_nproc;
2534   master_th->th.th_team_master = parent_team->t.t_threads[0];
2535   master_th->th.th_team_serialized = parent_team->t.t_serialized;
2536 
2537   /* restore serialized team, if need be */
2538   if (parent_team->t.t_serialized &&
2539       parent_team != master_th->th.th_serial_team &&
2540       parent_team != root->r.r_root_team) {
2541     __kmp_free_team(root,
2542                     master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2543     master_th->th.th_serial_team = parent_team;
2544   }
2545 
2546   if (__kmp_tasking_mode != tskm_immediate_exec) {
2547     if (master_th->th.th_task_state_top >
2548         0) { // Restore task state from memo stack
2549       KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2550       // Remember master's state if we re-use this nested hot team
2551       master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2552           master_th->th.th_task_state;
2553       --master_th->th.th_task_state_top; // pop
2554       // Now restore state at this level
2555       master_th->th.th_task_state =
2556           master_th->th
2557               .th_task_state_memo_stack[master_th->th.th_task_state_top];
2558     }
2559     // Copy the task team from the parent team to the master thread
2560     master_th->th.th_task_team =
2561         parent_team->t.t_task_team[master_th->th.th_task_state];
2562     KA_TRACE(20,
2563              ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2564               __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2565               parent_team));
2566   }
2567 
2568   // TODO: GEH - cannot do this assertion because root thread not set up as
2569   // executing
2570   // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2571   master_th->th.th_current_task->td_flags.executing = 1;
2572 
2573   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2574 
2575 #if OMPT_SUPPORT
2576   int flags =
2577       OMPT_INVOKER(fork_context) |
2578       ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2579                                                       : ompt_parallel_team);
2580   if (ompt_enabled.enabled) {
2581     __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2582                     codeptr);
2583   }
2584 #endif
2585 
2586   KMP_MB();
2587   KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2588 }
2589 
2590 /* Check whether we should push an internal control record onto the
2591    serial team stack.  If so, do it.  */
2592 void __kmp_save_internal_controls(kmp_info_t *thread) {
2593 
2594   if (thread->th.th_team != thread->th.th_serial_team) {
2595     return;
2596   }
2597   if (thread->th.th_team->t.t_serialized > 1) {
2598     int push = 0;
2599 
2600     if (thread->th.th_team->t.t_control_stack_top == NULL) {
2601       push = 1;
2602     } else {
2603       if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2604           thread->th.th_team->t.t_serialized) {
2605         push = 1;
2606       }
2607     }
2608     if (push) { /* push a record on the serial team's stack */
2609       kmp_internal_control_t *control =
2610           (kmp_internal_control_t *)__kmp_allocate(
2611               sizeof(kmp_internal_control_t));
2612 
2613       copy_icvs(control, &thread->th.th_current_task->td_icvs);
2614 
2615       control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2616 
2617       control->next = thread->th.th_team->t.t_control_stack_top;
2618       thread->th.th_team->t.t_control_stack_top = control;
2619     }
2620   }
2621 }
2622 
2623 /* Changes set_nproc */
2624 void __kmp_set_num_threads(int new_nth, int gtid) {
2625   kmp_info_t *thread;
2626   kmp_root_t *root;
2627 
2628   KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2629   KMP_DEBUG_ASSERT(__kmp_init_serial);
2630 
2631   if (new_nth < 1)
2632     new_nth = 1;
2633   else if (new_nth > __kmp_max_nth)
2634     new_nth = __kmp_max_nth;
2635 
2636   KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2637   thread = __kmp_threads[gtid];
2638   if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2639     return; // nothing to do
2640 
2641   __kmp_save_internal_controls(thread);
2642 
2643   set__nproc(thread, new_nth);
2644 
2645   // If this omp_set_num_threads() call will cause the hot team size to be
2646   // reduced (in the absence of a num_threads clause), then reduce it now,
2647   // rather than waiting for the next parallel region.
2648   root = thread->th.th_root;
2649   if (__kmp_init_parallel && (!root->r.r_active) &&
2650       (root->r.r_hot_team->t.t_nproc > new_nth)
2651 #if KMP_NESTED_HOT_TEAMS
2652       && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2653 #endif
2654       ) {
2655     kmp_team_t *hot_team = root->r.r_hot_team;
2656     int f;
2657 
2658     __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2659 
2660     // Release the extra threads we don't need any more.
2661     for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2662       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2663       if (__kmp_tasking_mode != tskm_immediate_exec) {
        // When decreasing the team size, threads that are no longer part of
        // the team should drop their reference to the task team
2666         hot_team->t.t_threads[f]->th.th_task_team = NULL;
2667       }
2668       __kmp_free_thread(hot_team->t.t_threads[f]);
2669       hot_team->t.t_threads[f] = NULL;
2670     }
2671     hot_team->t.t_nproc = new_nth;
2672 #if KMP_NESTED_HOT_TEAMS
2673     if (thread->th.th_hot_teams) {
2674       KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2675       thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2676     }
2677 #endif
2678 
2679     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2680 
2681     // Update the t_nproc field in the threads that are still active.
2682     for (f = 0; f < new_nth; f++) {
2683       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2684       hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2685     }
    // Special flag to mark that the size change came from an
    // omp_set_num_threads() call
2687     hot_team->t.t_size_changed = -1;
2688   }
2689 }
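
// Illustrative sketch (assumed mapping, not copied from the entry points): the
// user-facing omp_set_num_threads() reaches this routine with the caller's
// gtid, roughly as
//
//   void omp_set_num_threads(int n) {
//     __kmp_set_num_threads(n, __kmp_entry_gtid());
//   }
//
// The value is clamped to [1, __kmp_max_nth] above before the nproc ICV is
// updated, and an oversized hot team is trimmed eagerly rather than waiting
// for the next parallel region.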
2690 
2691 /* Changes max_active_levels */
2692 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2693   kmp_info_t *thread;
2694 
2695   KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2696                 "%d = (%d)\n",
2697                 gtid, max_active_levels));
2698   KMP_DEBUG_ASSERT(__kmp_init_serial);
2699 
2700   // validate max_active_levels
2701   if (max_active_levels < 0) {
2702     KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2703     // We ignore this call if the user has specified a negative value.
2704     // The current setting won't be changed. The last valid setting will be
2705     // used. A warning will be issued (if warnings are allowed as controlled by
2706     // the KMP_WARNINGS env var).
2707     KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2708                   "max_active_levels for thread %d = (%d)\n",
2709                   gtid, max_active_levels));
2710     return;
2711   }
2712   if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2713     // it's OK, the max_active_levels is within the valid range: [ 0;
2714     // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2715     // We allow a zero value. (implementation defined behavior)
2716   } else {
2717     KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2718                 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2719     max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
    // The current upper limit is MAX_INT. (implementation defined behavior)
    // If the input exceeds the upper limit, we clamp it to the upper limit.
    // (implementation defined behavior)
    // In practice, the flow should never reach this branch as long as the
    // limit is MAX_INT.
2724   }
2725   KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2726                 "max_active_levels for thread %d = (%d)\n",
2727                 gtid, max_active_levels));
2728 
2729   thread = __kmp_threads[gtid];
2730 
2731   __kmp_save_internal_controls(thread);
2732 
2733   set__max_active_levels(thread, max_active_levels);
2734 }
2735 
2736 /* Gets max_active_levels */
2737 int __kmp_get_max_active_levels(int gtid) {
2738   kmp_info_t *thread;
2739 
2740   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2741   KMP_DEBUG_ASSERT(__kmp_init_serial);
2742 
2743   thread = __kmp_threads[gtid];
2744   KMP_DEBUG_ASSERT(thread->th.th_current_task);
2745   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2746                 "curtask_maxaclevel=%d\n",
2747                 gtid, thread->th.th_current_task,
2748                 thread->th.th_current_task->td_icvs.max_active_levels));
2749   return thread->th.th_current_task->td_icvs.max_active_levels;
2750 }
2751 
2752 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2753 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2754 
2755 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2756 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2757   kmp_info_t *thread;
2758   kmp_sched_t orig_kind;
2759   //    kmp_team_t *team;
2760 
2761   KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2762                 gtid, (int)kind, chunk));
2763   KMP_DEBUG_ASSERT(__kmp_init_serial);
2764 
2765   // Check if the kind parameter is valid, correct if needed.
2766   // Valid parameters should fit in one of two intervals - standard or extended:
2767   //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2768   // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
2769   orig_kind = kind;
2770   kind = __kmp_sched_without_mods(kind);
2771 
2772   if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2773       (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2774     // TODO: Hint needs attention in case we change the default schedule.
2775     __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2776               KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2777               __kmp_msg_null);
2778     kind = kmp_sched_default;
2779     chunk = 0; // ignore chunk value in case of bad kind
2780   }
2781 
2782   thread = __kmp_threads[gtid];
2783 
2784   __kmp_save_internal_controls(thread);
2785 
2786   if (kind < kmp_sched_upper_std) {
2787     if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
      // differentiate static chunked vs. unchunked: chunk should be invalid to
      // indicate an unchunked schedule (which is the default)
2790       thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2791     } else {
2792       thread->th.th_current_task->td_icvs.sched.r_sched_type =
2793           __kmp_sch_map[kind - kmp_sched_lower - 1];
2794     }
2795   } else {
2796     //    __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2797     //    kmp_sched_lower - 2 ];
2798     thread->th.th_current_task->td_icvs.sched.r_sched_type =
2799         __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2800                       kmp_sched_lower - 2];
2801   }
2802   __kmp_sched_apply_mods_intkind(
2803       orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2804   if (kind == kmp_sched_auto || chunk < 1) {
2805     // ignore parameter chunk for schedule auto
2806     thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2807   } else {
2808     thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2809   }
2810 }
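
// Illustrative example (user-level view): omp_set_schedule(omp_sched_dynamic,
// 4) arrives here with kind == kmp_sched_dynamic and chunk == 4; the standard
// kind is translated through __kmp_sch_map (to kmp_sch_dynamic_chunked in this
// case) and the chunk ICV becomes 4. For kmp_sched_auto, or whenever
// chunk < 1, the chunk ICV falls back to KMP_DEFAULT_CHUNK instead.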
2811 
2812 /* Gets def_sched_var ICV values */
2813 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2814   kmp_info_t *thread;
2815   enum sched_type th_type;
2816 
2817   KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2818   KMP_DEBUG_ASSERT(__kmp_init_serial);
2819 
2820   thread = __kmp_threads[gtid];
2821 
2822   th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2823   switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2824   case kmp_sch_static:
2825   case kmp_sch_static_greedy:
2826   case kmp_sch_static_balanced:
2827     *kind = kmp_sched_static;
2828     __kmp_sched_apply_mods_stdkind(kind, th_type);
2829     *chunk = 0; // chunk was not set, try to show this fact via zero value
2830     return;
2831   case kmp_sch_static_chunked:
2832     *kind = kmp_sched_static;
2833     break;
2834   case kmp_sch_dynamic_chunked:
2835     *kind = kmp_sched_dynamic;
2836     break;
2837   case kmp_sch_guided_chunked:
2838   case kmp_sch_guided_iterative_chunked:
2839   case kmp_sch_guided_analytical_chunked:
2840     *kind = kmp_sched_guided;
2841     break;
2842   case kmp_sch_auto:
2843     *kind = kmp_sched_auto;
2844     break;
2845   case kmp_sch_trapezoidal:
2846     *kind = kmp_sched_trapezoidal;
2847     break;
2848 #if KMP_STATIC_STEAL_ENABLED
2849   case kmp_sch_static_steal:
2850     *kind = kmp_sched_static_steal;
2851     break;
2852 #endif
2853   default:
2854     KMP_FATAL(UnknownSchedulingType, th_type);
2855   }
2856 
2857   __kmp_sched_apply_mods_stdkind(kind, th_type);
2858   *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2859 }
2860 
2861 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2862 
2863   int ii, dd;
2864   kmp_team_t *team;
2865   kmp_info_t *thr;
2866 
2867   KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2868   KMP_DEBUG_ASSERT(__kmp_init_serial);
2869 
2870   // validate level
2871   if (level == 0)
2872     return 0;
2873   if (level < 0)
2874     return -1;
2875   thr = __kmp_threads[gtid];
2876   team = thr->th.th_team;
2877   ii = team->t.t_level;
2878   if (level > ii)
2879     return -1;
2880 
2881   if (thr->th.th_teams_microtask) {
2882     // AC: we are in teams region where multiple nested teams have same level
2883     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2884     if (level <=
2885         tlevel) { // otherwise usual algorithm works (will not touch the teams)
2886       KMP_DEBUG_ASSERT(ii >= tlevel);
2887       // AC: As we need to pass by the teams league, we need to artificially
2888       // increase ii
2889       if (ii == tlevel) {
2890         ii += 2; // three teams have same level
2891       } else {
2892         ii++; // two teams have same level
2893       }
2894     }
2895   }
2896 
2897   if (ii == level)
2898     return __kmp_tid_from_gtid(gtid);
2899 
2900   dd = team->t.t_serialized;
2901   level++;
2902   while (ii > level) {
2903     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2904     }
2905     if ((team->t.t_serialized) && (!dd)) {
2906       team = team->t.t_parent;
2907       continue;
2908     }
2909     if (ii > level) {
2910       team = team->t.t_parent;
2911       dd = team->t.t_serialized;
2912       ii--;
2913     }
2914   }
2915 
2916   return (dd > 1) ? (0) : (team->t.t_master_tid);
2917 }
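
// Illustrative behavior summary (user-level view of
// omp_get_ancestor_thread_num): level 0 always reports thread 0, the current
// nesting level reports the caller's own tid, a negative level or one deeper
// than the current nesting reports -1, and intermediate levels are resolved by
// the parent walk above, which skips over serialized levels.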
2918 
2919 int __kmp_get_team_size(int gtid, int level) {
2920 
2921   int ii, dd;
2922   kmp_team_t *team;
2923   kmp_info_t *thr;
2924 
2925   KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2926   KMP_DEBUG_ASSERT(__kmp_init_serial);
2927 
2928   // validate level
2929   if (level == 0)
2930     return 1;
2931   if (level < 0)
2932     return -1;
2933   thr = __kmp_threads[gtid];
2934   team = thr->th.th_team;
2935   ii = team->t.t_level;
2936   if (level > ii)
2937     return -1;
2938 
2939   if (thr->th.th_teams_microtask) {
2940     // AC: we are in teams region where multiple nested teams have same level
2941     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2942     if (level <=
2943         tlevel) { // otherwise usual algorithm works (will not touch the teams)
2944       KMP_DEBUG_ASSERT(ii >= tlevel);
2945       // AC: As we need to pass by the teams league, we need to artificially
2946       // increase ii
2947       if (ii == tlevel) {
2948         ii += 2; // three teams have same level
2949       } else {
2950         ii++; // two teams have same level
2951       }
2952     }
2953   }
2954 
2955   while (ii > level) {
2956     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2957     }
2958     if (team->t.t_serialized && (!dd)) {
2959       team = team->t.t_parent;
2960       continue;
2961     }
2962     if (ii > level) {
2963       team = team->t.t_parent;
2964       ii--;
2965     }
2966   }
2967 
2968   return team->t.t_nproc;
2969 }
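
// Illustrative behavior summary (user-level view of omp_get_team_size): level
// 0 reports the implicit initial team of size 1, the current nesting level
// reports t_nproc of the caller's team, out-of-range levels report -1, and
// intermediate levels are resolved by the same parent walk used in
// __kmp_get_ancestor_thread_num above.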
2970 
2971 kmp_r_sched_t __kmp_get_schedule_global() {
  // This routine exists because the pairs (__kmp_sched, __kmp_chunk) and
  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
  // independently, so the up-to-date schedule has to be assembled here.
2975 
2976   kmp_r_sched_t r_sched;
2977 
2978   // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
2979   // __kmp_guided. __kmp_sched should keep original value, so that user can set
2980   // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
2981   // different roots (even in OMP 2.5)
2982   enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
2983   enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
2984   if (s == kmp_sch_static) {
2985     // replace STATIC with more detailed schedule (balanced or greedy)
2986     r_sched.r_sched_type = __kmp_static;
2987   } else if (s == kmp_sch_guided_chunked) {
2988     // replace GUIDED with more detailed schedule (iterative or analytical)
2989     r_sched.r_sched_type = __kmp_guided;
2990   } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2991     r_sched.r_sched_type = __kmp_sched;
2992   }
2993   SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
2994 
2995   if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
2996     // __kmp_chunk may be wrong here (if it was not ever set)
2997     r_sched.chunk = KMP_DEFAULT_CHUNK;
2998   } else {
2999     r_sched.chunk = __kmp_chunk;
3000   }
3001 
3002   return r_sched;
3003 }
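
// Illustrative example: if __kmp_sched is plain kmp_sch_static and __kmp_chunk
// was never set (and so compares below KMP_DEFAULT_CHUNK), the routine returns
// { __kmp_static, KMP_DEFAULT_CHUNK }, i.e. the detailed static flavor
// (balanced or greedy) chosen at initialization rather than the generic
// kmp_sch_static.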
3004 
/* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
   at least argc t_argv entries for the requested team. */
3007 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3008 
3009   KMP_DEBUG_ASSERT(team);
3010   if (!realloc || argc > team->t.t_max_argc) {
3011 
3012     KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3013                    "current entries=%d\n",
3014                    team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3015     /* if previously allocated heap space for args, free them */
3016     if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3017       __kmp_free((void *)team->t.t_argv);
3018 
3019     if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3020       /* use unused space in the cache line for arguments */
3021       team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3022       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3023                      "argv entries\n",
3024                      team->t.t_id, team->t.t_max_argc));
3025       team->t.t_argv = &team->t.t_inline_argv[0];
3026       if (__kmp_storage_map) {
3027         __kmp_print_storage_map_gtid(
3028             -1, &team->t.t_inline_argv[0],
3029             &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3030             (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3031             team->t.t_id);
3032       }
3033     } else {
3034       /* allocate space for arguments in the heap */
3035       team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3036                                ? KMP_MIN_MALLOC_ARGV_ENTRIES
3037                                : 2 * argc;
3038       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3039                      "argv entries\n",
3040                      team->t.t_id, team->t.t_max_argc));
3041       team->t.t_argv =
3042           (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3043       if (__kmp_storage_map) {
3044         __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3045                                      &team->t.t_argv[team->t.t_max_argc],
3046                                      sizeof(void *) * team->t.t_max_argc,
3047                                      "team_%d.t_argv", team->t.t_id);
3048       }
3049     }
3050   }
3051 }
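
// Worked sizing example (illustrative; assumes the two macros keep their usual
// roles of inline capacity and minimum heap capacity):
//   argc <= KMP_INLINE_ARGV_ENTRIES          -> use t_inline_argv, no heap
//   argc <= KMP_MIN_MALLOC_ARGV_ENTRIES / 2  -> heap, KMP_MIN_MALLOC_ARGV_ENTRIES slots
//   larger argc                              -> heap, 2 * argc slots
// Growing to 2 * argc amortizes reallocations when argument counts creep up
// across successive parallel regions.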
3052 
3053 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3054   int i;
3055   int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3056   team->t.t_threads =
3057       (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3058   team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3059       sizeof(dispatch_shared_info_t) * num_disp_buff);
3060   team->t.t_dispatch =
3061       (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3062   team->t.t_implicit_task_taskdata =
3063       (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3064   team->t.t_max_nproc = max_nth;
3065 
3066   /* setup dispatch buffers */
3067   for (i = 0; i < num_disp_buff; ++i) {
3068     team->t.t_disp_buffer[i].buffer_index = i;
3069     team->t.t_disp_buffer[i].doacross_buf_idx = i;
3070   }
3071 }
3072 
3073 static void __kmp_free_team_arrays(kmp_team_t *team) {
3074   /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3075   int i;
3076   for (i = 0; i < team->t.t_max_nproc; ++i) {
3077     if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3078       __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3079       team->t.t_dispatch[i].th_disp_buffer = NULL;
3080     }
3081   }
3082 #if KMP_USE_HIER_SCHED
3083   __kmp_dispatch_free_hierarchies(team);
3084 #endif
3085   __kmp_free(team->t.t_threads);
3086   __kmp_free(team->t.t_disp_buffer);
3087   __kmp_free(team->t.t_dispatch);
3088   __kmp_free(team->t.t_implicit_task_taskdata);
3089   team->t.t_threads = NULL;
3090   team->t.t_disp_buffer = NULL;
3091   team->t.t_dispatch = NULL;
  team->t.t_implicit_task_taskdata = NULL;
3093 }
3094 
3095 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3096   kmp_info_t **oldThreads = team->t.t_threads;
3097 
3098   __kmp_free(team->t.t_disp_buffer);
3099   __kmp_free(team->t.t_dispatch);
3100   __kmp_free(team->t.t_implicit_task_taskdata);
3101   __kmp_allocate_team_arrays(team, max_nth);
3102 
3103   KMP_MEMCPY(team->t.t_threads, oldThreads,
3104              team->t.t_nproc * sizeof(kmp_info_t *));
3105 
3106   __kmp_free(oldThreads);
3107 }
3108 
3109 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3110 
3111   kmp_r_sched_t r_sched =
3112       __kmp_get_schedule_global(); // get current state of scheduling globals
3113 
3114   KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3115 
3116   kmp_internal_control_t g_icvs = {
3117     0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3118     (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3119     // adjustment of threads (per thread)
3120     (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3121     // whether blocktime is explicitly set
3122     __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3123 #if KMP_USE_MONITOR
3124     __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3125 // intervals
3126 #endif
3127     __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3128     // next parallel region (per thread)
3129     // (use a max ub on value if __kmp_parallel_initialize not called yet)
3130     __kmp_cg_max_nth, // int thread_limit;
3131     __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3132     // for max_active_levels
3133     r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3134     // {sched,chunk} pair
3135     __kmp_nested_proc_bind.bind_types[0],
3136     __kmp_default_device,
3137     NULL // struct kmp_internal_control *next;
3138   };
3139 
3140   return g_icvs;
3141 }
3142 
3143 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3144 
3145   kmp_internal_control_t gx_icvs;
  gx_icvs.serial_nesting_level =
      0; // probably should be team->t.t_serialized, as in
         // __kmp_save_internal_controls
3148   copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3149   gx_icvs.next = NULL;
3150 
3151   return gx_icvs;
3152 }
3153 
3154 static void __kmp_initialize_root(kmp_root_t *root) {
3155   int f;
3156   kmp_team_t *root_team;
3157   kmp_team_t *hot_team;
3158   int hot_team_max_nth;
3159   kmp_r_sched_t r_sched =
3160       __kmp_get_schedule_global(); // get current state of scheduling globals
3161   kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3162   KMP_DEBUG_ASSERT(root);
3163   KMP_ASSERT(!root->r.r_begin);
3164 
3165   /* setup the root state structure */
3166   __kmp_init_lock(&root->r.r_begin_lock);
3167   root->r.r_begin = FALSE;
3168   root->r.r_active = FALSE;
3169   root->r.r_in_parallel = 0;
3170   root->r.r_blocktime = __kmp_dflt_blocktime;
3171 
3172   /* setup the root team for this task */
3173   /* allocate the root team structure */
3174   KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3175 
3176   root_team =
3177       __kmp_allocate_team(root,
3178                           1, // new_nproc
3179                           1, // max_nproc
3180 #if OMPT_SUPPORT
3181                           ompt_data_none, // root parallel id
3182 #endif
3183                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3184                           0 // argc
3185                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3186                           );
3187 #if USE_DEBUGGER
3188   // Non-NULL value should be assigned to make the debugger display the root
3189   // team.
3190   TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3191 #endif
3192 
3193   KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3194 
3195   root->r.r_root_team = root_team;
3196   root_team->t.t_control_stack_top = NULL;
3197 
3198   /* initialize root team */
3199   root_team->t.t_threads[0] = NULL;
3200   root_team->t.t_nproc = 1;
3201   root_team->t.t_serialized = 1;
3202   // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3203   root_team->t.t_sched.sched = r_sched.sched;
3204   KA_TRACE(
3205       20,
3206       ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3207        root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3208 
3209   /* setup the  hot team for this task */
3210   /* allocate the hot team structure */
3211   KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3212 
3213   hot_team =
3214       __kmp_allocate_team(root,
3215                           1, // new_nproc
3216                           __kmp_dflt_team_nth_ub * 2, // max_nproc
3217 #if OMPT_SUPPORT
3218                           ompt_data_none, // root parallel id
3219 #endif
3220                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3221                           0 // argc
3222                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3223                           );
3224   KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3225 
3226   root->r.r_hot_team = hot_team;
3227   root_team->t.t_control_stack_top = NULL;
3228 
3229   /* first-time initialization */
3230   hot_team->t.t_parent = root_team;
3231 
3232   /* initialize hot team */
3233   hot_team_max_nth = hot_team->t.t_max_nproc;
3234   for (f = 0; f < hot_team_max_nth; ++f) {
3235     hot_team->t.t_threads[f] = NULL;
3236   }
3237   hot_team->t.t_nproc = 1;
3238   // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3239   hot_team->t.t_sched.sched = r_sched.sched;
3240   hot_team->t.t_size_changed = 0;
3241 }
3242 
3243 #ifdef KMP_DEBUG
3244 
3245 typedef struct kmp_team_list_item {
3246   kmp_team_p const *entry;
3247   struct kmp_team_list_item *next;
3248 } kmp_team_list_item_t;
3249 typedef kmp_team_list_item_t *kmp_team_list_t;
3250 
3251 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3252     kmp_team_list_t list, // List of teams.
3253     kmp_team_p const *team // Team to add.
3254     ) {
3255 
3256   // List must terminate with item where both entry and next are NULL.
3257   // Team is added to the list only once.
3258   // List is sorted in ascending order by team id.
3259   // Team id is *not* a key.
3260 
3261   kmp_team_list_t l;
3262 
3263   KMP_DEBUG_ASSERT(list != NULL);
3264   if (team == NULL) {
3265     return;
3266   }
3267 
3268   __kmp_print_structure_team_accum(list, team->t.t_parent);
3269   __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3270 
3271   // Search list for the team.
3272   l = list;
3273   while (l->next != NULL && l->entry != team) {
3274     l = l->next;
3275   }
3276   if (l->next != NULL) {
3277     return; // Team has been added before, exit.
3278   }
3279 
3280   // Team is not found. Search list again for insertion point.
3281   l = list;
3282   while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3283     l = l->next;
3284   }
3285 
3286   // Insert team.
3287   {
3288     kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3289         sizeof(kmp_team_list_item_t));
3290     *item = *l;
3291     l->entry = team;
3292     l->next = item;
3293   }
3294 }
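
// The insertion above relies on the sentinel convention: the list always ends
// with an item whose entry and next are both NULL, so inserting before
// position l needs no back pointer:
//
//   *item = *l;        // old contents of l (possibly the sentinel) move down
//   l->entry = team;   // l now holds the new team
//   l->next = item;
//
// which preserves the ascending order by t_id.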
3295 
static void __kmp_print_structure_team(char const *title,
                                       kmp_team_p const *team) {
3299   __kmp_printf("%s", title);
3300   if (team != NULL) {
3301     __kmp_printf("%2x %p\n", team->t.t_id, team);
3302   } else {
3303     __kmp_printf(" - (nil)\n");
3304   }
3305 }
3306 
3307 static void __kmp_print_structure_thread(char const *title,
3308                                          kmp_info_p const *thread) {
3309   __kmp_printf("%s", title);
3310   if (thread != NULL) {
3311     __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3312   } else {
3313     __kmp_printf(" - (nil)\n");
3314   }
3315 }
3316 
3317 void __kmp_print_structure(void) {
3318 
3319   kmp_team_list_t list;
3320 
3321   // Initialize list of teams.
3322   list =
3323       (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3324   list->entry = NULL;
3325   list->next = NULL;
3326 
3327   __kmp_printf("\n------------------------------\nGlobal Thread "
3328                "Table\n------------------------------\n");
3329   {
3330     int gtid;
3331     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3332       __kmp_printf("%2d", gtid);
3333       if (__kmp_threads != NULL) {
3334         __kmp_printf(" %p", __kmp_threads[gtid]);
3335       }
3336       if (__kmp_root != NULL) {
3337         __kmp_printf(" %p", __kmp_root[gtid]);
3338       }
3339       __kmp_printf("\n");
3340     }
3341   }
3342 
3343   // Print out __kmp_threads array.
3344   __kmp_printf("\n------------------------------\nThreads\n--------------------"
3345                "----------\n");
3346   if (__kmp_threads != NULL) {
3347     int gtid;
3348     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3349       kmp_info_t const *thread = __kmp_threads[gtid];
3350       if (thread != NULL) {
3351         __kmp_printf("GTID %2d %p:\n", gtid, thread);
3352         __kmp_printf("    Our Root:        %p\n", thread->th.th_root);
3353         __kmp_print_structure_team("    Our Team:     ", thread->th.th_team);
3354         __kmp_print_structure_team("    Serial Team:  ",
3355                                    thread->th.th_serial_team);
3356         __kmp_printf("    Threads:      %2d\n", thread->th.th_team_nproc);
3357         __kmp_print_structure_thread("    Master:       ",
3358                                      thread->th.th_team_master);
3359         __kmp_printf("    Serialized?:  %2d\n", thread->th.th_team_serialized);
3360         __kmp_printf("    Set NProc:    %2d\n", thread->th.th_set_nproc);
3361         __kmp_printf("    Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3362         __kmp_print_structure_thread("    Next in pool: ",
3363                                      thread->th.th_next_pool);
3364         __kmp_printf("\n");
3365         __kmp_print_structure_team_accum(list, thread->th.th_team);
3366         __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3367       }
3368     }
3369   } else {
3370     __kmp_printf("Threads array is not allocated.\n");
3371   }
3372 
3373   // Print out __kmp_root array.
3374   __kmp_printf("\n------------------------------\nUbers\n----------------------"
3375                "--------\n");
3376   if (__kmp_root != NULL) {
3377     int gtid;
3378     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3379       kmp_root_t const *root = __kmp_root[gtid];
3380       if (root != NULL) {
3381         __kmp_printf("GTID %2d %p:\n", gtid, root);
3382         __kmp_print_structure_team("    Root Team:    ", root->r.r_root_team);
3383         __kmp_print_structure_team("    Hot Team:     ", root->r.r_hot_team);
3384         __kmp_print_structure_thread("    Uber Thread:  ",
3385                                      root->r.r_uber_thread);
3386         __kmp_printf("    Active?:      %2d\n", root->r.r_active);
3387         __kmp_printf("    In Parallel:  %2d\n",
3388                      KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3389         __kmp_printf("\n");
3390         __kmp_print_structure_team_accum(list, root->r.r_root_team);
3391         __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3392       }
3393     }
3394   } else {
3395     __kmp_printf("Ubers array is not allocated.\n");
3396   }
3397 
3398   __kmp_printf("\n------------------------------\nTeams\n----------------------"
3399                "--------\n");
3400   while (list->next != NULL) {
3401     kmp_team_p const *team = list->entry;
3402     int i;
3403     __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3404     __kmp_print_structure_team("    Parent Team:      ", team->t.t_parent);
3405     __kmp_printf("    Master TID:       %2d\n", team->t.t_master_tid);
3406     __kmp_printf("    Max threads:      %2d\n", team->t.t_max_nproc);
3407     __kmp_printf("    Levels of serial: %2d\n", team->t.t_serialized);
3408     __kmp_printf("    Number threads:   %2d\n", team->t.t_nproc);
3409     for (i = 0; i < team->t.t_nproc; ++i) {
3410       __kmp_printf("    Thread %2d:      ", i);
3411       __kmp_print_structure_thread("", team->t.t_threads[i]);
3412     }
3413     __kmp_print_structure_team("    Next in pool:     ", team->t.t_next_pool);
3414     __kmp_printf("\n");
3415     list = list->next;
3416   }
3417 
3418   // Print out __kmp_thread_pool and __kmp_team_pool.
3419   __kmp_printf("\n------------------------------\nPools\n----------------------"
3420                "--------\n");
3421   __kmp_print_structure_thread("Thread pool:          ",
3422                                CCAST(kmp_info_t *, __kmp_thread_pool));
3423   __kmp_print_structure_team("Team pool:            ",
3424                              CCAST(kmp_team_t *, __kmp_team_pool));
3425   __kmp_printf("\n");
3426 
3427   // Free team list.
3428   while (list != NULL) {
3429     kmp_team_list_item_t *item = list;
3430     list = list->next;
3431     KMP_INTERNAL_FREE(item);
3432   }
3433 }
3434 
3435 #endif
3436 
3437 //---------------------------------------------------------------------------
3438 //  Stuff for per-thread fast random number generator
3439 //  Table of primes
3440 static const unsigned __kmp_primes[] = {
3441     0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3442     0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3443     0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3444     0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3445     0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3446     0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3447     0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3448     0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3449     0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3450     0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3451     0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3452 
3453 //---------------------------------------------------------------------------
3454 //  __kmp_get_random: Get a random number using a linear congruential method.
3455 unsigned short __kmp_get_random(kmp_info_t *thread) {
3456   unsigned x = thread->th.th_x;
3457   unsigned short r = x >> 16;
3458 
3459   thread->th.th_x = x * thread->th.th_a + 1;
3460 
3461   KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3462                 thread->th.th_info.ds.ds_tid, r));
3463 
3464   return r;
3465 }
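
// The generator above is a per-thread linear congruential generator,
//   x_{n+1} = a * x_n + 1 (mod 2^32),
// with the multiplier a taken from __kmp_primes; the high 16 bits of the
// previous state are returned because they are better distributed than the
// low bits of an LCG. Illustrative numbers: with x == 0x12345678 the call
// returns 0x1234 and advances the state to 0x12345678 * a + 1 (mod 2^32).
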
3466 //--------------------------------------------------------
3467 // __kmp_init_random: Initialize a random number generator
3468 void __kmp_init_random(kmp_info_t *thread) {
3469   unsigned seed = thread->th.th_info.ds.ds_tid;
3470 
3471   thread->th.th_a =
3472       __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3473   thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3474   KA_TRACE(30,
3475            ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3476 }
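
// Seeding note: the multiplier th_a is selected from __kmp_primes by the
// thread's tid, and the initial state depends on both the tid and that
// multiplier, so different workers start on distinct streams even though the
// recurrence is identical.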
3477 
3478 #if KMP_OS_WINDOWS
3479 /* reclaim array entries for root threads that are already dead, returns number
3480  * reclaimed */
3481 static int __kmp_reclaim_dead_roots(void) {
3482   int i, r = 0;
3483 
3484   for (i = 0; i < __kmp_threads_capacity; ++i) {
3485     if (KMP_UBER_GTID(i) &&
3486         !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3487         !__kmp_root[i]
3488              ->r.r_active) { // AC: reclaim only roots died in non-active state
3489       r += __kmp_unregister_root_other_thread(i);
3490     }
3491   }
3492   return r;
3493 }
3494 #endif
3495 
3496 /* This function attempts to create free entries in __kmp_threads and
3497    __kmp_root, and returns the number of free entries generated.
3498 
3499    For Windows* OS static library, the first mechanism used is to reclaim array
3500    entries for root threads that are already dead.
3501 
   On all platforms, expansion is attempted on the arrays __kmp_threads and
   __kmp_root, with an appropriate update to __kmp_threads_capacity. Array
3504    capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3505    threadprivate cache array has been created. Synchronization with
3506    __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3507 
3508    After any dead root reclamation, if the clipping value allows array expansion
3509    to result in the generation of a total of nNeed free slots, the function does
3510    that expansion. If not, nothing is done beyond the possible initial root
3511    thread reclamation.
3512 
   If nNeed is negative, the behavior is undefined. */
3514 static int __kmp_expand_threads(int nNeed) {
3515   int added = 0;
3516   int minimumRequiredCapacity;
3517   int newCapacity;
3518   kmp_info_t **newThreads;
3519   kmp_root_t **newRoot;
3520 
3521 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3522 // resizing __kmp_threads does not need additional protection if foreign
3523 // threads are present
3524 
3525 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3526   /* only for Windows static library */
3527   /* reclaim array entries for root threads that are already dead */
3528   added = __kmp_reclaim_dead_roots();
3529 
3530   if (nNeed) {
3531     nNeed -= added;
3532     if (nNeed < 0)
3533       nNeed = 0;
3534   }
3535 #endif
3536   if (nNeed <= 0)
3537     return added;
3538 
3539   // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3540   // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3541   // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3542   // > __kmp_max_nth in one of two ways:
3543   //
3544   // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
3545   //    may not be reused by another thread, so we may need to increase
3546   //    __kmp_threads_capacity to __kmp_max_nth + 1.
3547   //
3548   // 2) New foreign root(s) are encountered.  We always register new foreign
3549   //    roots. This may cause a smaller # of threads to be allocated at
3550   //    subsequent parallel regions, but the worker threads hang around (and
3551   //    eventually go to sleep) and need slots in the __kmp_threads[] array.
3552   //
3553   // Anyway, that is the reason for moving the check to see if
3554   // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3555   // instead of having it performed here. -BB
3556 
3557   KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3558 
3559   /* compute expansion headroom to check if we can expand */
3560   if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3561     /* possible expansion too small -- give up */
3562     return added;
3563   }
3564   minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3565 
3566   newCapacity = __kmp_threads_capacity;
3567   do {
3568     newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3569                                                           : __kmp_sys_max_nth;
3570   } while (newCapacity < minimumRequiredCapacity);
3571   newThreads = (kmp_info_t **)__kmp_allocate(
3572       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3573   newRoot =
3574       (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3575   KMP_MEMCPY(newThreads, __kmp_threads,
3576              __kmp_threads_capacity * sizeof(kmp_info_t *));
3577   KMP_MEMCPY(newRoot, __kmp_root,
3578              __kmp_threads_capacity * sizeof(kmp_root_t *));
3579 
3580   kmp_info_t **temp_threads = __kmp_threads;
3581   *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3582   *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3583   __kmp_free(temp_threads);
3584   added += newCapacity - __kmp_threads_capacity;
3585   *(volatile int *)&__kmp_threads_capacity = newCapacity;
3586 
3587   if (newCapacity > __kmp_tp_capacity) {
3588     __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3589     if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3590       __kmp_threadprivate_resize_cache(newCapacity);
3591     } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3592       *(volatile int *)&__kmp_tp_capacity = newCapacity;
3593     }
3594     __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3595   }
3596 
3597   return added;
3598 }
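
// Worked example (illustrative): with __kmp_threads_capacity == 32 and
// nNeed == 40, minimumRequiredCapacity is 72, so the doubling loop tries 64
// and then 128 (clipping at __kmp_sys_max_nth if doubling would pass it); the
// old arrays are copied into a single 128-slot allocation and added becomes 96
// (plus any slots reclaimed from dead roots on Windows static builds).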
3599 
/* Register the current thread as a root thread and obtain our gtid. We must
   have the __kmp_initz_lock held at this point. The argument is TRUE only if
   we are the thread that calls this from __kmp_do_serial_initialize(). */
3603 int __kmp_register_root(int initial_thread) {
3604   kmp_info_t *root_thread;
3605   kmp_root_t *root;
3606   int gtid;
3607   int capacity;
3608   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3609   KA_TRACE(20, ("__kmp_register_root: entered\n"));
3610   KMP_MB();
3611 
  /* 2007-03-02:
     If the initial thread has not yet invoked the OpenMP RTL, and this thread
     is not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity"
     condition does not work as expected -- it may return false (meaning there
     is at least one empty slot in the __kmp_threads array), but it is possible
     that the only free slot is #0, which is reserved for the initial thread
     and so cannot be used for this one. The following code works around this
     bug.

     However, the right solution seems to be not reserving slot #0 for the
     initial thread because:
     (1) there is no magic in slot #0,
     (2) we cannot reliably detect the initial thread (the first thread that
         performs serial initialization may not be a real initial thread).
  */
3626   capacity = __kmp_threads_capacity;
3627   if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3628     --capacity;
3629   }
3630 
3631   /* see if there are too many threads */
3632   if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3633     if (__kmp_tp_cached) {
3634       __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3635                   KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3636                   KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3637     } else {
3638       __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3639                   __kmp_msg_null);
3640     }
3641   }
3642 
3643   /* find an available thread slot */
3644   /* Don't reassign the zero slot since we need that to only be used by initial
3645      thread */
3646   for (gtid = (initial_thread ? 0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL;
3647        gtid++)
3648     ;
3649   KA_TRACE(1,
3650            ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3651   KMP_ASSERT(gtid < __kmp_threads_capacity);
3652 
3653   /* update global accounting */
3654   __kmp_all_nth++;
3655   TCW_4(__kmp_nth, __kmp_nth + 1);
3656 
3657   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3658   // numbers of procs, and method #2 (keyed API call) for higher numbers.
3659   if (__kmp_adjust_gtid_mode) {
3660     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3661       if (TCR_4(__kmp_gtid_mode) != 2) {
3662         TCW_4(__kmp_gtid_mode, 2);
3663       }
3664     } else {
3665       if (TCR_4(__kmp_gtid_mode) != 1) {
3666         TCW_4(__kmp_gtid_mode, 1);
3667       }
3668     }
3669   }
3670 
3671 #ifdef KMP_ADJUST_BLOCKTIME
3672   /* Adjust blocktime to zero if necessary            */
3673   /* Middle initialization might not have occurred yet */
3674   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3675     if (__kmp_nth > __kmp_avail_proc) {
3676       __kmp_zero_bt = TRUE;
3677     }
3678   }
3679 #endif /* KMP_ADJUST_BLOCKTIME */
3680 
3681   /* setup this new hierarchy */
3682   if (!(root = __kmp_root[gtid])) {
3683     root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3684     KMP_DEBUG_ASSERT(!root->r.r_root_team);
3685   }
3686 
3687 #if KMP_STATS_ENABLED
3688   // Initialize stats as soon as possible (right after gtid assignment).
3689   __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3690   __kmp_stats_thread_ptr->startLife();
3691   KMP_SET_THREAD_STATE(SERIAL_REGION);
3692   KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3693 #endif
3694   __kmp_initialize_root(root);
3695 
3696   /* setup new root thread structure */
3697   if (root->r.r_uber_thread) {
3698     root_thread = root->r.r_uber_thread;
3699   } else {
3700     root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3701     if (__kmp_storage_map) {
3702       __kmp_print_thread_storage_map(root_thread, gtid);
3703     }
3704     root_thread->th.th_info.ds.ds_gtid = gtid;
3705 #if OMPT_SUPPORT
3706     root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3707 #endif
3708     root_thread->th.th_root = root;
3709     if (__kmp_env_consistency_check) {
3710       root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3711     }
3712 #if USE_FAST_MEMORY
3713     __kmp_initialize_fast_memory(root_thread);
3714 #endif /* USE_FAST_MEMORY */
3715 
3716 #if KMP_USE_BGET
3717     KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3718     __kmp_initialize_bget(root_thread);
3719 #endif
3720     __kmp_init_random(root_thread); // Initialize random number generator
3721   }
3722 
3723   /* setup the serial team held in reserve by the root thread */
3724   if (!root_thread->th.th_serial_team) {
3725     kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3726     KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3727     root_thread->th.th_serial_team = __kmp_allocate_team(
3728         root, 1, 1,
3729 #if OMPT_SUPPORT
3730         ompt_data_none, // root parallel id
3731 #endif
3732         proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3733   }
3734   KMP_ASSERT(root_thread->th.th_serial_team);
3735   KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3736                 root_thread->th.th_serial_team));
3737 
3738   /* drop root_thread into place */
3739   TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3740 
3741   root->r.r_root_team->t.t_threads[0] = root_thread;
3742   root->r.r_hot_team->t.t_threads[0] = root_thread;
3743   root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
  // AC: the team is created in reserve, not for execution (it is unused for
  // now).
3745   root_thread->th.th_serial_team->t.t_serialized = 0;
3746   root->r.r_uber_thread = root_thread;
3747 
3748   /* initialize the thread, get it ready to go */
3749   __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3750   TCW_4(__kmp_init_gtid, TRUE);
3751 
3752   /* prepare the master thread for get_gtid() */
3753   __kmp_gtid_set_specific(gtid);
3754 
3755 #if USE_ITT_BUILD
3756   __kmp_itt_thread_name(gtid);
3757 #endif /* USE_ITT_BUILD */
3758 
3759 #ifdef KMP_TDATA_GTID
3760   __kmp_gtid = gtid;
3761 #endif
3762   __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3763   KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3764 
3765   KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3766                 "plain=%u\n",
3767                 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3768                 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3769                 KMP_INIT_BARRIER_STATE));
3770   { // Initialize barrier data.
3771     int b;
3772     for (b = 0; b < bs_last_barrier; ++b) {
3773       root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3774 #if USE_DEBUGGER
3775       root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3776 #endif
3777     }
3778   }
3779   KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3780                    KMP_INIT_BARRIER_STATE);
3781 
3782 #if KMP_AFFINITY_SUPPORTED
3783   root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3784   root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3785   root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3786   root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3787   if (TCR_4(__kmp_init_middle)) {
3788     __kmp_affinity_set_init_mask(gtid, TRUE);
3789   }
3790 #endif /* KMP_AFFINITY_SUPPORTED */
3791   root_thread->th.th_def_allocator = __kmp_def_allocator;
3792   root_thread->th.th_prev_level = 0;
3793   root_thread->th.th_prev_num_threads = 1;
3794 
3795   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3796   tmp->cg_root = root_thread;
3797   tmp->cg_thread_limit = __kmp_cg_max_nth;
3798   tmp->cg_nthreads = 1;
3799   KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3800                  " cg_nthreads init to 1\n",
3801                  root_thread, tmp));
3802   tmp->up = NULL;
3803   root_thread->th.th_cg_roots = tmp;
3804 
3805   __kmp_root_counter++;
3806 
3807 #if OMPT_SUPPORT
3808   if (!initial_thread && ompt_enabled.enabled) {
3809 
3810     kmp_info_t *root_thread = ompt_get_thread();
3811 
3812     ompt_set_thread_state(root_thread, ompt_state_overhead);
3813 
3814     if (ompt_enabled.ompt_callback_thread_begin) {
3815       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3816           ompt_thread_initial, __ompt_get_thread_data_internal());
3817     }
3818     ompt_data_t *task_data;
3819     ompt_data_t *parallel_data;
3820     __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
3821     if (ompt_enabled.ompt_callback_implicit_task) {
3822       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3823           ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
3824     }
3825 
3826     ompt_set_thread_state(root_thread, ompt_state_work_serial);
3827   }
3828 #endif
3829 
3830   KMP_MB();
3831   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3832 
3833   return gtid;
3834 }
3835 
3836 #if KMP_NESTED_HOT_TEAMS
3837 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3838                                 const int max_level) {
3839   int i, n, nth;
3840   kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3841   if (!hot_teams || !hot_teams[level].hot_team) {
3842     return 0;
3843   }
3844   KMP_DEBUG_ASSERT(level < max_level);
3845   kmp_team_t *team = hot_teams[level].hot_team;
3846   nth = hot_teams[level].hot_team_nth;
3847   n = nth - 1; // master is not freed
3848   if (level < max_level - 1) {
3849     for (i = 0; i < nth; ++i) {
3850       kmp_info_t *th = team->t.t_threads[i];
3851       n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3852       if (i > 0 && th->th.th_hot_teams) {
3853         __kmp_free(th->th.th_hot_teams);
3854         th->th.th_hot_teams = NULL;
3855       }
3856     }
3857   }
3858   __kmp_free_team(root, team, NULL);
3859   return n;
3860 }
3861 #endif
3862 
// Resets a root thread and clears its root and hot teams.
3864 // Returns the number of __kmp_threads entries directly and indirectly freed.
3865 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3866   kmp_team_t *root_team = root->r.r_root_team;
3867   kmp_team_t *hot_team = root->r.r_hot_team;
3868   int n = hot_team->t.t_nproc;
3869   int i;
3870 
3871   KMP_DEBUG_ASSERT(!root->r.r_active);
3872 
3873   root->r.r_root_team = NULL;
3874   root->r.r_hot_team = NULL;
3875   // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3876   // before call to __kmp_free_team().
3877   __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3878 #if KMP_NESTED_HOT_TEAMS
3879   if (__kmp_hot_teams_max_level >
3880       0) { // need to free nested hot teams and their threads if any
3881     for (i = 0; i < hot_team->t.t_nproc; ++i) {
3882       kmp_info_t *th = hot_team->t.t_threads[i];
3883       if (__kmp_hot_teams_max_level > 1) {
3884         n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3885       }
3886       if (th->th.th_hot_teams) {
3887         __kmp_free(th->th.th_hot_teams);
3888         th->th.th_hot_teams = NULL;
3889       }
3890     }
3891   }
3892 #endif
3893   __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3894 
3895   // Before we can reap the thread, we need to make certain that all other
3896   // threads in the teams that had this root as ancestor have stopped trying to
3897   // steal tasks.
3898   if (__kmp_tasking_mode != tskm_immediate_exec) {
3899     __kmp_wait_to_unref_task_teams();
3900   }
3901 
3902 #if KMP_OS_WINDOWS
3903   /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3904   KA_TRACE(
3905       10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3906            "\n",
3907            (LPVOID) & (root->r.r_uber_thread->th),
3908            root->r.r_uber_thread->th.th_info.ds.ds_thread));
3909   __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3910 #endif /* KMP_OS_WINDOWS */
3911 
3912 #if OMPT_SUPPORT
3913   ompt_data_t *task_data;
3914   ompt_data_t *parallel_data;
3915   __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
3916   if (ompt_enabled.ompt_callback_implicit_task) {
3917     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3918         ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
3919   }
3920   if (ompt_enabled.ompt_callback_thread_end) {
3921     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3922         &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3923   }
3924 #endif
3925 
3926   TCW_4(__kmp_nth,
3927         __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3928   i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
3929   KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
3930                  " to %d\n",
3931                  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
3932                  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
3933   if (i == 1) {
3934     // need to free contention group structure
3935     KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
3936                      root->r.r_uber_thread->th.th_cg_roots->cg_root);
3937     KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
3938     __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
3939     root->r.r_uber_thread->th.th_cg_roots = NULL;
3940   }
3941   __kmp_reap_thread(root->r.r_uber_thread, 1);
3942 
  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
  // it instead of freeing it.
3945   root->r.r_uber_thread = NULL;
3946   /* mark root as no longer in use */
3947   root->r.r_begin = FALSE;
3948 
3949   return n;
3950 }
3951 
3952 void __kmp_unregister_root_current_thread(int gtid) {
3953   KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
3954   /* this lock should be ok, since unregister_root_current_thread is never
3955      called during an abort, only during a normal close. furthermore, if you
3956      have the forkjoin lock, you should never try to get the initz lock */
3957   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3958   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
3959     KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
3960                   "exiting T#%d\n",
3961                   gtid));
3962     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3963     return;
3964   }
3965   kmp_root_t *root = __kmp_root[gtid];
3966 
3967   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3968   KMP_ASSERT(KMP_UBER_GTID(gtid));
3969   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3970   KMP_ASSERT(root->r.r_active == FALSE);
3971 
3972   KMP_MB();
3973 
3974   kmp_info_t *thread = __kmp_threads[gtid];
3975   kmp_team_t *team = thread->th.th_team;
3976   kmp_task_team_t *task_team = thread->th.th_task_team;
3977 
3978   // we need to wait for the proxy tasks before finishing the thread
3979   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
3980 #if OMPT_SUPPORT
3981     // the runtime is shutting down so we won't report any events
3982     thread->th.ompt_thread_info.state = ompt_state_undefined;
3983 #endif
3984     __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
3985   }
3986 
3987   __kmp_reset_root(gtid, root);
3988 
3989   /* free up this thread slot */
3990   __kmp_gtid_set_specific(KMP_GTID_DNE);
3991 #ifdef KMP_TDATA_GTID
3992   __kmp_gtid = KMP_GTID_DNE;
3993 #endif
3994 
3995   KMP_MB();
3996   KC_TRACE(10,
3997            ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
3998 
3999   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4000 }
4001 
4002 #if KMP_OS_WINDOWS
4003 /* __kmp_forkjoin_lock must be already held
4004    Unregisters a root thread that is not the current thread.  Returns the number
4005    of __kmp_threads entries freed as a result. */
4006 static int __kmp_unregister_root_other_thread(int gtid) {
4007   kmp_root_t *root = __kmp_root[gtid];
4008   int r;
4009 
4010   KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4011   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4012   KMP_ASSERT(KMP_UBER_GTID(gtid));
4013   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4014   KMP_ASSERT(root->r.r_active == FALSE);
4015 
4016   r = __kmp_reset_root(gtid, root);
4017   KC_TRACE(10,
4018            ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4019   return r;
4020 }
4021 #endif
4022 
4023 #if KMP_DEBUG
4024 void __kmp_task_info() {
4025 
4026   kmp_int32 gtid = __kmp_entry_gtid();
4027   kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4028   kmp_info_t *this_thr = __kmp_threads[gtid];
4029   kmp_team_t *steam = this_thr->th.th_serial_team;
4030   kmp_team_t *team = this_thr->th.th_team;
4031 
4032   __kmp_printf(
4033       "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4034       "ptask=%p\n",
4035       gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4036       team->t.t_implicit_task_taskdata[tid].td_parent);
4037 }
4038 #endif // KMP_DEBUG
4039 
4040 /* TODO optimize with one big memclr, take out what isn't needed, split
4041    responsibility to workers as much as possible, and delay initialization of
4042    features as much as possible  */
4043 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4044                                   int tid, int gtid) {
4045   /* this_thr->th.th_info.ds.ds_gtid is setup in
4046      kmp_allocate_thread/create_worker.
4047      this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4048   kmp_info_t *master = team->t.t_threads[0];
4049   KMP_DEBUG_ASSERT(this_thr != NULL);
4050   KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4051   KMP_DEBUG_ASSERT(team);
4052   KMP_DEBUG_ASSERT(team->t.t_threads);
4053   KMP_DEBUG_ASSERT(team->t.t_dispatch);
4054   KMP_DEBUG_ASSERT(master);
4055   KMP_DEBUG_ASSERT(master->th.th_root);
4056 
4057   KMP_MB();
4058 
4059   TCW_SYNC_PTR(this_thr->th.th_team, team);
4060 
4061   this_thr->th.th_info.ds.ds_tid = tid;
4062   this_thr->th.th_set_nproc = 0;
4063   if (__kmp_tasking_mode != tskm_immediate_exec)
4064     // When tasking is possible, threads are not safe to reap until they are
4065     // done tasking; this will be set when tasking code is exited in wait
4066     this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4067   else // no tasking --> always safe to reap
4068     this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4069   this_thr->th.th_set_proc_bind = proc_bind_default;
4070 #if KMP_AFFINITY_SUPPORTED
4071   this_thr->th.th_new_place = this_thr->th.th_current_place;
4072 #endif
4073   this_thr->th.th_root = master->th.th_root;
4074 
4075   /* setup the thread's cache of the team structure */
4076   this_thr->th.th_team_nproc = team->t.t_nproc;
4077   this_thr->th.th_team_master = master;
4078   this_thr->th.th_team_serialized = team->t.t_serialized;
4079   TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4080 
4081   KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4082 
4083   KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4084                 tid, gtid, this_thr, this_thr->th.th_current_task));
4085 
4086   __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4087                            team, tid, TRUE);
4088 
4089   KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4090                 tid, gtid, this_thr, this_thr->th.th_current_task));
4091   // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4092   // __kmp_initialize_team()?
4093 
4094   /* TODO no worksharing in speculative threads */
4095   this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4096 
4097   this_thr->th.th_local.this_construct = 0;
4098 
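  // Lazily allocate the per-thread table used to track threadprivate
  // (private common block) data for this thread.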
4099   if (!this_thr->th.th_pri_common) {
4100     this_thr->th.th_pri_common =
4101         (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4102     if (__kmp_storage_map) {
4103       __kmp_print_storage_map_gtid(
4104           gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4105           sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4106     }
4107     this_thr->th.th_pri_head = NULL;
4108   }
4109 
4110   if (this_thr != master && // Master's CG root is initialized elsewhere
4111       this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4112     // Make new thread's CG root same as master's
4113     KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4114     kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4115     if (tmp) {
4116       // worker changes CG, need to check if old CG should be freed
4117       int i = tmp->cg_nthreads--;
4118       KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4119                      " on node %p of thread %p to %d\n",
4120                      this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4121       if (i == 1) {
4122         __kmp_free(tmp); // last thread left CG --> free it
4123       }
4124     }
4125     this_thr->th.th_cg_roots = master->th.th_cg_roots;
4126     // Increment new thread's CG root's counter to add the new thread
4127     this_thr->th.th_cg_roots->cg_nthreads++;
4128     KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4129                    " node %p of thread %p to %d\n",
4130                    this_thr, this_thr->th.th_cg_roots,
4131                    this_thr->th.th_cg_roots->cg_root,
4132                    this_thr->th.th_cg_roots->cg_nthreads));
4133     this_thr->th.th_current_task->td_icvs.thread_limit =
4134         this_thr->th.th_cg_roots->cg_thread_limit;
4135   }
4136 
4137   /* Initialize dynamic dispatch */
4138   {
4139     volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4140     // Use team max_nproc since this will never change for the team.
4141     size_t disp_size =
4142         sizeof(dispatch_private_info_t) *
4143         (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4144     KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4145                   team->t.t_max_nproc));
4146     KMP_ASSERT(dispatch);
4147     KMP_DEBUG_ASSERT(team->t.t_dispatch);
4148     KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4149 
4150     dispatch->th_disp_index = 0;
4151     dispatch->th_doacross_buf_idx = 0;
4152     if (!dispatch->th_disp_buffer) {
4153       dispatch->th_disp_buffer =
4154           (dispatch_private_info_t *)__kmp_allocate(disp_size);
4155 
4156       if (__kmp_storage_map) {
4157         __kmp_print_storage_map_gtid(
4158             gtid, &dispatch->th_disp_buffer[0],
4159             &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4160                                           ? 1
4161                                           : __kmp_dispatch_num_buffers],
4162             disp_size, "th_%d.th_dispatch.th_disp_buffer "
4163                        "(team_%d.t_dispatch[%d].th_disp_buffer)",
4164             gtid, team->t.t_id, gtid);
4165       }
4166     } else {
4167       memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4168     }
4169 
4170     dispatch->th_dispatch_pr_current = 0;
4171     dispatch->th_dispatch_sh_current = 0;
4172 
4173     dispatch->th_deo_fcn = 0; /* ORDERED     */
4174     dispatch->th_dxo_fcn = 0; /* END ORDERED */
4175   }
4176 
4177   this_thr->th.th_next_pool = NULL;
4178 
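  // Lazily allocate the small stack used to save th_task_state across nested
  // parallel regions (initially sized for 4 entries).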
4179   if (!this_thr->th.th_task_state_memo_stack) {
4180     size_t i;
4181     this_thr->th.th_task_state_memo_stack =
4182         (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4183     this_thr->th.th_task_state_top = 0;
4184     this_thr->th.th_task_state_stack_sz = 4;
4185     for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4186          ++i) // zero init the stack
4187       this_thr->th.th_task_state_memo_stack[i] = 0;
4188   }
4189 
4190   KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4191   KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4192 
4193   KMP_MB();
4194 }
4195 
/* Allocate a new thread for the requesting team. This is only called from
   within a forkjoin critical section. We first try to get an available thread
   from the thread pool; if none is available, we fork a new one, assuming we
   are able to create one. This should be assured, as the caller is expected to
   have checked for available capacity first. */
4201 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4202                                   int new_tid) {
4203   kmp_team_t *serial_team;
4204   kmp_info_t *new_thr;
4205   int new_gtid;
4206 
4207   KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4208   KMP_DEBUG_ASSERT(root && team);
4209 #if !KMP_NESTED_HOT_TEAMS
4210   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4211 #endif
4212   KMP_MB();
4213 
4214   /* first, try to get one from the thread pool */
4215   if (__kmp_thread_pool) {
4216     new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4217     __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4218     if (new_thr == __kmp_thread_pool_insert_pt) {
4219       __kmp_thread_pool_insert_pt = NULL;
4220     }
4221     TCW_4(new_thr->th.th_in_pool, FALSE);
4222     __kmp_suspend_initialize_thread(new_thr);
4223     __kmp_lock_suspend_mx(new_thr);
4224     if (new_thr->th.th_active_in_pool == TRUE) {
4225       KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4226       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4227       new_thr->th.th_active_in_pool = FALSE;
4228     }
4229     __kmp_unlock_suspend_mx(new_thr);
4230 
4231     KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4232                   __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4233     KMP_ASSERT(!new_thr->th.th_team);
4234     KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4235 
4236     /* setup the thread structure */
4237     __kmp_initialize_info(new_thr, team, new_tid,
4238                           new_thr->th.th_info.ds.ds_gtid);
4239     KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4240 
4241     TCW_4(__kmp_nth, __kmp_nth + 1);
4242 
4243     new_thr->th.th_task_state = 0;
4244     new_thr->th.th_task_state_top = 0;
4245     new_thr->th.th_task_state_stack_sz = 4;
4246 
4247 #ifdef KMP_ADJUST_BLOCKTIME
4248     /* Adjust blocktime back to zero if necessary */
4249     /* Middle initialization might not have occurred yet */
4250     if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4251       if (__kmp_nth > __kmp_avail_proc) {
4252         __kmp_zero_bt = TRUE;
4253       }
4254     }
4255 #endif /* KMP_ADJUST_BLOCKTIME */
4256 
4257 #if KMP_DEBUG
    // If the thread entered the pool via __kmp_free_thread, wait_flag should
    // not equal KMP_BARRIER_PARENT_FLAG.
4260     int b;
4261     kmp_balign_t *balign = new_thr->th.th_bar;
4262     for (b = 0; b < bs_last_barrier; ++b)
4263       KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4264 #endif
4265 
4266     KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4267                   __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4268 
4269     KMP_MB();
4270     return new_thr;
4271   }
4272 
  /* no, we'll fork a new one */
4274   KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4275   KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4276 
4277 #if KMP_USE_MONITOR
4278   // If this is the first worker thread the RTL is creating, then also
4279   // launch the monitor thread.  We try to do this as early as possible.
4280   if (!TCR_4(__kmp_init_monitor)) {
4281     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4282     if (!TCR_4(__kmp_init_monitor)) {
4283       KF_TRACE(10, ("before __kmp_create_monitor\n"));
4284       TCW_4(__kmp_init_monitor, 1);
4285       __kmp_create_monitor(&__kmp_monitor);
4286       KF_TRACE(10, ("after __kmp_create_monitor\n"));
4287 #if KMP_OS_WINDOWS
      // AC: wait until the monitor has started. This is a fix for CQ232808.
      // The reason is that if the library is loaded/unloaded in a loop with
      // small (parallel) work in between, then there is a high probability
      // that the monitor thread only starts after the library shutdown. At
      // shutdown it is too late to cope with the problem, because when the
      // master is in DllMain (process detach) the monitor has no chance to
      // start (it is blocked), and the master has no means to inform the
      // monitor that the library has gone, because all the memory which the
      // monitor can access is going to be released/reset.
4297       while (TCR_4(__kmp_init_monitor) < 2) {
4298         KMP_YIELD(TRUE);
4299       }
4300       KF_TRACE(10, ("after monitor thread has started\n"));
4301 #endif
4302     }
4303     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4304   }
4305 #endif
4306 
4307   KMP_MB();
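  // Find the lowest unused gtid slot; workers never take slot 0, which is
  // reserved for the initial root thread.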
4308   for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) {
4309     KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4310   }
4311 
4312   /* allocate space for it. */
4313   new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4314 
4315   TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4316 
4317 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4318   // suppress race conditions detection on synchronization flags in debug mode
4319   // this helps to analyze library internals eliminating false positives
4320   __itt_suppress_mark_range(
4321       __itt_suppress_range, __itt_suppress_threading_errors,
4322       &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4323   __itt_suppress_mark_range(
4324       __itt_suppress_range, __itt_suppress_threading_errors,
4325       &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4326 #if KMP_OS_WINDOWS
4327   __itt_suppress_mark_range(
4328       __itt_suppress_range, __itt_suppress_threading_errors,
4329       &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4330 #else
4331   __itt_suppress_mark_range(__itt_suppress_range,
4332                             __itt_suppress_threading_errors,
4333                             &new_thr->th.th_suspend_init_count,
4334                             sizeof(new_thr->th.th_suspend_init_count));
4335 #endif
4336   // TODO: check if we need to also suppress b_arrived flags
4337   __itt_suppress_mark_range(__itt_suppress_range,
4338                             __itt_suppress_threading_errors,
4339                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4340                             sizeof(new_thr->th.th_bar[0].bb.b_go));
4341   __itt_suppress_mark_range(__itt_suppress_range,
4342                             __itt_suppress_threading_errors,
4343                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4344                             sizeof(new_thr->th.th_bar[1].bb.b_go));
4345   __itt_suppress_mark_range(__itt_suppress_range,
4346                             __itt_suppress_threading_errors,
4347                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4348                             sizeof(new_thr->th.th_bar[2].bb.b_go));
4349 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4350   if (__kmp_storage_map) {
4351     __kmp_print_thread_storage_map(new_thr, new_gtid);
4352   }
4353 
4354   // add the reserve serialized team, initialized from the team's master thread
4355   {
4356     kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4357     KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4358     new_thr->th.th_serial_team = serial_team =
4359         (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4360 #if OMPT_SUPPORT
4361                                           ompt_data_none, // root parallel id
4362 #endif
4363                                           proc_bind_default, &r_icvs,
4364                                           0 USE_NESTED_HOT_ARG(NULL));
4365   }
4366   KMP_ASSERT(serial_team);
  serial_team->t.t_serialized = 0; // AC: the team is created in reserve, not
  // for execution (it is unused for now).
4369   serial_team->t.t_threads[0] = new_thr;
4370   KF_TRACE(10,
4371            ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4372             new_thr));
4373 
4374   /* setup the thread structures */
4375   __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4376 
4377 #if USE_FAST_MEMORY
4378   __kmp_initialize_fast_memory(new_thr);
4379 #endif /* USE_FAST_MEMORY */
4380 
4381 #if KMP_USE_BGET
4382   KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4383   __kmp_initialize_bget(new_thr);
4384 #endif
4385 
4386   __kmp_init_random(new_thr); // Initialize random number generator
4387 
4388   /* Initialize these only once when thread is grabbed for a team allocation */
4389   KA_TRACE(20,
4390            ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4391             __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4392 
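  // Reset the new thread's barrier structures to their initial state: b_go
  // at KMP_INIT_BARRIER_STATE, no team attached, not waiting, oncore barrier
  // disabled.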
4393   int b;
4394   kmp_balign_t *balign = new_thr->th.th_bar;
4395   for (b = 0; b < bs_last_barrier; ++b) {
4396     balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4397     balign[b].bb.team = NULL;
4398     balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4399     balign[b].bb.use_oncore_barrier = 0;
4400   }
4401 
4402   new_thr->th.th_spin_here = FALSE;
4403   new_thr->th.th_next_waiting = 0;
4404 #if KMP_OS_UNIX
4405   new_thr->th.th_blocking = false;
4406 #endif
4407 
4408 #if KMP_AFFINITY_SUPPORTED
4409   new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4410   new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4411   new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4412   new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4413 #endif
4414   new_thr->th.th_def_allocator = __kmp_def_allocator;
4415   new_thr->th.th_prev_level = 0;
4416   new_thr->th.th_prev_num_threads = 1;
4417 
4418   TCW_4(new_thr->th.th_in_pool, FALSE);
4419   new_thr->th.th_active_in_pool = FALSE;
4420   TCW_4(new_thr->th.th_active, TRUE);
4421 
4422   /* adjust the global counters */
4423   __kmp_all_nth++;
4424   __kmp_nth++;
4425 
4426   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4427   // numbers of procs, and method #2 (keyed API call) for higher numbers.
4428   if (__kmp_adjust_gtid_mode) {
4429     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4430       if (TCR_4(__kmp_gtid_mode) != 2) {
4431         TCW_4(__kmp_gtid_mode, 2);
4432       }
4433     } else {
4434       if (TCR_4(__kmp_gtid_mode) != 1) {
4435         TCW_4(__kmp_gtid_mode, 1);
4436       }
4437     }
4438   }
4439 
4440 #ifdef KMP_ADJUST_BLOCKTIME
4441   /* Adjust blocktime back to zero if necessary       */
4442   /* Middle initialization might not have occurred yet */
4443   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4444     if (__kmp_nth > __kmp_avail_proc) {
4445       __kmp_zero_bt = TRUE;
4446     }
4447   }
4448 #endif /* KMP_ADJUST_BLOCKTIME */
4449 
4450   /* actually fork it and create the new worker thread */
4451   KF_TRACE(
4452       10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4453   __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4454   KF_TRACE(10,
4455            ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4456 
4457   KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4458                 new_gtid));
4459   KMP_MB();
4460   return new_thr;
4461 }
4462 
/* Reinitialize team for reuse.
   The hot team code calls this routine at every fork barrier, so EPCC barrier
   tests are extremely sensitive to changes in it, especially writes to the
   team struct, which cause a cache invalidation in all threads.
   IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4468 static void __kmp_reinitialize_team(kmp_team_t *team,
4469                                     kmp_internal_control_t *new_icvs,
4470                                     ident_t *loc) {
4471   KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4472                 team->t.t_threads[0], team));
4473   KMP_DEBUG_ASSERT(team && new_icvs);
4474   KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4475   KMP_CHECK_UPDATE(team->t.t_ident, loc);
4476 
4477   KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4478   // Copy ICVs to the master thread's implicit taskdata
4479   __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4480   copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4481 
4482   KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4483                 team->t.t_threads[0], team));
4484 }
4485 
4486 /* Initialize the team data structure.
4487    This assumes the t_threads and t_max_nproc are already set.
4488    Also, we don't touch the arguments */
4489 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4490                                   kmp_internal_control_t *new_icvs,
4491                                   ident_t *loc) {
4492   KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4493 
4494   /* verify */
4495   KMP_DEBUG_ASSERT(team);
4496   KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4497   KMP_DEBUG_ASSERT(team->t.t_threads);
4498   KMP_MB();
4499 
4500   team->t.t_master_tid = 0; /* not needed */
4501   /* team->t.t_master_bar;        not needed */
4502   team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4503   team->t.t_nproc = new_nproc;
4504 
4505   /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
4506   team->t.t_next_pool = NULL;
4507   /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4508    * up hot team */
4509 
4510   TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4511   team->t.t_invoke = NULL; /* not needed */
4512 
4513   // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
4514   team->t.t_sched.sched = new_icvs->sched.sched;
4515 
4516 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4517   team->t.t_fp_control_saved = FALSE; /* not needed */
4518   team->t.t_x87_fpu_control_word = 0; /* not needed */
4519   team->t.t_mxcsr = 0; /* not needed */
4520 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4521 
4522   team->t.t_construct = 0;
4523 
4524   team->t.t_ordered.dt.t_value = 0;
4525   team->t.t_master_active = FALSE;
4526 
4527 #ifdef KMP_DEBUG
4528   team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4529 #endif
4530 #if KMP_OS_WINDOWS
4531   team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4532 #endif
4533 
4534   team->t.t_control_stack_top = NULL;
4535 
4536   __kmp_reinitialize_team(team, new_icvs, loc);
4537 
4538   KMP_MB();
4539   KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4540 }
4541 
4542 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
4543 /* Sets full mask for thread and returns old mask, no changes to structures. */
4544 static void
4545 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4546   if (KMP_AFFINITY_CAPABLE()) {
4547     int status;
4548     if (old_mask != NULL) {
4549       status = __kmp_get_system_affinity(old_mask, TRUE);
4550       int error = errno;
4551       if (status != 0) {
4552         __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4553                     __kmp_msg_null);
4554       }
4555     }
4556     __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4557   }
4558 }
4559 #endif
4560 
4561 #if KMP_AFFINITY_SUPPORTED
4562 
4563 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
// It calculates the worker + master thread's partition based upon the parent
// thread's partition, and binds each worker to a place in its partition.
4566 // The master thread's partition should already include its current binding.
4567 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4568   // Copy the master thread's place partition to the team struct
4569   kmp_info_t *master_th = team->t.t_threads[0];
4570   KMP_DEBUG_ASSERT(master_th != NULL);
4571   kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4572   int first_place = master_th->th.th_first_place;
4573   int last_place = master_th->th.th_last_place;
4574   int masters_place = master_th->th.th_current_place;
4575   team->t.t_first_place = first_place;
4576   team->t.t_last_place = last_place;
4577 
4578   KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4579                 "bound to place %d partition = [%d,%d]\n",
4580                 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4581                 team->t.t_id, masters_place, first_place, last_place));
4582 
4583   switch (proc_bind) {
4584 
4585   case proc_bind_default:
    // serial teams might have the proc_bind policy set to proc_bind_default.
    // It doesn't matter, as we don't rebind the master thread for any
    // proc_bind policy
4588     KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4589     break;
4590 
4591   case proc_bind_master: {
4592     int f;
4593     int n_th = team->t.t_nproc;
4594     for (f = 1; f < n_th; f++) {
4595       kmp_info_t *th = team->t.t_threads[f];
4596       KMP_DEBUG_ASSERT(th != NULL);
4597       th->th.th_first_place = first_place;
4598       th->th.th_last_place = last_place;
4599       th->th.th_new_place = masters_place;
4600       if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4601           team->t.t_display_affinity != 1) {
4602         team->t.t_display_affinity = 1;
4603       }
4604 
4605       KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
4606                      "partition = [%d,%d]\n",
4607                      __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4608                      f, masters_place, first_place, last_place));
4609     }
4610   } break;
4611 
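  // proc_bind_close: bind workers to places as close to the master's place
  // as possible, walking the partition in order and wrapping around when the
  // end of the place list is reached.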
4612   case proc_bind_close: {
4613     int f;
4614     int n_th = team->t.t_nproc;
4615     int n_places;
4616     if (first_place <= last_place) {
4617       n_places = last_place - first_place + 1;
4618     } else {
4619       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4620     }
4621     if (n_th <= n_places) {
4622       int place = masters_place;
4623       for (f = 1; f < n_th; f++) {
4624         kmp_info_t *th = team->t.t_threads[f];
4625         KMP_DEBUG_ASSERT(th != NULL);
4626 
4627         if (place == last_place) {
4628           place = first_place;
4629         } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4630           place = 0;
4631         } else {
4632           place++;
4633         }
4634         th->th.th_first_place = first_place;
4635         th->th.th_last_place = last_place;
4636         th->th.th_new_place = place;
4637         if (__kmp_display_affinity && place != th->th.th_current_place &&
4638             team->t.t_display_affinity != 1) {
4639           team->t.t_display_affinity = 1;
4640         }
4641 
4642         KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4643                        "partition = [%d,%d]\n",
4644                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4645                        team->t.t_id, f, place, first_place, last_place));
4646       }
4647     } else {
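      // More threads than places: give each place S = n_th / n_places
      // threads and hand out the rem = n_th % n_places leftover threads, one
      // extra per every 'gap' places.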
4648       int S, rem, gap, s_count;
4649       S = n_th / n_places;
4650       s_count = 0;
4651       rem = n_th - (S * n_places);
4652       gap = rem > 0 ? n_places / rem : n_places;
4653       int place = masters_place;
4654       int gap_ct = gap;
4655       for (f = 0; f < n_th; f++) {
4656         kmp_info_t *th = team->t.t_threads[f];
4657         KMP_DEBUG_ASSERT(th != NULL);
4658 
4659         th->th.th_first_place = first_place;
4660         th->th.th_last_place = last_place;
4661         th->th.th_new_place = place;
4662         if (__kmp_display_affinity && place != th->th.th_current_place &&
4663             team->t.t_display_affinity != 1) {
4664           team->t.t_display_affinity = 1;
4665         }
4666         s_count++;
4667 
4668         if ((s_count == S) && rem && (gap_ct == gap)) {
4669           // do nothing, add an extra thread to place on next iteration
4670         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4671           // we added an extra thread to this place; move to next place
4672           if (place == last_place) {
4673             place = first_place;
4674           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4675             place = 0;
4676           } else {
4677             place++;
4678           }
4679           s_count = 0;
4680           gap_ct = 1;
4681           rem--;
4682         } else if (s_count == S) { // place full; don't add extra
4683           if (place == last_place) {
4684             place = first_place;
4685           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4686             place = 0;
4687           } else {
4688             place++;
4689           }
4690           gap_ct++;
4691           s_count = 0;
4692         }
4693 
4694         KA_TRACE(100,
4695                  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4696                   "partition = [%d,%d]\n",
4697                   __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4698                   th->th.th_new_place, first_place, last_place));
4699       }
4700       KMP_DEBUG_ASSERT(place == masters_place);
4701     }
4702   } break;
4703 
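  // proc_bind_spread: spread the threads as evenly as possible over the
  // master's partition; when there are at least as many places as threads,
  // each thread gets its own sub-partition of places.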
4704   case proc_bind_spread: {
4705     int f;
4706     int n_th = team->t.t_nproc;
4707     int n_places;
4708     int thidx;
4709     if (first_place <= last_place) {
4710       n_places = last_place - first_place + 1;
4711     } else {
4712       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4713     }
4714     if (n_th <= n_places) {
4715       int place = -1;
4716 
4717       if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4718         int S = n_places / n_th;
4719         int s_count, rem, gap, gap_ct;
4720 
4721         place = masters_place;
4722         rem = n_places - n_th * S;
4723         gap = rem ? n_th / rem : 1;
4724         gap_ct = gap;
4725         thidx = n_th;
4726         if (update_master_only == 1)
4727           thidx = 1;
4728         for (f = 0; f < thidx; f++) {
4729           kmp_info_t *th = team->t.t_threads[f];
4730           KMP_DEBUG_ASSERT(th != NULL);
4731 
4732           th->th.th_first_place = place;
4733           th->th.th_new_place = place;
4734           if (__kmp_display_affinity && place != th->th.th_current_place &&
4735               team->t.t_display_affinity != 1) {
4736             team->t.t_display_affinity = 1;
4737           }
4738           s_count = 1;
4739           while (s_count < S) {
4740             if (place == last_place) {
4741               place = first_place;
4742             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4743               place = 0;
4744             } else {
4745               place++;
4746             }
4747             s_count++;
4748           }
4749           if (rem && (gap_ct == gap)) {
4750             if (place == last_place) {
4751               place = first_place;
4752             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4753               place = 0;
4754             } else {
4755               place++;
4756             }
4757             rem--;
4758             gap_ct = 0;
4759           }
4760           th->th.th_last_place = place;
4761           gap_ct++;
4762 
4763           if (place == last_place) {
4764             place = first_place;
4765           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4766             place = 0;
4767           } else {
4768             place++;
4769           }
4770 
4771           KA_TRACE(100,
4772                    ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4773                     "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4774                     __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4775                     f, th->th.th_new_place, th->th.th_first_place,
4776                     th->th.th_last_place, __kmp_affinity_num_masks));
4777         }
4778       } else {
        /* Having a uniform space of available computation places, we can
           create T partitions of round(P/T) size and put threads into the
           first place of each partition. */
4782         double current = static_cast<double>(masters_place);
4783         double spacing =
4784             (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
4785         int first, last;
4786         kmp_info_t *th;
4787 
4788         thidx = n_th + 1;
4789         if (update_master_only == 1)
4790           thidx = 1;
4791         for (f = 0; f < thidx; f++) {
4792           first = static_cast<int>(current);
4793           last = static_cast<int>(current + spacing) - 1;
4794           KMP_DEBUG_ASSERT(last >= first);
4795           if (first >= n_places) {
4796             if (masters_place) {
4797               first -= n_places;
4798               last -= n_places;
4799               if (first == (masters_place + 1)) {
4800                 KMP_DEBUG_ASSERT(f == n_th);
4801                 first--;
4802               }
4803               if (last == masters_place) {
4804                 KMP_DEBUG_ASSERT(f == (n_th - 1));
4805                 last--;
4806               }
4807             } else {
4808               KMP_DEBUG_ASSERT(f == n_th);
4809               first = 0;
4810               last = 0;
4811             }
4812           }
4813           if (last >= n_places) {
4814             last = (n_places - 1);
4815           }
4816           place = first;
4817           current += spacing;
4818           if (f < n_th) {
4819             KMP_DEBUG_ASSERT(0 <= first);
4820             KMP_DEBUG_ASSERT(n_places > first);
4821             KMP_DEBUG_ASSERT(0 <= last);
4822             KMP_DEBUG_ASSERT(n_places > last);
4823             KMP_DEBUG_ASSERT(last_place >= first_place);
4824             th = team->t.t_threads[f];
4825             KMP_DEBUG_ASSERT(th);
4826             th->th.th_first_place = first;
4827             th->th.th_new_place = place;
4828             th->th.th_last_place = last;
4829             if (__kmp_display_affinity && place != th->th.th_current_place &&
4830                 team->t.t_display_affinity != 1) {
4831               team->t.t_display_affinity = 1;
4832             }
4833             KA_TRACE(100,
4834                      ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4835                       "partition = [%d,%d], spacing = %.4f\n",
4836                       __kmp_gtid_from_thread(team->t.t_threads[f]),
4837                       team->t.t_id, f, th->th.th_new_place,
4838                       th->th.th_first_place, th->th.th_last_place, spacing));
4839           }
4840         }
4841       }
4842       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4843     } else {
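      // More threads than places: fall back to packing threads onto places
      // as in proc_bind_close, but give each thread a single-place partition.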
4844       int S, rem, gap, s_count;
4845       S = n_th / n_places;
4846       s_count = 0;
4847       rem = n_th - (S * n_places);
4848       gap = rem > 0 ? n_places / rem : n_places;
4849       int place = masters_place;
4850       int gap_ct = gap;
4851       thidx = n_th;
4852       if (update_master_only == 1)
4853         thidx = 1;
4854       for (f = 0; f < thidx; f++) {
4855         kmp_info_t *th = team->t.t_threads[f];
4856         KMP_DEBUG_ASSERT(th != NULL);
4857 
4858         th->th.th_first_place = place;
4859         th->th.th_last_place = place;
4860         th->th.th_new_place = place;
4861         if (__kmp_display_affinity && place != th->th.th_current_place &&
4862             team->t.t_display_affinity != 1) {
4863           team->t.t_display_affinity = 1;
4864         }
4865         s_count++;
4866 
4867         if ((s_count == S) && rem && (gap_ct == gap)) {
4868           // do nothing, add an extra thread to place on next iteration
4869         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4870           // we added an extra thread to this place; move on to next place
4871           if (place == last_place) {
4872             place = first_place;
4873           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4874             place = 0;
4875           } else {
4876             place++;
4877           }
4878           s_count = 0;
4879           gap_ct = 1;
4880           rem--;
4881         } else if (s_count == S) { // place is full; don't add extra thread
4882           if (place == last_place) {
4883             place = first_place;
4884           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4885             place = 0;
4886           } else {
4887             place++;
4888           }
4889           gap_ct++;
4890           s_count = 0;
4891         }
4892 
4893         KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4894                        "partition = [%d,%d]\n",
4895                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4896                        team->t.t_id, f, th->th.th_new_place,
4897                        th->th.th_first_place, th->th.th_last_place));
4898       }
4899       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4900     }
4901   } break;
4902 
4903   default:
4904     break;
4905   }
4906 
4907   KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4908 }
4909 
4910 #endif // KMP_AFFINITY_SUPPORTED
4911 
/* Allocate a new team data structure to use. Take one off of the free pool if
   available. */
4914 kmp_team_t *
4915 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4916 #if OMPT_SUPPORT
4917                     ompt_data_t ompt_parallel_data,
4918 #endif
4919                     kmp_proc_bind_t new_proc_bind,
4920                     kmp_internal_control_t *new_icvs,
4921                     int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4922   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4923   int f;
4924   kmp_team_t *team;
4925   int use_hot_team = !root->r.r_active;
4926   int level = 0;
4927 
4928   KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4929   KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4930   KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4931   KMP_MB();
4932 
4933 #if KMP_NESTED_HOT_TEAMS
4934   kmp_hot_team_ptr_t *hot_teams;
4935   if (master) {
4936     team = master->th.th_team;
4937     level = team->t.t_active_level;
4938     if (master->th.th_teams_microtask) { // in teams construct?
4939       if (master->th.th_teams_size.nteams > 1 &&
4940           ( // #teams > 1
4941               team->t.t_pkfn ==
4942                   (microtask_t)__kmp_teams_master || // inner fork of the teams
4943               master->th.th_teams_level <
4944                   team->t.t_level)) { // or nested parallel inside the teams
        ++level; // don't increment if #teams==1, or for the outer fork of the
        // teams; increment otherwise
4947       }
4948     }
4949     hot_teams = master->th.th_hot_teams;
4950     if (level < __kmp_hot_teams_max_level && hot_teams &&
4951         hot_teams[level].hot_team) {
4952       // hot team has already been allocated for given level
4953       use_hot_team = 1;
4954     } else {
4955       use_hot_team = 0;
4956     }
4957   } else {
4958     // check we won't access uninitialized hot_teams, just in case
4959     KMP_DEBUG_ASSERT(new_nproc == 1);
4960   }
4961 #endif
4962   // Optimization to use a "hot" team
4963   if (use_hot_team && new_nproc > 1) {
4964     KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
4965 #if KMP_NESTED_HOT_TEAMS
4966     team = hot_teams[level].hot_team;
4967 #else
4968     team = root->r.r_hot_team;
4969 #endif
4970 #if KMP_DEBUG
4971     if (__kmp_tasking_mode != tskm_immediate_exec) {
4972       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
4973                     "task_team[1] = %p before reinit\n",
4974                     team->t.t_task_team[0], team->t.t_task_team[1]));
4975     }
4976 #endif
4977 
4978     // Has the number of threads changed?
4979     /* Let's assume the most common case is that the number of threads is
4980        unchanged, and put that case first. */
4981     if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
4982       KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
4983       // This case can mean that omp_set_num_threads() was called and the hot
4984       // team size was already reduced, so we check the special flag
4985       if (team->t.t_size_changed == -1) {
4986         team->t.t_size_changed = 1;
4987       } else {
4988         KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
4989       }
4990 
4991       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4992       kmp_r_sched_t new_sched = new_icvs->sched;
4993       // set master's schedule as new run-time schedule
4994       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
4995 
4996       __kmp_reinitialize_team(team, new_icvs,
4997                               root->r.r_uber_thread->th.th_ident);
4998 
4999       KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5000                     team->t.t_threads[0], team));
5001       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5002 
5003 #if KMP_AFFINITY_SUPPORTED
5004       if ((team->t.t_size_changed == 0) &&
5005           (team->t.t_proc_bind == new_proc_bind)) {
5006         if (new_proc_bind == proc_bind_spread) {
5007           __kmp_partition_places(
5008               team, 1); // add flag to update only master for spread
5009         }
5010         KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5011                        "proc_bind = %d, partition = [%d,%d]\n",
5012                        team->t.t_id, new_proc_bind, team->t.t_first_place,
5013                        team->t.t_last_place));
5014       } else {
5015         KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5016         __kmp_partition_places(team);
5017       }
5018 #else
5019       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5020 #endif /* KMP_AFFINITY_SUPPORTED */
5021     } else if (team->t.t_nproc > new_nproc) {
5022       KA_TRACE(20,
5023                ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5024                 new_nproc));
5025 
5026       team->t.t_size_changed = 1;
5027 #if KMP_NESTED_HOT_TEAMS
5028       if (__kmp_hot_teams_mode == 0) {
        // AC: the saved number of threads should correspond to the team's
        // value in this mode; it can be bigger in mode 1, when the hot team
        // has threads in reserve
5031         KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5032         hot_teams[level].hot_team_nth = new_nproc;
5033 #endif // KMP_NESTED_HOT_TEAMS
5034         /* release the extra threads we don't need any more */
5035         for (f = new_nproc; f < team->t.t_nproc; f++) {
5036           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5037           if (__kmp_tasking_mode != tskm_immediate_exec) {
5038             // When decreasing team size, threads no longer in the team should
5039             // unref task team.
5040             team->t.t_threads[f]->th.th_task_team = NULL;
5041           }
5042           __kmp_free_thread(team->t.t_threads[f]);
5043           team->t.t_threads[f] = NULL;
5044         }
5045 #if KMP_NESTED_HOT_TEAMS
5046       } // (__kmp_hot_teams_mode == 0)
5047       else {
5048         // When keeping extra threads in team, switch threads to wait on own
5049         // b_go flag
5050         for (f = new_nproc; f < team->t.t_nproc; ++f) {
5051           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5052           kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5053           for (int b = 0; b < bs_last_barrier; ++b) {
5054             if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5055               balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5056             }
5057             KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5058           }
5059         }
5060       }
5061 #endif // KMP_NESTED_HOT_TEAMS
5062       team->t.t_nproc = new_nproc;
5063       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5064       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5065       __kmp_reinitialize_team(team, new_icvs,
5066                               root->r.r_uber_thread->th.th_ident);
5067 
5068       // Update remaining threads
5069       for (f = 0; f < new_nproc; ++f) {
5070         team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5071       }
5072 
5073       // restore the current task state of the master thread: should be the
5074       // implicit task
5075       KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5076                     team->t.t_threads[0], team));
5077 
5078       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5079 
5080 #ifdef KMP_DEBUG
5081       for (f = 0; f < team->t.t_nproc; f++) {
5082         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5083                          team->t.t_threads[f]->th.th_team_nproc ==
5084                              team->t.t_nproc);
5085       }
5086 #endif
5087 
5088       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5089 #if KMP_AFFINITY_SUPPORTED
5090       __kmp_partition_places(team);
5091 #endif
5092     } else { // team->t.t_nproc < new_nproc
5093 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5094       kmp_affin_mask_t *old_mask;
5095       if (KMP_AFFINITY_CAPABLE()) {
5096         KMP_CPU_ALLOC(old_mask);
5097       }
5098 #endif
5099 
5100       KA_TRACE(20,
5101                ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5102                 new_nproc));
5103 
5104       team->t.t_size_changed = 1;
5105 
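      // With nested hot teams, first reuse any threads the hot team kept in
      // reserve from a previous, larger size before allocating new workers.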
5106 #if KMP_NESTED_HOT_TEAMS
5107       int avail_threads = hot_teams[level].hot_team_nth;
5108       if (new_nproc < avail_threads)
5109         avail_threads = new_nproc;
5110       kmp_info_t **other_threads = team->t.t_threads;
5111       for (f = team->t.t_nproc; f < avail_threads; ++f) {
5112         // Adjust barrier data of reserved threads (if any) of the team
5113         // Other data will be set in __kmp_initialize_info() below.
5114         int b;
5115         kmp_balign_t *balign = other_threads[f]->th.th_bar;
5116         for (b = 0; b < bs_last_barrier; ++b) {
5117           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5118           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5119 #if USE_DEBUGGER
5120           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5121 #endif
5122         }
5123       }
5124       if (hot_teams[level].hot_team_nth >= new_nproc) {
        // we have all needed threads in reserve, no need to allocate any;
        // this is only possible in mode 1 (there can be no reserved threads
        // in mode 0)
5127         KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5128         team->t.t_nproc = new_nproc; // just get reserved threads involved
5129       } else {
5130         // we may have some threads in reserve, but not enough
5131         team->t.t_nproc =
5132             hot_teams[level]
5133                 .hot_team_nth; // get reserved threads involved if any
5134         hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5135 #endif // KMP_NESTED_HOT_TEAMS
5136         if (team->t.t_max_nproc < new_nproc) {
5137           /* reallocate larger arrays */
5138           __kmp_reallocate_team_arrays(team, new_nproc);
5139           __kmp_reinitialize_team(team, new_icvs, NULL);
5140         }
5141 
5142 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
        /* Temporarily set the full mask for the master thread before creating
           the workers. The reason is that workers inherit their affinity from
           the master, so if a lot of workers are created quickly on a single
           core, they don't get a chance to set their own affinity for a long
           time. */
5147         __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5148 #endif
5149 
5150         /* allocate new threads for the hot team */
5151         for (f = team->t.t_nproc; f < new_nproc; f++) {
5152           kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5153           KMP_DEBUG_ASSERT(new_worker);
5154           team->t.t_threads[f] = new_worker;
5155 
5156           KA_TRACE(20,
5157                    ("__kmp_allocate_team: team %d init T#%d arrived: "
5158                     "join=%llu, plain=%llu\n",
5159                     team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5160                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5161                     team->t.t_bar[bs_plain_barrier].b_arrived));
5162 
5163           { // Initialize barrier data for new threads.
5164             int b;
5165             kmp_balign_t *balign = new_worker->th.th_bar;
5166             for (b = 0; b < bs_last_barrier; ++b) {
5167               balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5168               KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5169                                KMP_BARRIER_PARENT_FLAG);
5170 #if USE_DEBUGGER
5171               balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5172 #endif
5173             }
5174           }
5175         }
5176 
5177 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5178         if (KMP_AFFINITY_CAPABLE()) {
5179           /* Restore initial master thread's affinity mask */
5180           __kmp_set_system_affinity(old_mask, TRUE);
5181           KMP_CPU_FREE(old_mask);
5182         }
5183 #endif
5184 #if KMP_NESTED_HOT_TEAMS
5185       } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5186 #endif // KMP_NESTED_HOT_TEAMS
      /* make sure everyone is synchronized */
5188       int old_nproc = team->t.t_nproc; // save old value and use to update only
5189       // new threads below
5190       __kmp_initialize_team(team, new_nproc, new_icvs,
5191                             root->r.r_uber_thread->th.th_ident);
5192 
5193       /* reinitialize the threads */
5194       KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5195       for (f = 0; f < team->t.t_nproc; ++f)
5196         __kmp_initialize_info(team->t.t_threads[f], team, f,
5197                               __kmp_gtid_from_tid(f, team));
5198 
5199       if (level) { // set th_task_state for new threads in nested hot team
5200         // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5201         // only need to set the th_task_state for the new threads. th_task_state
5202         // for master thread will not be accurate until after this in
5203         // __kmp_fork_call(), so we look to the master's memo_stack to get the
5204         // correct value.
5205         for (f = old_nproc; f < team->t.t_nproc; ++f)
5206           team->t.t_threads[f]->th.th_task_state =
5207               team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5208       } else { // set th_task_state for new threads in non-nested hot team
5209         int old_state =
5210             team->t.t_threads[0]->th.th_task_state; // copy master's state
5211         for (f = old_nproc; f < team->t.t_nproc; ++f)
5212           team->t.t_threads[f]->th.th_task_state = old_state;
5213       }
5214 
5215 #ifdef KMP_DEBUG
5216       for (f = 0; f < team->t.t_nproc; ++f) {
5217         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5218                          team->t.t_threads[f]->th.th_team_nproc ==
5219                              team->t.t_nproc);
5220       }
5221 #endif
5222 
5223       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5224 #if KMP_AFFINITY_SUPPORTED
5225       __kmp_partition_places(team);
5226 #endif
5227     } // Check changes in number of threads
5228 
5229     kmp_info_t *master = team->t.t_threads[0];
5230     if (master->th.th_teams_microtask) {
5231       for (f = 1; f < new_nproc; ++f) {
5232         // propagate teams construct specific info to workers
5233         kmp_info_t *thr = team->t.t_threads[f];
5234         thr->th.th_teams_microtask = master->th.th_teams_microtask;
5235         thr->th.th_teams_level = master->th.th_teams_level;
5236         thr->th.th_teams_size = master->th.th_teams_size;
5237       }
5238     }
5239 #if KMP_NESTED_HOT_TEAMS
5240     if (level) {
5241       // Sync barrier state for nested hot teams, not needed for outermost hot
5242       // team.
5243       for (f = 1; f < new_nproc; ++f) {
5244         kmp_info_t *thr = team->t.t_threads[f];
5245         int b;
5246         kmp_balign_t *balign = thr->th.th_bar;
5247         for (b = 0; b < bs_last_barrier; ++b) {
5248           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5249           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5250 #if USE_DEBUGGER
5251           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5252 #endif
5253         }
5254       }
5255     }
5256 #endif // KMP_NESTED_HOT_TEAMS
5257 
5258     /* reallocate space for arguments if necessary */
5259     __kmp_alloc_argv_entries(argc, team, TRUE);
5260     KMP_CHECK_UPDATE(team->t.t_argc, argc);
5261     // The hot team re-uses the previous task team,
5262     // if untouched during the previous release->gather phase.
5263 
5264     KF_TRACE(10, (" hot_team = %p\n", team));
5265 
5266 #if KMP_DEBUG
5267     if (__kmp_tasking_mode != tskm_immediate_exec) {
5268       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5269                     "task_team[1] = %p after reinit\n",
5270                     team->t.t_task_team[0], team->t.t_task_team[1]));
5271     }
5272 #endif
5273 
5274 #if OMPT_SUPPORT
5275     __ompt_team_assign_id(team, ompt_parallel_data);
5276 #endif
5277 
5278     KMP_MB();
5279 
5280     return team;
5281   }
5282 
5283   /* next, let's try to take one from the team pool */
5284   KMP_MB();
5285   for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5286     /* TODO: consider resizing undersized teams instead of reaping them, now
5287        that we have a resizing mechanism */
5288     if (team->t.t_max_nproc >= max_nproc) {
5289       /* take this team from the team pool */
5290       __kmp_team_pool = team->t.t_next_pool;
5291 
5292       /* setup the team for fresh use */
5293       __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5294 
5295       KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5296                     "task_team[1] %p to NULL\n",
5297                     &team->t.t_task_team[0], &team->t.t_task_team[1]));
5298       team->t.t_task_team[0] = NULL;
5299       team->t.t_task_team[1] = NULL;
5300 
5301       /* reallocate space for arguments if necessary */
5302       __kmp_alloc_argv_entries(argc, team, TRUE);
5303       KMP_CHECK_UPDATE(team->t.t_argc, argc);
5304 
5305       KA_TRACE(
5306           20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5307                team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5308       { // Initialize barrier data.
5309         int b;
5310         for (b = 0; b < bs_last_barrier; ++b) {
5311           team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5312 #if USE_DEBUGGER
5313           team->t.t_bar[b].b_master_arrived = 0;
5314           team->t.t_bar[b].b_team_arrived = 0;
5315 #endif
5316         }
5317       }
5318 
5319       team->t.t_proc_bind = new_proc_bind;
5320 
5321       KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5322                     team->t.t_id));
5323 
5324 #if OMPT_SUPPORT
5325       __ompt_team_assign_id(team, ompt_parallel_data);
5326 #endif
5327 
5328       KMP_MB();
5329 
5330       return team;
5331     }
5332 
5333     /* reap team if it is too small, then loop back and check the next one */
    // not sure if this is wise, but it will be redone during the hot-teams
    // rewrite.
5336     /* TODO: Use technique to find the right size hot-team, don't reap them */
5337     team = __kmp_reap_team(team);
5338     __kmp_team_pool = team;
5339   }
5340 
5341   /* nothing available in the pool, no matter, make a new team! */
5342   KMP_MB();
5343   team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5344 
5345   /* and set it up */
5346   team->t.t_max_nproc = max_nproc;
  /* NOTE well: for some reason, allocating one big buffer and dividing it up
     seems to really hurt performance on the P4, so let's not use this */
5349   __kmp_allocate_team_arrays(team, max_nproc);
5350 
5351   KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5352   __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5353 
5354   KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5355                 "%p to NULL\n",
5356                 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5357   team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5358   // memory, no need to duplicate
5359   team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5360   // memory, no need to duplicate
5361 
5362   if (__kmp_storage_map) {
5363     __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5364   }
5365 
5366   /* allocate space for arguments */
5367   __kmp_alloc_argv_entries(argc, team, FALSE);
5368   team->t.t_argc = argc;
5369 
5370   KA_TRACE(20,
5371            ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5372             team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5373   { // Initialize barrier data.
5374     int b;
5375     for (b = 0; b < bs_last_barrier; ++b) {
5376       team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5377 #if USE_DEBUGGER
5378       team->t.t_bar[b].b_master_arrived = 0;
5379       team->t.t_bar[b].b_team_arrived = 0;
5380 #endif
5381     }
5382   }
5383 
5384   team->t.t_proc_bind = new_proc_bind;
5385 
5386 #if OMPT_SUPPORT
5387   __ompt_team_assign_id(team, ompt_parallel_data);
5388   team->t.ompt_serialized_team_info = NULL;
5389 #endif
5390 
5391   KMP_MB();
5392 
5393   KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5394                 team->t.t_id));
5395 
5396   return team;
5397 }
5398 
5399 /* TODO implement hot-teams at all levels */
5400 /* TODO implement lazy thread release on demand (disband request) */
5401 
5402 /* free the team.  return it to the team pool.  release all the threads
5403  * associated with it */
5404 void __kmp_free_team(kmp_root_t *root,
5405                      kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5406   int f;
5407   KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5408                 team->t.t_id));
5409 
5410   /* verify state */
5411   KMP_DEBUG_ASSERT(root);
5412   KMP_DEBUG_ASSERT(team);
5413   KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5414   KMP_DEBUG_ASSERT(team->t.t_threads);
5415 
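  // A team is "hot" if it is the root's hot team or, with nested hot teams,
  // a cached hot team for this nesting level; hot teams keep their threads
  // for reuse instead of releasing them here.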
5416   int use_hot_team = team == root->r.r_hot_team;
5417 #if KMP_NESTED_HOT_TEAMS
5418   int level;
5419   kmp_hot_team_ptr_t *hot_teams;
5420   if (master) {
5421     level = team->t.t_active_level - 1;
5422     if (master->th.th_teams_microtask) { // in teams construct?
5423       if (master->th.th_teams_size.nteams > 1) {
5424         ++level; // level was not increased in teams construct for
5425         // team_of_masters
5426       }
5427       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5428           master->th.th_teams_level == team->t.t_level) {
5429         ++level; // level was not increased in teams construct for
5430         // team_of_workers before the parallel
5431       } // team->t.t_level will be increased inside parallel
5432     }
5433     hot_teams = master->th.th_hot_teams;
5434     if (level < __kmp_hot_teams_max_level) {
5435       KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5436       use_hot_team = 1;
5437     }
5438   }
5439 #endif // KMP_NESTED_HOT_TEAMS
5440 
5441   /* team is done working */
5442   TCW_SYNC_PTR(team->t.t_pkfn,
5443                NULL); // Important for Debugging Support Library.
5444 #if KMP_OS_WINDOWS
5445   team->t.t_copyin_counter = 0; // init counter for possible reuse
5446 #endif
5447   // Do not reset pointer to parent team to NULL for hot teams.
5448 
5449   /* if we are non-hot team, release our threads */
5450   if (!use_hot_team) {
5451     if (__kmp_tasking_mode != tskm_immediate_exec) {
5452       // Wait for threads to reach reapable state
5453       for (f = 1; f < team->t.t_nproc; ++f) {
5454         KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5455         kmp_info_t *th = team->t.t_threads[f];
5456         volatile kmp_uint32 *state = &th->th.th_reap_state;
5457         while (*state != KMP_SAFE_TO_REAP) {
5458 #if KMP_OS_WINDOWS
5459           // On Windows a thread can be killed at any time, check this
5460           DWORD ecode;
5461           if (!__kmp_is_thread_alive(th, &ecode)) {
5462             *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5463             break;
5464           }
5465 #endif
5466           // first check if thread is sleeping
5467           kmp_flag_64 fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5468           if (fl.is_sleeping())
5469             fl.resume(__kmp_gtid_from_thread(th));
5470           KMP_CPU_PAUSE();
5471         }
5472       }
5473 
5474       // Delete task teams
5475       int tt_idx;
5476       for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5477         kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5478         if (task_team != NULL) {
5479           for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5480             KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5481             team->t.t_threads[f]->th.th_task_team = NULL;
5482           }
5483           KA_TRACE(
5484               20,
5485               ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5486                __kmp_get_gtid(), task_team, team->t.t_id));
5487 #if KMP_NESTED_HOT_TEAMS
5488           __kmp_free_task_team(master, task_team);
5489 #endif
5490           team->t.t_task_team[tt_idx] = NULL;
5491         }
5492       }
5493     }
5494 
5495     // Reset pointer to parent team only for non-hot teams.
5496     team->t.t_parent = NULL;
5497     team->t.t_level = 0;
5498     team->t.t_active_level = 0;
5499 
5500     /* free the worker threads */
5501     for (f = 1; f < team->t.t_nproc; ++f) {
5502       KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5503       __kmp_free_thread(team->t.t_threads[f]);
5504       team->t.t_threads[f] = NULL;
5505     }
5506 
5507     /* put the team back in the team pool */
5508     /* TODO limit size of team pool, call reap_team if pool too large */
5509     team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5510     __kmp_team_pool = (volatile kmp_team_t *)team;
5511   } else { // Check if team was created for the masters in a teams construct
5512     // See if first worker is a CG root
5513     KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5514                      team->t.t_threads[1]->th.th_cg_roots);
5515     if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5516       // Clean up the CG root nodes on workers so that this team can be re-used
5517       for (f = 1; f < team->t.t_nproc; ++f) {
5518         kmp_info_t *thr = team->t.t_threads[f];
5519         KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5520                          thr->th.th_cg_roots->cg_root == thr);
5521         // Pop current CG root off list
5522         kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5523         thr->th.th_cg_roots = tmp->up;
5524         KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5525                        " up to node %p. cg_nthreads was %d\n",
5526                        thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5527         int i = tmp->cg_nthreads--;
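        // i holds the value before the post-decrement, so i == 1 means this
        // thread was the last member of the contention group.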
5528         if (i == 1) {
5529           __kmp_free(tmp); // free CG if we are the last thread in it
5530         }
5531         // Restore current task's thread_limit from CG root
5532         if (thr->th.th_cg_roots)
5533           thr->th.th_current_task->td_icvs.thread_limit =
5534               thr->th.th_cg_roots->cg_thread_limit;
5535       }
5536     }
5537   }
5538 
5539   KMP_MB();
5540 }
5541 
5542 /* reap the team.  destroy it, reclaim all its resources and free its memory */
5543 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5544   kmp_team_t *next_pool = team->t.t_next_pool;
5545 
5546   KMP_DEBUG_ASSERT(team);
5547   KMP_DEBUG_ASSERT(team->t.t_dispatch);
5548   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5549   KMP_DEBUG_ASSERT(team->t.t_threads);
5550   KMP_DEBUG_ASSERT(team->t.t_argv);
5551 
5552   /* TODO clean the threads that are a part of this? */
5553 
5554   /* free stuff */
5555   __kmp_free_team_arrays(team);
5556   if (team->t.t_argv != &team->t.t_inline_argv[0])
5557     __kmp_free((void *)team->t.t_argv);
5558   __kmp_free(team);
5559 
5560   KMP_MB();
5561   return next_pool;
5562 }
5563 
5564 // Free the thread.  Don't reap it, just place it on the pool of available
5565 // threads.
5566 //
5567 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5568 // binding for the affinity mechanism to be useful.
5569 //
5570 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5571 // However, we want to avoid a potential performance problem by always
5572 // scanning through the list to find the correct point at which to insert
5573 // the thread (potential N**2 behavior).  To do this we keep track of the
5574 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5575 // With single-level parallelism, threads will always be added to the tail
5576 // of the list, kept track of by __kmp_thread_pool_insert_pt.  With nested
5577 // parallelism, all bets are off and we may need to scan through the entire
5578 // free list.
5579 //
5580 // This change also has a potentially large performance benefit, for some
5581 // applications.  Previously, as threads were freed from the hot team, they
5582 // would be placed back on the free list in inverse order.  If the hot team
// grew back to its original size, then the freed threads would be placed
5584 // back on the hot team in reverse order.  This could cause bad cache
5585 // locality problems on programs where the size of the hot team regularly
5586 // grew and shrunk.
5587 //
5588 // Now, for single-level parallelism, the OMP tid is always == gtid.
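//
// Illustrative example (hypothetical gtids, not from any real run): if the
// pool currently holds threads with gtids {2, 3, 5} and the thread with
// gtid 4 is freed, it is inserted between 3 and 5 so the list stays sorted,
// and __kmp_thread_pool_insert_pt is left pointing at the newly inserted
// entry.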
5589 void __kmp_free_thread(kmp_info_t *this_th) {
5590   int gtid;
5591   kmp_info_t **scan;
5592 
5593   KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5594                 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5595 
5596   KMP_DEBUG_ASSERT(this_th);
5597 
  // When moving a thread to the pool, switch it to wait on its own b_go flag
  // and reset its barrier team pointers to NULL (uninitialized).
5600   int b;
5601   kmp_balign_t *balign = this_th->th.th_bar;
5602   for (b = 0; b < bs_last_barrier; ++b) {
5603     if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5604       balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5605     balign[b].bb.team = NULL;
5606     balign[b].bb.leaf_kids = 0;
5607   }
5608   this_th->th.th_task_state = 0;
5609   this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5610 
5611   /* put thread back on the free pool */
5612   TCW_PTR(this_th->th.th_team, NULL);
5613   TCW_PTR(this_th->th.th_root, NULL);
5614   TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5615 
5616   while (this_th->th.th_cg_roots) {
5617     this_th->th.th_cg_roots->cg_nthreads--;
5618     KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5619                    " %p of thread  %p to %d\n",
5620                    this_th, this_th->th.th_cg_roots,
5621                    this_th->th.th_cg_roots->cg_root,
5622                    this_th->th.th_cg_roots->cg_nthreads));
5623     kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5624     if (tmp->cg_root == this_th) { // Thread is a cg_root
5625       KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5626       KA_TRACE(
5627           5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5628       this_th->th.th_cg_roots = tmp->up;
5629       __kmp_free(tmp);
5630     } else { // Worker thread
5631       if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5632         __kmp_free(tmp);
5633       }
5634       this_th->th.th_cg_roots = NULL;
5635       break;
5636     }
5637   }
5638 
5639   /* If the implicit task assigned to this thread can be used by other threads
5640    * -> multiple threads can share the data and try to free the task at
5641    * __kmp_reap_thread at exit. This duplicate use of the task data can happen
   * with higher probability when the hot team is disabled, but can occur even when
5643    * the hot team is enabled */
5644   __kmp_free_implicit_task(this_th);
5645   this_th->th.th_current_task = NULL;
5646 
5647   // If the __kmp_thread_pool_insert_pt is already past the new insert
5648   // point, then we need to re-scan the entire list.
5649   gtid = this_th->th.th_info.ds.ds_gtid;
5650   if (__kmp_thread_pool_insert_pt != NULL) {
5651     KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5652     if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5653       __kmp_thread_pool_insert_pt = NULL;
5654     }
5655   }
5656 
5657   // Scan down the list to find the place to insert the thread.
5658   // scan is the address of a link in the list, possibly the address of
5659   // __kmp_thread_pool itself.
5660   //
5661   // In the absence of nested parallelism, the for loop will have 0 iterations.
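  // (With single-level parallelism the scan starts either at an empty pool or
  // at the tail link tracked by __kmp_thread_pool_insert_pt, both of which are
  // NULL links, so the loop body never executes.)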
5662   if (__kmp_thread_pool_insert_pt != NULL) {
5663     scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5664   } else {
5665     scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5666   }
5667   for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5668        scan = &((*scan)->th.th_next_pool))
5669     ;
5670 
5671   // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5672   // to its address.
5673   TCW_PTR(this_th->th.th_next_pool, *scan);
5674   __kmp_thread_pool_insert_pt = *scan = this_th;
5675   KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5676                    (this_th->th.th_info.ds.ds_gtid <
5677                     this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5678   TCW_4(this_th->th.th_in_pool, TRUE);
5679   __kmp_suspend_initialize_thread(this_th);
5680   __kmp_lock_suspend_mx(this_th);
5681   if (this_th->th.th_active == TRUE) {
5682     KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5683     this_th->th.th_active_in_pool = TRUE;
5684   }
5685 #if KMP_DEBUG
5686   else {
5687     KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5688   }
5689 #endif
5690   __kmp_unlock_suspend_mx(this_th);
5691 
5692   TCW_4(__kmp_nth, __kmp_nth - 1);
5693 
5694 #ifdef KMP_ADJUST_BLOCKTIME
5695   /* Adjust blocktime back to user setting or default if necessary */
5696   /* Middle initialization might never have occurred                */
5697   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5698     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5699     if (__kmp_nth <= __kmp_avail_proc) {
5700       __kmp_zero_bt = FALSE;
5701     }
5702   }
5703 #endif /* KMP_ADJUST_BLOCKTIME */
5704 
5705   KMP_MB();
5706 }
5707 
5708 /* ------------------------------------------------------------------------ */
5709 
5710 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5711   int gtid = this_thr->th.th_info.ds.ds_gtid;
5712   /*    void                 *stack_data;*/
5713   kmp_team_t **volatile pteam;
5714 
5715   KMP_MB();
5716   KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5717 
5718   if (__kmp_env_consistency_check) {
5719     this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5720   }
5721 
5722 #if OMPT_SUPPORT
5723   ompt_data_t *thread_data;
5724   if (ompt_enabled.enabled) {
5725     thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5726     *thread_data = ompt_data_none;
5727 
5728     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5729     this_thr->th.ompt_thread_info.wait_id = 0;
5730     this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5731     this_thr->th.ompt_thread_info.parallel_flags = 0;
5732     if (ompt_enabled.ompt_callback_thread_begin) {
5733       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5734           ompt_thread_worker, thread_data);
5735     }
5736     this_thr->th.ompt_thread_info.state = ompt_state_idle;
5737   }
5738 #endif
5739 
5740   /* This is the place where threads wait for work */
5741   while (!TCR_4(__kmp_global.g.g_done)) {
5742     KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5743     KMP_MB();
5744 
5745     /* wait for work to do */
5746     KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5747 
5748     /* No tid yet since not part of a team */
5749     __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5750 
5751 #if OMPT_SUPPORT
5752     if (ompt_enabled.enabled) {
5753       this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5754     }
5755 #endif
5756 
5757     pteam = &this_thr->th.th_team;
5758 
5759     /* have we been allocated? */
5760     if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5761       /* we were just woken up, so run our new task */
5762       if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5763         int rc;
5764         KA_TRACE(20,
5765                  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5766                   gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5767                   (*pteam)->t.t_pkfn));
5768 
5769         updateHWFPControl(*pteam);
5770 
5771 #if OMPT_SUPPORT
5772         if (ompt_enabled.enabled) {
5773           this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5774         }
5775 #endif
5776 
5777         rc = (*pteam)->t.t_invoke(gtid);
5778         KMP_ASSERT(rc);
5779 
5780         KMP_MB();
5781         KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5782                       gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5783                       (*pteam)->t.t_pkfn));
5784       }
5785 #if OMPT_SUPPORT
5786       if (ompt_enabled.enabled) {
5787         /* no frame set while outside task */
5788         __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
5789 
5790         this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5791       }
5792 #endif
5793       /* join barrier after parallel region */
5794       __kmp_join_barrier(gtid);
5795     }
5796   }
5797   TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5798 
5799 #if OMPT_SUPPORT
5800   if (ompt_enabled.ompt_callback_thread_end) {
5801     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5802   }
5803 #endif
5804 
5805   this_thr->th.th_task_team = NULL;
5806   /* run the destructors for the threadprivate data for this thread */
5807   __kmp_common_destroy_gtid(gtid);
5808 
5809   KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5810   KMP_MB();
5811   return this_thr;
5812 }
5813 
5814 /* ------------------------------------------------------------------------ */
5815 
5816 void __kmp_internal_end_dest(void *specific_gtid) {
5817 #if KMP_COMPILER_ICC
5818 #pragma warning(push)
5819 #pragma warning(disable : 810) // conversion from "void *" to "int" may lose
5820 // significant bits
5821 #endif
5822   // Make sure no significant bits are lost
5823   int gtid = (kmp_intptr_t)specific_gtid - 1;
5824 #if KMP_COMPILER_ICC
5825 #pragma warning(pop)
5826 #endif
5827 
5828   KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
  /* NOTE: the gtid is stored as gtid+1 in the thread-local storage
   * because 0 is reserved for the nothing-stored case */
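  /* For example, a stored value of 1 decodes to gtid 0, while a stored value
     of 0 means no gtid has been stored for this thread. */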
5831 
5832   /* josh: One reason for setting the gtid specific data even when it is being
5833      destroyed by pthread is to allow gtid lookup through thread specific data
5834      (__kmp_gtid_get_specific).  Some of the code, especially stat code,
5835      that gets executed in the call to __kmp_internal_end_thread, actually
5836      gets the gtid through the thread specific data.  Setting it here seems
5837      rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread
5838      to run smoothly.
5839      todo: get rid of this after we remove the dependence on
5840      __kmp_gtid_get_specific  */
5841   if (gtid >= 0 && KMP_UBER_GTID(gtid))
5842     __kmp_gtid_set_specific(gtid);
5843 #ifdef KMP_TDATA_GTID
5844   __kmp_gtid = gtid;
5845 #endif
5846   __kmp_internal_end_thread(gtid);
5847 }
5848 
5849 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5850 
5851 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5852   __kmp_internal_end_atexit();
5853 }
5854 
5855 #endif
5856 
5857 /* [Windows] josh: when the atexit handler is called, there may still be more
5858    than one thread alive */
5859 void __kmp_internal_end_atexit(void) {
5860   KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5861   /* [Windows]
5862      josh: ideally, we want to completely shutdown the library in this atexit
5863      handler, but stat code that depends on thread specific data for gtid fails
5864      because that data becomes unavailable at some point during the shutdown, so
5865      we call __kmp_internal_end_thread instead. We should eventually remove the
5866      dependency on __kmp_get_specific_gtid in the stat code and use
5867      __kmp_internal_end_library to cleanly shutdown the library.
5868 
5869      // TODO: Can some of this comment about GVS be removed?
5870      I suspect that the offending stat code is executed when the calling thread
5871      tries to clean up a dead root thread's data structures, resulting in GVS
5872      code trying to close the GVS structures for that thread, but since the stat
5873      code uses __kmp_get_specific_gtid to get the gtid with the assumption that
     the calling thread is cleaning up itself instead of another thread, it gets
5875      confused. This happens because allowing a thread to unregister and cleanup
5876      another thread is a recent modification for addressing an issue.
5877      Based on the current design (20050722), a thread may end up
5878      trying to unregister another thread only if thread death does not trigger
5879      the calling of __kmp_internal_end_thread.  For Linux* OS, there is the
5880      thread specific data destructor function to detect thread death. For
5881      Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
5882      is nothing.  Thus, the workaround is applicable only for Windows static
5883      stat library. */
5884   __kmp_internal_end_library(-1);
5885 #if KMP_OS_WINDOWS
5886   __kmp_close_console();
5887 #endif
5888 }
5889 
5890 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5891   // It is assumed __kmp_forkjoin_lock is acquired.
5892 
5893   int gtid;
5894 
5895   KMP_DEBUG_ASSERT(thread != NULL);
5896 
5897   gtid = thread->th.th_info.ds.ds_gtid;
5898 
5899   if (!is_root) {
5900     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5901       /* Assume the threads are at the fork barrier here */
5902       KA_TRACE(
5903           20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5904                gtid));
5905       /* Need release fence here to prevent seg faults for tree forkjoin barrier
5906        * (GEH) */
5907       ANNOTATE_HAPPENS_BEFORE(thread);
5908       kmp_flag_64 flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
5909       __kmp_release_64(&flag);
5910     }
5911 
5912     // Terminate OS thread.
5913     __kmp_reap_worker(thread);
5914 
5915     // The thread was killed asynchronously.  If it was actively
5916     // spinning in the thread pool, decrement the global count.
5917     //
5918     // There is a small timing hole here - if the worker thread was just waking
    // up after sleeping in the pool, had reset its th_active_in_pool flag but
5920     // not decremented the global counter __kmp_thread_pool_active_nth yet, then
5921     // the global counter might not get updated.
5922     //
5923     // Currently, this can only happen as the library is unloaded,
5924     // so there are no harmful side effects.
5925     if (thread->th.th_active_in_pool) {
5926       thread->th.th_active_in_pool = FALSE;
5927       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
5928       KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
5929     }
5930   }
5931 
5932   __kmp_free_implicit_task(thread);
5933 
5934 // Free the fast memory for tasking
5935 #if USE_FAST_MEMORY
5936   __kmp_free_fast_memory(thread);
5937 #endif /* USE_FAST_MEMORY */
5938 
5939   __kmp_suspend_uninitialize_thread(thread);
5940 
5941   KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
5942   TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5943 
5944   --__kmp_all_nth;
5945 // __kmp_nth was decremented when thread is added to the pool.
5946 
5947 #ifdef KMP_ADJUST_BLOCKTIME
5948   /* Adjust blocktime back to user setting or default if necessary */
5949   /* Middle initialization might never have occurred                */
5950   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5951     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5952     if (__kmp_nth <= __kmp_avail_proc) {
5953       __kmp_zero_bt = FALSE;
5954     }
5955   }
5956 #endif /* KMP_ADJUST_BLOCKTIME */
5957 
5958   /* free the memory being used */
5959   if (__kmp_env_consistency_check) {
5960     if (thread->th.th_cons) {
5961       __kmp_free_cons_stack(thread->th.th_cons);
5962       thread->th.th_cons = NULL;
5963     }
5964   }
5965 
5966   if (thread->th.th_pri_common != NULL) {
5967     __kmp_free(thread->th.th_pri_common);
5968     thread->th.th_pri_common = NULL;
5969   }
5970 
5971   if (thread->th.th_task_state_memo_stack != NULL) {
5972     __kmp_free(thread->th.th_task_state_memo_stack);
5973     thread->th.th_task_state_memo_stack = NULL;
5974   }
5975 
5976 #if KMP_USE_BGET
5977   if (thread->th.th_local.bget_data != NULL) {
5978     __kmp_finalize_bget(thread);
5979   }
5980 #endif
5981 
5982 #if KMP_AFFINITY_SUPPORTED
5983   if (thread->th.th_affin_mask != NULL) {
5984     KMP_CPU_FREE(thread->th.th_affin_mask);
5985     thread->th.th_affin_mask = NULL;
5986   }
5987 #endif /* KMP_AFFINITY_SUPPORTED */
5988 
5989 #if KMP_USE_HIER_SCHED
5990   if (thread->th.th_hier_bar_data != NULL) {
5991     __kmp_free(thread->th.th_hier_bar_data);
5992     thread->th.th_hier_bar_data = NULL;
5993   }
5994 #endif
5995 
5996   __kmp_reap_team(thread->th.th_serial_team);
5997   thread->th.th_serial_team = NULL;
5998   __kmp_free(thread);
5999 
6000   KMP_MB();
6001 
6002 } // __kmp_reap_thread
6003 
6004 static void __kmp_internal_end(void) {
6005   int i;
6006 
6007   /* First, unregister the library */
6008   __kmp_unregister_library();
6009 
6010 #if KMP_OS_WINDOWS
6011   /* In Win static library, we can't tell when a root actually dies, so we
6012      reclaim the data structures for any root threads that have died but not
6013      unregistered themselves, in order to shut down cleanly.
6014      In Win dynamic library we also can't tell when a thread dies.  */
6015   __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6016 // dead roots
6017 #endif
6018 
6019   for (i = 0; i < __kmp_threads_capacity; i++)
6020     if (__kmp_root[i])
6021       if (__kmp_root[i]->r.r_active)
6022         break;
6023   KMP_MB(); /* Flush all pending memory write invalidates.  */
6024   TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6025 
6026   if (i < __kmp_threads_capacity) {
6027 #if KMP_USE_MONITOR
6028     // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6029     KMP_MB(); /* Flush all pending memory write invalidates.  */
6030 
6031     // Need to check that monitor was initialized before reaping it. If we are
    // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6033     // __kmp_monitor will appear to contain valid data, but it is only valid in
6034     // the parent process, not the child.
6035     // New behavior (201008): instead of keying off of the flag
6036     // __kmp_init_parallel, the monitor thread creation is keyed off
6037     // of the new flag __kmp_init_monitor.
6038     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6039     if (TCR_4(__kmp_init_monitor)) {
6040       __kmp_reap_monitor(&__kmp_monitor);
6041       TCW_4(__kmp_init_monitor, 0);
6042     }
6043     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6044     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6045 #endif // KMP_USE_MONITOR
6046   } else {
6047 /* TODO move this to cleanup code */
6048 #ifdef KMP_DEBUG
6049     /* make sure that everything has properly ended */
6050     for (i = 0; i < __kmp_threads_capacity; i++) {
6051       if (__kmp_root[i]) {
6052         //                    KMP_ASSERT( ! KMP_UBER_GTID( i ) );         // AC:
6053         //                    there can be uber threads alive here
6054         KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6055       }
6056     }
6057 #endif
6058 
6059     KMP_MB();
6060 
6061     // Reap the worker threads.
6062     // This is valid for now, but be careful if threads are reaped sooner.
    while (__kmp_thread_pool != NULL) { // Loop through all the threads in the pool.
6064       // Get the next thread from the pool.
6065       kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6066       __kmp_thread_pool = thread->th.th_next_pool;
6067       // Reap it.
6068       KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6069       thread->th.th_next_pool = NULL;
6070       thread->th.th_in_pool = FALSE;
6071       __kmp_reap_thread(thread, 0);
6072     }
6073     __kmp_thread_pool_insert_pt = NULL;
6074 
6075     // Reap teams.
    while (__kmp_team_pool != NULL) { // Loop through all the teams in the pool.
6077       // Get the next team from the pool.
6078       kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6079       __kmp_team_pool = team->t.t_next_pool;
6080       // Reap it.
6081       team->t.t_next_pool = NULL;
6082       __kmp_reap_team(team);
6083     }
6084 
6085     __kmp_reap_task_teams();
6086 
6087 #if KMP_OS_UNIX
6088     // Threads that are not reaped should not access any resources since they
6089     // are going to be deallocated soon, so the shutdown sequence should wait
6090     // until all threads either exit the final spin-waiting loop or begin
6091     // sleeping after the given blocktime.
6092     for (i = 0; i < __kmp_threads_capacity; i++) {
6093       kmp_info_t *thr = __kmp_threads[i];
6094       while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6095         KMP_CPU_PAUSE();
6096     }
6097 #endif
6098 
6099     for (i = 0; i < __kmp_threads_capacity; ++i) {
6100       // TBD: Add some checking...
6101       // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6102     }
6103 
6104     /* Make sure all threadprivate destructors get run by joining with all
6105        worker threads before resetting this flag */
6106     TCW_SYNC_4(__kmp_init_common, FALSE);
6107 
6108     KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6109     KMP_MB();
6110 
6111 #if KMP_USE_MONITOR
6112     // See note above: One of the possible fixes for CQ138434 / CQ140126
6113     //
6114     // FIXME: push both code fragments down and CSE them?
6115     // push them into __kmp_cleanup() ?
6116     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6117     if (TCR_4(__kmp_init_monitor)) {
6118       __kmp_reap_monitor(&__kmp_monitor);
6119       TCW_4(__kmp_init_monitor, 0);
6120     }
6121     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6122     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6123 #endif
6124   } /* else !__kmp_global.t_active */
6125   TCW_4(__kmp_init_gtid, FALSE);
6126   KMP_MB(); /* Flush all pending memory write invalidates.  */
6127 
6128   __kmp_cleanup();
6129 #if OMPT_SUPPORT
6130   ompt_fini();
6131 #endif
6132 }
6133 
6134 void __kmp_internal_end_library(int gtid_req) {
6135   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6136   /* this shouldn't be a race condition because __kmp_internal_end() is the
6137      only place to clear __kmp_serial_init */
6138   /* we'll check this later too, after we get the lock */
6139   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6140   // redundant, because the next check will work in any case.
6141   if (__kmp_global.g.g_abort) {
6142     KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6143     /* TODO abort? */
6144     return;
6145   }
6146   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6147     KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6148     return;
6149   }
6150 
6151   KMP_MB(); /* Flush all pending memory write invalidates.  */
6152   /* find out who we are and what we should do */
6153   {
6154     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6155     KA_TRACE(
6156         10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req));
6157     if (gtid == KMP_GTID_SHUTDOWN) {
6158       KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6159                     "already shutdown\n"));
6160       return;
6161     } else if (gtid == KMP_GTID_MONITOR) {
6162       KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6163                     "registered, or system shutdown\n"));
6164       return;
6165     } else if (gtid == KMP_GTID_DNE) {
6166       KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6167                     "shutdown\n"));
6168       /* we don't know who we are, but we may still shutdown the library */
6169     } else if (KMP_UBER_GTID(gtid)) {
6170       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6171       if (__kmp_root[gtid]->r.r_active) {
6172         __kmp_global.g.g_abort = -1;
6173         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6174         __kmp_unregister_library();
6175         KA_TRACE(10,
6176                  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6177                   gtid));
6178         return;
6179       } else {
6180         KA_TRACE(
6181             10,
6182             ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6183         __kmp_unregister_root_current_thread(gtid);
6184       }
6185     } else {
6186 /* worker threads may call this function through the atexit handler, if they
6187  * call exit() */
6188 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6189    TODO: do a thorough shutdown instead */
6190 #ifdef DUMP_DEBUG_ON_EXIT
6191       if (__kmp_debug_buf)
6192         __kmp_dump_debug_buffer();
6193 #endif
      // The unregister-library call was added here when we switched to shared
      // memory on Linux; without it, many files would be left behind in
      // /dev/shm. Clean up the shared memory file before exiting.
6197       __kmp_unregister_library();
6198       return;
6199     }
6200   }
6201   /* synchronize the termination process */
6202   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6203 
6204   /* have we already finished */
6205   if (__kmp_global.g.g_abort) {
6206     KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6207     /* TODO abort? */
6208     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6209     return;
6210   }
6211   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6212     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6213     return;
6214   }
6215 
6216   /* We need this lock to enforce mutex between this reading of
6217      __kmp_threads_capacity and the writing by __kmp_register_root.
6218      Alternatively, we can use a counter of roots that is atomically updated by
6219      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6220      __kmp_internal_end_*.  */
6221   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6222 
6223   /* now we can safely conduct the actual termination */
6224   __kmp_internal_end();
6225 
6226   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6227   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6228 
6229   KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6230 
6231 #ifdef DUMP_DEBUG_ON_EXIT
6232   if (__kmp_debug_buf)
6233     __kmp_dump_debug_buffer();
6234 #endif
6235 
6236 #if KMP_OS_WINDOWS
6237   __kmp_close_console();
6238 #endif
6239 
6240   __kmp_fini_allocator();
6241 
6242 } // __kmp_internal_end_library
6243 
6244 void __kmp_internal_end_thread(int gtid_req) {
6245   int i;
6246 
6247   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6248   /* this shouldn't be a race condition because __kmp_internal_end() is the
6249    * only place to clear __kmp_serial_init */
6250   /* we'll check this later too, after we get the lock */
6251   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6252   // redundant, because the next check will work in any case.
6253   if (__kmp_global.g.g_abort) {
6254     KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6255     /* TODO abort? */
6256     return;
6257   }
6258   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6259     KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6260     return;
6261   }
6262 
6263   KMP_MB(); /* Flush all pending memory write invalidates.  */
6264 
6265   /* find out who we are and what we should do */
6266   {
6267     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6268     KA_TRACE(10,
6269              ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req));
6270     if (gtid == KMP_GTID_SHUTDOWN) {
6271       KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6272                     "already shutdown\n"));
6273       return;
6274     } else if (gtid == KMP_GTID_MONITOR) {
6275       KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6276                     "registered, or system shutdown\n"));
6277       return;
6278     } else if (gtid == KMP_GTID_DNE) {
6279       KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6280                     "shutdown\n"));
6281       return;
6282       /* we don't know who we are */
6283     } else if (KMP_UBER_GTID(gtid)) {
6284       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6285       if (__kmp_root[gtid]->r.r_active) {
6286         __kmp_global.g.g_abort = -1;
6287         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6288         KA_TRACE(10,
6289                  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6290                   gtid));
6291         return;
6292       } else {
6293         KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6294                       gtid));
6295         __kmp_unregister_root_current_thread(gtid);
6296       }
6297     } else {
6298       /* just a worker thread, let's leave */
6299       KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6300 
6301       if (gtid >= 0) {
6302         __kmp_threads[gtid]->th.th_task_team = NULL;
6303       }
6304 
6305       KA_TRACE(10,
6306                ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6307                 gtid));
6308       return;
6309     }
6310   }
6311 #if KMP_DYNAMIC_LIB
6312   if (__kmp_pause_status != kmp_hard_paused)
  // AC: let's not shut down the dynamic library at the exit of an uber thread,
  // because it is better to shut down later in the library destructor.
6315   {
6316     KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6317     return;
6318   }
6319 #endif
6320   /* synchronize the termination process */
6321   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6322 
6323   /* have we already finished */
6324   if (__kmp_global.g.g_abort) {
6325     KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6326     /* TODO abort? */
6327     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6328     return;
6329   }
6330   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6331     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6332     return;
6333   }
6334 
6335   /* We need this lock to enforce mutex between this reading of
6336      __kmp_threads_capacity and the writing by __kmp_register_root.
6337      Alternatively, we can use a counter of roots that is atomically updated by
6338      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6339      __kmp_internal_end_*.  */
6340 
6341   /* should we finish the run-time?  are all siblings done? */
6342   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6343 
6344   for (i = 0; i < __kmp_threads_capacity; ++i) {
6345     if (KMP_UBER_GTID(i)) {
6346       KA_TRACE(
6347           10,
6348           ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6349       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6350       __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6351       return;
6352     }
6353   }
6354 
6355   /* now we can safely conduct the actual termination */
6356 
6357   __kmp_internal_end();
6358 
6359   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6360   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6361 
6362   KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6363 
6364 #ifdef DUMP_DEBUG_ON_EXIT
6365   if (__kmp_debug_buf)
6366     __kmp_dump_debug_buffer();
6367 #endif
6368 } // __kmp_internal_end_thread
6369 
6370 // -----------------------------------------------------------------------------
6371 // Library registration stuff.
6372 
6373 static long __kmp_registration_flag = 0;
6374 // Random value used to indicate library initialization.
6375 static char *__kmp_registration_str = NULL;
6376 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6377 
6378 static inline char *__kmp_reg_status_name() {
6379   /* On RHEL 3u5 if linked statically, getpid() returns different values in
6380      each thread. If registration and unregistration go in different threads
6381      (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env
     env var cannot be found, because the name will contain a different pid. */
6383   return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
} // __kmp_reg_status_name
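// (Illustrative only: a process with pid 12345 would use the variable name
// "__KMP_REGISTERED_LIB_12345".)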
6385 
6386 void __kmp_register_library_startup(void) {
6387 
6388   char *name = __kmp_reg_status_name(); // Name of the environment variable.
6389   int done = 0;
6390   union {
6391     double dtime;
6392     long ltime;
6393   } time;
6394 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6395   __kmp_initialize_system_tick();
6396 #endif
6397   __kmp_read_system_time(&time.dtime);
6398   __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6399   __kmp_registration_str =
6400       __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6401                        __kmp_registration_flag, KMP_LIBRARY_FILE);
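  // The registration string encodes the address of the flag, the flag value
  // (the 0xCAFE0000 prefix combined with the low 16 bits of the current time
  // value), and the library file name, e.g. something like
  // "0x7f1234567890-cafe1234-libomp.so" (illustrative values only).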
6402 
6403   KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6404                 __kmp_registration_str));
6405 
6406   while (!done) {
6407 
6408     char *value = NULL; // Actual value of the environment variable.
6409 
6410 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6411     char *shm_name = __kmp_str_format("/%s", name);
6412     int shm_preexist = 0;
6413     char *data1;
6414     int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6415     if ((fd1 == -1) && (errno == EEXIST)) {
6416       // file didn't open because it already exists.
6417       // try opening existing file
6418       fd1 = shm_open(shm_name, O_RDWR, 0666);
6419       if (fd1 == -1) { // file didn't open
6420         // error out here
6421         __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0),
6422                     __kmp_msg_null);
6423       } else {
6424         // able to open existing file
6425         shm_preexist = 1;
6426       }
6427     } else if (fd1 == -1) { // SHM didn't open; it was due to error other than
6428       // already exists.
6429       // error out here.
6430       __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM2"), KMP_ERR(errno),
6431                   __kmp_msg_null);
6432     }
6433     if (shm_preexist == 0) {
6434       // we created SHM now set size
6435       if (ftruncate(fd1, SHM_SIZE) == -1) {
        // error occurred while setting the size
6437         __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"),
6438                     KMP_ERR(errno), __kmp_msg_null);
6439       }
6440     }
6441     data1 =
6442         (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
6443     if (data1 == MAP_FAILED) {
6444       // failed to map shared memory
6445       __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno),
6446                   __kmp_msg_null);
6447     }
6448     if (shm_preexist == 0) { // set data to SHM, set value
6449       KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6450     }
6451     // Read value from either what we just wrote or existing file.
6452     value = __kmp_str_format("%s", data1); // read value from SHM
6453     munmap(data1, SHM_SIZE);
6454     close(fd1);
6455 #else // Windows and unix with static library
    // Set the environment variable, but do not overwrite it if it already exists.
6457     __kmp_env_set(name, __kmp_registration_str, 0);
6458     // read value to see if it got set
6459     value = __kmp_env_get(name);
6460 #endif
6461 
6462     if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6463       done = 1; // Ok, environment variable set successfully, exit the loop.
6464     } else {
6465       // Oops. Write failed. Another copy of OpenMP RTL is in memory.
      // Check whether it is alive or dead.
6467       int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6468       char *tail = value;
6469       char *flag_addr_str = NULL;
6470       char *flag_val_str = NULL;
6471       char const *file_name = NULL;
6472       __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6473       __kmp_str_split(tail, '-', &flag_val_str, &tail);
6474       file_name = tail;
6475       if (tail != NULL) {
6476         long *flag_addr = 0;
6477         long flag_val = 0;
6478         KMP_SSCANF(flag_addr_str, "%p", RCAST(void**, &flag_addr));
6479         KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6480         if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6481           // First, check whether environment-encoded address is mapped into
6482           // addr space.
6483           // If so, dereference it to see if it still has the right value.
6484           if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6485             neighbor = 1;
6486           } else {
6487             // If not, then we know the other copy of the library is no longer
6488             // running.
6489             neighbor = 2;
6490           }
6491         }
6492       }
6493       switch (neighbor) {
6494       case 0: // Cannot parse environment variable -- neighbor status unknown.
        // Assume it is the incompatible format of a future version of the
        // library, and assume the other library is alive.
6497         // WARN( ... ); // TODO: Issue a warning.
6498         file_name = "unknown library";
6499         KMP_FALLTHROUGH();
      // Attention! Falling through to the next case. That's intentional.
6501       case 1: { // Neighbor is alive.
6502         // Check it is allowed.
6503         char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6504         if (!__kmp_str_match_true(duplicate_ok)) {
6505           // That's not allowed. Issue fatal error.
6506           __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6507                       KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6508         }
6509         KMP_INTERNAL_FREE(duplicate_ok);
6510         __kmp_duplicate_library_ok = 1;
6511         done = 1; // Exit the loop.
6512       } break;
6513       case 2: { // Neighbor is dead.
6514 
6515 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6516         // close shared memory.
6517         shm_unlink(shm_name); // this removes file in /dev/shm
6518 #else
6519         // Clear the variable and try to register library again.
6520         __kmp_env_unset(name);
6521 #endif
6522       } break;
6523       default: { KMP_DEBUG_ASSERT(0); } break;
6524       }
6525     }
6526     KMP_INTERNAL_FREE((void *)value);
6527 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6528     KMP_INTERNAL_FREE((void *)shm_name);
6529 #endif
6530   } // while
6531   KMP_INTERNAL_FREE((void *)name);
6532 
6533 } // func __kmp_register_library_startup
6534 
6535 void __kmp_unregister_library(void) {
6536 
6537   char *name = __kmp_reg_status_name();
6538   char *value = NULL;
6539 
6540 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6541   char *shm_name = __kmp_str_format("/%s", name);
6542   int fd1 = shm_open(shm_name, O_RDONLY, 0666);
6543   if (fd1 == -1) {
6544     // file did not open. return.
6545     return;
6546   }
6547   char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6548   if (data1 != MAP_FAILED) {
6549     value = __kmp_str_format("%s", data1); // read value from SHM
6550     munmap(data1, SHM_SIZE);
6551   }
6552   close(fd1);
6553 #else
6554   value = __kmp_env_get(name);
6555 #endif
6556 
6557   KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6558   KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6559   if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6560 //  Ok, this is our variable. Delete it.
6561 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6562     shm_unlink(shm_name); // this removes file in /dev/shm
6563 #else
6564     __kmp_env_unset(name);
6565 #endif
6566   }
6567 
6568 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6569   KMP_INTERNAL_FREE(shm_name);
6570 #endif
6571 
6572   KMP_INTERNAL_FREE(__kmp_registration_str);
6573   KMP_INTERNAL_FREE(value);
6574   KMP_INTERNAL_FREE(name);
6575 
6576   __kmp_registration_flag = 0;
6577   __kmp_registration_str = NULL;
6578 
6579 } // __kmp_unregister_library
6580 
6581 // End of Library registration stuff.
6582 // -----------------------------------------------------------------------------
6583 
6584 #if KMP_MIC_SUPPORTED
6585 
6586 static void __kmp_check_mic_type() {
6587   kmp_cpuid_t cpuid_state = {0};
6588   kmp_cpuid_t *cs_p = &cpuid_state;
6589   __kmp_x86_cpuid(1, 0, cs_p);
6590   // We don't support mic1 at the moment
6591   if ((cs_p->eax & 0xff0) == 0xB10) {
6592     __kmp_mic_type = mic2;
6593   } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6594     __kmp_mic_type = mic3;
6595   } else {
6596     __kmp_mic_type = non_mic;
6597   }
6598 }
6599 
6600 #endif /* KMP_MIC_SUPPORTED */
6601 
6602 static void __kmp_do_serial_initialize(void) {
6603   int i, gtid;
6604   int size;
6605 
6606   KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6607 
6608   KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6609   KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6610   KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6611   KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6612   KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6613 
6614 #if OMPT_SUPPORT
6615   ompt_pre_init();
6616 #endif
6617 
6618   __kmp_validate_locks();
6619 
6620   /* Initialize internal memory allocator */
6621   __kmp_init_allocator();
6622 
6623   /* Register the library startup via an environment variable and check to see
6624      whether another copy of the library is already registered. */
6625 
6626   __kmp_register_library_startup();
6627 
6628   /* TODO reinitialization of library */
6629   if (TCR_4(__kmp_global.g.g_done)) {
6630     KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6631   }
6632 
6633   __kmp_global.g.g_abort = 0;
6634   TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6635 
6636 /* initialize the locks */
6637 #if KMP_USE_ADAPTIVE_LOCKS
6638 #if KMP_DEBUG_ADAPTIVE_LOCKS
6639   __kmp_init_speculative_stats();
6640 #endif
6641 #endif
6642 #if KMP_STATS_ENABLED
6643   __kmp_stats_init();
6644 #endif
6645   __kmp_init_lock(&__kmp_global_lock);
6646   __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6647   __kmp_init_lock(&__kmp_debug_lock);
6648   __kmp_init_atomic_lock(&__kmp_atomic_lock);
6649   __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6650   __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6651   __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6652   __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6653   __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6654   __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6655   __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6656   __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6657   __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6658   __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6659   __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6660   __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6661   __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6662   __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6663 #if KMP_USE_MONITOR
6664   __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6665 #endif
6666   __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6667 
6668   /* conduct initialization and initial setup of configuration */
6669 
6670   __kmp_runtime_initialize();
6671 
6672 #if KMP_MIC_SUPPORTED
6673   __kmp_check_mic_type();
6674 #endif
6675 
6676 // Some global variable initialization moved here from kmp_env_initialize()
6677 #ifdef KMP_DEBUG
6678   kmp_diag = 0;
6679 #endif
6680   __kmp_abort_delay = 0;
6681 
6682   // From __kmp_init_dflt_team_nth()
6683   /* assume the entire machine will be used */
6684   __kmp_dflt_team_nth_ub = __kmp_xproc;
6685   if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6686     __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6687   }
6688   if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6689     __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6690   }
6691   __kmp_max_nth = __kmp_sys_max_nth;
6692   __kmp_cg_max_nth = __kmp_sys_max_nth;
6693   __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6694   if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6695     __kmp_teams_max_nth = __kmp_sys_max_nth;
6696   }
6697 
6698   // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6699   // part
6700   __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6701 #if KMP_USE_MONITOR
6702   __kmp_monitor_wakeups =
6703       KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6704   __kmp_bt_intervals =
6705       KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6706 #endif
6707   // From "KMP_LIBRARY" part of __kmp_env_initialize()
6708   __kmp_library = library_throughput;
6709   // From KMP_SCHEDULE initialization
6710   __kmp_static = kmp_sch_static_balanced;
6711 // AC: do not use analytical here, because it is non-monotonous
6712 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6713 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6714 // need to repeat assignment
6715 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6716 // bit control and barrier method control parts
6717 #if KMP_FAST_REDUCTION_BARRIER
6718 #define kmp_reduction_barrier_gather_bb ((int)1)
6719 #define kmp_reduction_barrier_release_bb ((int)1)
6720 #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6721 #define kmp_reduction_barrier_release_pat bp_hyper_bar
6722 #endif // KMP_FAST_REDUCTION_BARRIER
6723   for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6724     __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6725     __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6726     __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6727     __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6728 #if KMP_FAST_REDUCTION_BARRIER
6729     if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
6730       // lin_64 ): hyper,1
6731       __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6732       __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6733       __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6734       __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6735     }
6736 #endif // KMP_FAST_REDUCTION_BARRIER
6737   }
6738 #if KMP_FAST_REDUCTION_BARRIER
6739 #undef kmp_reduction_barrier_release_pat
6740 #undef kmp_reduction_barrier_gather_pat
6741 #undef kmp_reduction_barrier_release_bb
6742 #undef kmp_reduction_barrier_gather_bb
6743 #endif // KMP_FAST_REDUCTION_BARRIER
6744 #if KMP_MIC_SUPPORTED
6745   if (__kmp_mic_type == mic2) { // KNC
    // AC: plain=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6747     __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6748     __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6749         1; // forkjoin release
6750     __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6751     __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6752   }
6753 #if KMP_FAST_REDUCTION_BARRIER
6754   if (__kmp_mic_type == mic2) { // KNC
6755     __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6756     __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6757   }
6758 #endif // KMP_FAST_REDUCTION_BARRIER
6759 #endif // KMP_MIC_SUPPORTED
6760 
6761 // From KMP_CHECKS initialization
6762 #ifdef KMP_DEBUG
6763   __kmp_env_checks = TRUE; /* development versions have the extra checks */
6764 #else
6765   __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6766 #endif
6767 
6768   // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6769   __kmp_foreign_tp = TRUE;
6770 
6771   __kmp_global.g.g_dynamic = FALSE;
6772   __kmp_global.g.g_dynamic_mode = dynamic_default;
6773 
6774   __kmp_env_initialize(NULL);
6775 
6776 // Print all messages in message catalog for testing purposes.
6777 #ifdef KMP_DEBUG
6778   char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6779   if (__kmp_str_match_true(val)) {
6780     kmp_str_buf_t buffer;
6781     __kmp_str_buf_init(&buffer);
6782     __kmp_i18n_dump_catalog(&buffer);
6783     __kmp_printf("%s", buffer.str);
6784     __kmp_str_buf_free(&buffer);
6785   }
6786   __kmp_env_free(&val);
6787 #endif
6788 
6789   __kmp_threads_capacity =
6790       __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6791   // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6792   __kmp_tp_capacity = __kmp_default_tp_capacity(
6793       __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6794 
6795   // If the library is shut down properly, both pools must be NULL. Just in
6796   // case, set them to NULL -- some memory may leak, but subsequent code will
6797   // work even if pools are not freed.
6798   KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6799   KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6800   KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6801   __kmp_thread_pool = NULL;
6802   __kmp_thread_pool_insert_pt = NULL;
6803   __kmp_team_pool = NULL;
6804 
6805   /* Allocate all of the variable sized records */
6806   /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6807    * expandable */
6808   /* Since allocation is cache-aligned, just add extra padding at the end */
6809   size =
6810       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6811       CACHE_LINE;
6812   __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6813   __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6814                                sizeof(kmp_info_t *) * __kmp_threads_capacity);
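  // Layout of that single allocation (illustrative):
  //   [ kmp_info_t* x capacity | kmp_root_t* x capacity | CACHE_LINE pad ]
  //     ^ __kmp_threads          ^ __kmp_root
  // so freeing __kmp_threads later releases both arrays at once.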
6815 
6816   /* init thread counts */
6817   KMP_DEBUG_ASSERT(__kmp_all_nth ==
6818                    0); // Asserts fail if the library is reinitializing and
6819   KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
6820   __kmp_all_nth = 0;
6821   __kmp_nth = 0;
6822 
6823   /* setup the uber master thread and hierarchy */
6824   gtid = __kmp_register_root(TRUE);
6825   KA_TRACE(10, ("__kmp_do_serial_initialize  T#%d\n", gtid));
6826   KMP_ASSERT(KMP_UBER_GTID(gtid));
6827   KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6828 
6829   KMP_MB(); /* Flush all pending memory write invalidates.  */
6830 
6831   __kmp_common_initialize();
6832 
6833 #if KMP_OS_UNIX
6834   /* invoke the child fork handler */
6835   __kmp_register_atfork();
6836 #endif
6837 
6838 #if !KMP_DYNAMIC_LIB
6839   {
6840     /* Invoke the exit handler when the program finishes, only for static
6841        library. For dynamic library, we already have _fini and DllMain. */
6842     int rc = atexit(__kmp_internal_end_atexit);
6843     if (rc != 0) {
6844       __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6845                   __kmp_msg_null);
6846     }
6847   }
6848 #endif
6849 
6850 #if KMP_HANDLE_SIGNALS
6851 #if KMP_OS_UNIX
  /* NOTE: make sure that this is called before the user installs their own
     signal handlers so that the user handlers are called first. This way they
     can return false, not call our handler, avoid terminating the library, and
     continue execution where they left off. */
6856   __kmp_install_signals(FALSE);
6857 #endif /* KMP_OS_UNIX */
6858 #if KMP_OS_WINDOWS
6859   __kmp_install_signals(TRUE);
6860 #endif /* KMP_OS_WINDOWS */
6861 #endif
6862 
6863   /* we have finished the serial initialization */
6864   __kmp_init_counter++;
6865 
6866   __kmp_init_serial = TRUE;
6867 
6868   if (__kmp_settings) {
6869     __kmp_env_print();
6870   }
6871 
6872   if (__kmp_display_env || __kmp_display_env_verbose) {
6873     __kmp_env_print_2();
6874   }
6875 
6876 #if OMPT_SUPPORT
6877   ompt_post_init();
6878 #endif
6879 
6880   KMP_MB();
6881 
6882   KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
6883 }
6884 
6885 void __kmp_serial_initialize(void) {
6886   if (__kmp_init_serial) {
6887     return;
6888   }
6889   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6890   if (__kmp_init_serial) {
6891     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6892     return;
6893   }
6894   __kmp_do_serial_initialize();
6895   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6896 }
6897 
6898 static void __kmp_do_middle_initialize(void) {
6899   int i, j;
6900   int prev_dflt_team_nth;
6901 
6902   if (!__kmp_init_serial) {
6903     __kmp_do_serial_initialize();
6904   }
6905 
  KA_TRACE(10, ("__kmp_do_middle_initialize: enter\n"));
6907 
6908   // Save the previous value for the __kmp_dflt_team_nth so that
6909   // we can avoid some reinitialization if it hasn't changed.
6910   prev_dflt_team_nth = __kmp_dflt_team_nth;
6911 
6912 #if KMP_AFFINITY_SUPPORTED
6913   // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6914   // number of cores on the machine.
6915   __kmp_affinity_initialize();
6916 
6917   // Run through the __kmp_threads array and set the affinity mask
6918   // for each root thread that is currently registered with the RTL.
6919   for (i = 0; i < __kmp_threads_capacity; i++) {
6920     if (TCR_PTR(__kmp_threads[i]) != NULL) {
6921       __kmp_affinity_set_init_mask(i, TRUE);
6922     }
6923   }
6924 #endif /* KMP_AFFINITY_SUPPORTED */
6925 
6926   KMP_ASSERT(__kmp_xproc > 0);
6927   if (__kmp_avail_proc == 0) {
6928     __kmp_avail_proc = __kmp_xproc;
6929   }
6930 
6931   // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
6932   // correct them now
6933   j = 0;
6934   while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
6935     __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
6936         __kmp_avail_proc;
6937     j++;
6938   }
6939 
6940   if (__kmp_dflt_team_nth == 0) {
6941 #ifdef KMP_DFLT_NTH_CORES
6942     // Default #threads = #cores
6943     __kmp_dflt_team_nth = __kmp_ncores;
6944     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6945                   "__kmp_ncores (%d)\n",
6946                   __kmp_dflt_team_nth));
6947 #else
6948     // Default #threads = #available OS procs
6949     __kmp_dflt_team_nth = __kmp_avail_proc;
6950     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6951                   "__kmp_avail_proc(%d)\n",
6952                   __kmp_dflt_team_nth));
6953 #endif /* KMP_DFLT_NTH_CORES */
6954   }
6955 
6956   if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
6957     __kmp_dflt_team_nth = KMP_MIN_NTH;
6958   }
6959   if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
6960     __kmp_dflt_team_nth = __kmp_sys_max_nth;
6961   }
6962 
6963   // There's no harm in continuing if the following check fails,
6964   // but it indicates an error in the previous logic.
6965   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
6966 
6967   if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
6968     // Run through the __kmp_threads array and set the num threads icv for each
6969     // root thread that is currently registered with the RTL (which has not
6970     // already explicitly set its nthreads-var with a call to
6971     // omp_set_num_threads()).
6972     for (i = 0; i < __kmp_threads_capacity; i++) {
6973       kmp_info_t *thread = __kmp_threads[i];
6974       if (thread == NULL)
6975         continue;
6976       if (thread->th.th_current_task->td_icvs.nproc != 0)
6977         continue;
6978 
6979       set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
6980     }
6981   }
6982   KA_TRACE(
6983       20,
6984       ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
6985        __kmp_dflt_team_nth));
6986 
6987 #ifdef KMP_ADJUST_BLOCKTIME
6988   /* Adjust blocktime to zero if necessary  now that __kmp_avail_proc is set */
6989   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6990     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6991     if (__kmp_nth > __kmp_avail_proc) {
6992       __kmp_zero_bt = TRUE;
6993     }
6994   }
6995 #endif /* KMP_ADJUST_BLOCKTIME */
6996 
6997   /* we have finished middle initialization */
6998   TCW_SYNC_4(__kmp_init_middle, TRUE);
6999 
7000   KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7001 }
7002 
7003 void __kmp_middle_initialize(void) {
7004   if (__kmp_init_middle) {
7005     return;
7006   }
7007   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7008   if (__kmp_init_middle) {
7009     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7010     return;
7011   }
7012   __kmp_do_middle_initialize();
7013   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7014 }
7015 
7016 void __kmp_parallel_initialize(void) {
7017   int gtid = __kmp_entry_gtid(); // this might be a new root
7018 
7019   /* synchronize parallel initialization (for sibling) */
7020   if (TCR_4(__kmp_init_parallel))
7021     return;
7022   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7023   if (TCR_4(__kmp_init_parallel)) {
7024     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7025     return;
7026   }
7027 
7028   /* TODO reinitialization after we have already shut down */
7029   if (TCR_4(__kmp_global.g.g_done)) {
7030     KA_TRACE(
7031         10,
7032         ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7033     __kmp_infinite_loop();
7034   }
7035 
7036   /* jc: The lock __kmp_initz_lock is already held, so calling
7037      __kmp_serial_initialize would cause a deadlock.  So we call
7038      __kmp_do_serial_initialize directly. */
7039   if (!__kmp_init_middle) {
7040     __kmp_do_middle_initialize();
7041   }
7042   __kmp_resume_if_hard_paused();
7043 
7044   /* begin initialization */
7045   KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7046   KMP_ASSERT(KMP_UBER_GTID(gtid));
7047 
7048 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7049   // Save the FP control regs.
7050   // Worker threads will set theirs to these values at thread startup.
7051   __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7052   __kmp_store_mxcsr(&__kmp_init_mxcsr);
7053   __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7054 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7055 
7056 #if KMP_OS_UNIX
7057 #if KMP_HANDLE_SIGNALS
7058   /*  must be after __kmp_serial_initialize  */
7059   __kmp_install_signals(TRUE);
7060 #endif
7061 #endif
7062 
7063   __kmp_suspend_initialize();
7064 
7065 #if defined(USE_LOAD_BALANCE)
7066   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7067     __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7068   }
7069 #else
7070   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7071     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7072   }
7073 #endif
7074 
7075   if (__kmp_version) {
7076     __kmp_print_version_2();
7077   }
7078 
7079   /* we have finished parallel initialization */
7080   TCW_SYNC_4(__kmp_init_parallel, TRUE);
7081 
7082   KMP_MB();
7083   KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7084 
7085   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7086 }
7087 
7088 /* ------------------------------------------------------------------------ */
7089 
7090 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7091                                    kmp_team_t *team) {
7092   kmp_disp_t *dispatch;
7093 
7094   KMP_MB();
7095 
7096   /* none of the threads have encountered any constructs, yet. */
7097   this_thr->th.th_local.this_construct = 0;
7098 #if KMP_CACHE_MANAGE
7099   KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7100 #endif /* KMP_CACHE_MANAGE */
7101   dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7102   KMP_DEBUG_ASSERT(dispatch);
7103   KMP_DEBUG_ASSERT(team->t.t_dispatch);
7104   // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7105   // this_thr->th.th_info.ds.ds_tid ] );
7106 
7107   dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7108   dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7109   if (__kmp_env_consistency_check)
7110     __kmp_push_parallel(gtid, team->t.t_ident);
7111 
7112   KMP_MB(); /* Flush all pending memory write invalidates.  */
7113 }
7114 
7115 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7116                                   kmp_team_t *team) {
7117   if (__kmp_env_consistency_check)
7118     __kmp_pop_parallel(gtid, team->t.t_ident);
7119 
7120   __kmp_finish_implicit_task(this_thr);
7121 }
7122 
7123 int __kmp_invoke_task_func(int gtid) {
7124   int rc;
7125   int tid = __kmp_tid_from_gtid(gtid);
7126   kmp_info_t *this_thr = __kmp_threads[gtid];
7127   kmp_team_t *team = this_thr->th.th_team;
7128 
7129   __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7130 #if USE_ITT_BUILD
7131   if (__itt_stack_caller_create_ptr) {
7132     __kmp_itt_stack_callee_enter(
7133         (__itt_caller)
7134             team->t.t_stack_id); // inform ittnotify about entering user's code
7135   }
7136 #endif /* USE_ITT_BUILD */
7137 #if INCLUDE_SSC_MARKS
7138   SSC_MARK_INVOKING();
7139 #endif
7140 
7141 #if OMPT_SUPPORT
7142   void *dummy;
7143   void **exit_frame_p;
7144   ompt_data_t *my_task_data;
7145   ompt_data_t *my_parallel_data;
7146   int ompt_team_size;
7147 
7148   if (ompt_enabled.enabled) {
7149     exit_frame_p = &(
7150         team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_frame.ptr);
7151   } else {
7152     exit_frame_p = &dummy;
7153   }
7154 
7155   my_task_data =
7156       &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7157   my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7158   if (ompt_enabled.ompt_callback_implicit_task) {
7159     ompt_team_size = team->t.t_nproc;
7160     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7161         ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7162         __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7163     OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7164   }
7165 #endif
7166 
7167 #if KMP_STATS_ENABLED
7168   stats_state_e previous_state = KMP_GET_THREAD_STATE();
7169   if (previous_state == stats_state_e::TEAMS_REGION) {
7170     KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7171   } else {
7172     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7173   }
7174   KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7175 #endif
7176 
7177   rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7178                               tid, (int)team->t.t_argc, (void **)team->t.t_argv
7179 #if OMPT_SUPPORT
7180                               ,
7181                               exit_frame_p
7182 #endif
7183                               );
7184 #if OMPT_SUPPORT
7185   *exit_frame_p = NULL;
  this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7187 #endif
7188 
7189 #if KMP_STATS_ENABLED
7190   if (previous_state == stats_state_e::TEAMS_REGION) {
7191     KMP_SET_THREAD_STATE(previous_state);
7192   }
7193   KMP_POP_PARTITIONED_TIMER();
7194 #endif
7195 
7196 #if USE_ITT_BUILD
7197   if (__itt_stack_caller_create_ptr) {
7198     __kmp_itt_stack_callee_leave(
7199         (__itt_caller)
7200             team->t.t_stack_id); // inform ittnotify about leaving user's code
7201   }
7202 #endif /* USE_ITT_BUILD */
7203   __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7204 
7205   return rc;
7206 }
7207 
7208 void __kmp_teams_master(int gtid) {
7209   // This routine is called by all master threads in teams construct
7210   kmp_info_t *thr = __kmp_threads[gtid];
7211   kmp_team_t *team = thr->th.th_team;
7212   ident_t *loc = team->t.t_ident;
7213   thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7214   KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7215   KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7216   KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7217                 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7218 
7219   // This thread is a new CG root.  Set up the proper variables.
7220   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7221   tmp->cg_root = thr; // Make thr the CG root
7222   // Init to thread limit that was stored when league masters were forked
7223   tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7224   tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7225   KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7226                  " cg_nthreads to 1\n",
7227                  thr, tmp));
7228   tmp->up = thr->th.th_cg_roots;
7229   thr->th.th_cg_roots = tmp;
7230 
// Launch the league of teams now, but do not let workers execute
// (they hang on the fork barrier until the next parallel region)
7233 #if INCLUDE_SSC_MARKS
7234   SSC_MARK_FORKING();
7235 #endif
7236   __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7237                   (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7238                   VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7239 #if INCLUDE_SSC_MARKS
7240   SSC_MARK_JOINING();
7241 #endif
7242   // If the team size was reduced from the limit, set it to the new size
7243   if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7244     thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7245   // AC: last parameter "1" eliminates join barrier which won't work because
7246   // worker threads are in a fork barrier waiting for more parallel regions
7247   __kmp_join_call(loc, gtid
7248 #if OMPT_SUPPORT
7249                   ,
7250                   fork_context_intel
7251 #endif
7252                   ,
7253                   1);
7254 }
7255 
7256 int __kmp_invoke_teams_master(int gtid) {
7257   kmp_info_t *this_thr = __kmp_threads[gtid];
7258   kmp_team_t *team = this_thr->th.th_team;
7259 #if KMP_DEBUG
7260   if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7261     KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7262                      (void *)__kmp_teams_master);
7263 #endif
7264   __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7265 #if OMPT_SUPPORT
7266   int tid = __kmp_tid_from_gtid(gtid);
7267   ompt_data_t *task_data =
7268       &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7269   ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7270   if (ompt_enabled.ompt_callback_implicit_task) {
7271     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7272         ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7273         ompt_task_initial);
7274     OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7275   }
7276 #endif
7277   __kmp_teams_master(gtid);
7278 #if OMPT_SUPPORT
7279   this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7280 #endif
7281   __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7282   return 1;
7283 }
7284 
/* This sets the requested number of threads for the next parallel region
   encountered by this team. Since this should be enclosed in the forkjoin
   critical section, it should avoid race conditions with asymmetrical nested
   parallelism. */
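// Illustrative mapping (compiler-generated call, not defined in this file):
//   #pragma omp parallel num_threads(4)
// typically lowers to a __kmpc_push_num_threads(loc, gtid, 4) call that lands
// here, immediately before the matching __kmpc_fork_call.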
7289 
7290 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7291   kmp_info_t *thr = __kmp_threads[gtid];
7292 
7293   if (num_threads > 0)
7294     thr->th.th_set_nproc = num_threads;
7295 }
7296 
/* This sets the requested number of teams for the teams region and/or
   the number of threads for the next parallel region encountered. */
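// Worked example (illustrative numbers): with __kmp_avail_proc = 64,
// __kmp_dflt_team_nth = 64, thread-limit-var = 64, __kmp_teams_max_nth = 256,
// a request of num_teams = 8 with no thread_limit clause (num_threads == 0)
// defaults num_threads to 64 / 8 = 8; since 8 * 8 <= 256 no further clipping
// happens and th_teams_size ends up as {nteams = 8, nth = 8}.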
7299 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7300                           int num_threads) {
7301   kmp_info_t *thr = __kmp_threads[gtid];
7302   KMP_DEBUG_ASSERT(num_teams >= 0);
7303   KMP_DEBUG_ASSERT(num_threads >= 0);
7304 
7305   if (num_teams == 0)
7306     num_teams = 1; // default number of teams is 1.
  if (num_teams > __kmp_teams_max_nth) { // too many teams requested?
7308     if (!__kmp_reserve_warn) {
7309       __kmp_reserve_warn = 1;
7310       __kmp_msg(kmp_ms_warning,
7311                 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7312                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7313     }
7314     num_teams = __kmp_teams_max_nth;
7315   }
7316   // Set number of teams (number of threads in the outer "parallel" of the
7317   // teams)
7318   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7319 
7320   // Remember the number of threads for inner parallel regions
7321   if (!TCR_4(__kmp_init_middle))
7322     __kmp_middle_initialize(); // get internal globals calculated
7323   KMP_DEBUG_ASSERT(__kmp_avail_proc);
7324   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7325   if (num_threads == 0) {
7326     num_threads = __kmp_avail_proc / num_teams;
7327     // adjust num_threads w/o warning as it is not user setting
7328     // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7329     // no thread_limit clause specified -  do not change thread-limit-var ICV
7330     if (num_threads > __kmp_dflt_team_nth) {
7331       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7332     }
7333     if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7334       num_threads = thr->th.th_current_task->td_icvs.thread_limit;
7335     } // prevent team size to exceed thread-limit-var
7336     if (num_teams * num_threads > __kmp_teams_max_nth) {
7337       num_threads = __kmp_teams_max_nth / num_teams;
7338     }
7339   } else {
7340     // This thread will be the master of the league masters
7341     // Store new thread limit; old limit is saved in th_cg_roots list
7342     thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7343     // num_threads = min(num_threads, nthreads-var)
7344     if (num_threads > __kmp_dflt_team_nth) {
7345       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7346     }
7347     if (num_teams * num_threads > __kmp_teams_max_nth) {
7348       int new_threads = __kmp_teams_max_nth / num_teams;
7349       if (!__kmp_reserve_warn) { // user asked for too many threads
7350         __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7351         __kmp_msg(kmp_ms_warning,
7352                   KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7353                   KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7354       }
7355       num_threads = new_threads;
7356     }
7357   }
7358   thr->th.th_teams_size.nth = num_threads;
7359 }
7360 
7361 // Set the proc_bind var to use in the following parallel region.
7362 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7363   kmp_info_t *thr = __kmp_threads[gtid];
7364   thr->th.th_set_proc_bind = proc_bind;
7365 }
7366 
7367 /* Launch the worker threads into the microtask. */
7368 
7369 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7370   kmp_info_t *this_thr = __kmp_threads[gtid];
7371 
7372 #ifdef KMP_DEBUG
7373   int f;
7374 #endif /* KMP_DEBUG */
7375 
7376   KMP_DEBUG_ASSERT(team);
7377   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7378   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7379   KMP_MB(); /* Flush all pending memory write invalidates.  */
7380 
7381   team->t.t_construct = 0; /* no single directives seen yet */
7382   team->t.t_ordered.dt.t_value =
7383       0; /* thread 0 enters the ordered section first */
7384 
7385   /* Reset the identifiers on the dispatch buffer */
7386   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7387   if (team->t.t_max_nproc > 1) {
7388     int i;
7389     for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7390       team->t.t_disp_buffer[i].buffer_index = i;
7391       team->t.t_disp_buffer[i].doacross_buf_idx = i;
7392     }
7393   } else {
7394     team->t.t_disp_buffer[0].buffer_index = 0;
7395     team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7396   }
7397 
7398   KMP_MB(); /* Flush all pending memory write invalidates.  */
7399   KMP_ASSERT(this_thr->th.th_team == team);
7400 
7401 #ifdef KMP_DEBUG
7402   for (f = 0; f < team->t.t_nproc; f++) {
7403     KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7404                      team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7405   }
7406 #endif /* KMP_DEBUG */
7407 
7408   /* release the worker threads so they may begin working */
7409   __kmp_fork_barrier(gtid, 0);
7410 }
7411 
7412 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7413   kmp_info_t *this_thr = __kmp_threads[gtid];
7414 
7415   KMP_DEBUG_ASSERT(team);
7416   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7417   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7418   KMP_MB(); /* Flush all pending memory write invalidates.  */
7419 
7420 /* Join barrier after fork */
7421 
7422 #ifdef KMP_DEBUG
7423   if (__kmp_threads[gtid] &&
7424       __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7425     __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7426                  __kmp_threads[gtid]);
7427     __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7428                  "team->t.t_nproc=%d\n",
7429                  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7430                  team->t.t_nproc);
7431     __kmp_print_structure();
7432   }
7433   KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7434                    __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7435 #endif /* KMP_DEBUG */
7436 
7437   __kmp_join_barrier(gtid); /* wait for everyone */
7438 #if OMPT_SUPPORT
7439   if (ompt_enabled.enabled &&
7440       this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
7441     int ds_tid = this_thr->th.th_info.ds.ds_tid;
7442     ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7443     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
7444 #if OMPT_OPTIONAL
7445     void *codeptr = NULL;
7446     if (KMP_MASTER_TID(ds_tid) &&
7447         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7448          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7449       codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7450 
7451     if (ompt_enabled.ompt_callback_sync_region_wait) {
7452       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7453           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7454           codeptr);
7455     }
7456     if (ompt_enabled.ompt_callback_sync_region) {
7457       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7458           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7459           codeptr);
7460     }
7461 #endif
7462     if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_end, NULL, task_data, 0, ds_tid,
          ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7465     }
7466   }
7467 #endif
7468 
7469   KMP_MB(); /* Flush all pending memory write invalidates.  */
7470   KMP_ASSERT(this_thr->th.th_team == team);
7471 }
7472 
7473 /* ------------------------------------------------------------------------ */
7474 
7475 #ifdef USE_LOAD_BALANCE
7476 
7477 // Return the worker threads actively spinning in the hot team, if we
7478 // are at the outermost level of parallelism.  Otherwise, return 0.
7479 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7480   int i;
7481   int retval;
7482   kmp_team_t *hot_team;
7483 
7484   if (root->r.r_active) {
7485     return 0;
7486   }
7487   hot_team = root->r.r_hot_team;
7488   if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7489     return hot_team->t.t_nproc - 1; // Don't count master thread
7490   }
7491 
7492   // Skip the master thread - it is accounted for elsewhere.
7493   retval = 0;
7494   for (i = 1; i < hot_team->t.t_nproc; i++) {
7495     if (hot_team->t.t_threads[i]->th.th_active) {
7496       retval++;
7497     }
7498   }
7499   return retval;
7500 }
7501 
7502 // Perform an automatic adjustment to the number of
7503 // threads used by the next parallel region.
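// Worked example (illustrative numbers): with __kmp_avail_proc = 16, three
// pool threads still counted as active, four active hot-team workers and the
// calling master, team_curr_active = 3 + 4 + 1 = 8. If
// __kmp_get_load_balance() reports a system load of 10 running threads, the
// new team size is 16 - 10 + 8 = 14, then clipped to the requested set_nproc
// and raised to at least KMP_MIN_NTH.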
7504 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7505   int retval;
7506   int pool_active;
7507   int hot_team_active;
7508   int team_curr_active;
7509   int system_active;
7510 
7511   KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7512                 set_nproc));
7513   KMP_DEBUG_ASSERT(root);
7514   KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7515                        ->th.th_current_task->td_icvs.dynamic == TRUE);
7516   KMP_DEBUG_ASSERT(set_nproc > 1);
7517 
7518   if (set_nproc == 1) {
7519     KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7520     return 1;
7521   }
7522 
7523   // Threads that are active in the thread pool, active in the hot team for this
7524   // particular root (if we are at the outer par level), and the currently
7525   // executing thread (to become the master) are available to add to the new
7526   // team, but are currently contributing to the system load, and must be
7527   // accounted for.
7528   pool_active = __kmp_thread_pool_active_nth;
7529   hot_team_active = __kmp_active_hot_team_nproc(root);
7530   team_curr_active = pool_active + hot_team_active + 1;
7531 
7532   // Check the system load.
7533   system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7534   KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7535                 "hot team active = %d\n",
7536                 system_active, pool_active, hot_team_active));
7537 
7538   if (system_active < 0) {
7539     // There was an error reading the necessary info from /proc, so use the
7540     // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7541     // = dynamic_thread_limit, we shouldn't wind up getting back here.
7542     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7543     KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7544 
7545     // Make this call behave like the thread limit algorithm.
7546     retval = __kmp_avail_proc - __kmp_nth +
7547              (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7548     if (retval > set_nproc) {
7549       retval = set_nproc;
7550     }
7551     if (retval < KMP_MIN_NTH) {
7552       retval = KMP_MIN_NTH;
7553     }
7554 
7555     KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7556                   retval));
7557     return retval;
7558   }
7559 
  // There is a slight delay in the load balance algorithm in detecting new
  // running procs. The real system load at this instant should be at least as
  // large as the number of active OMP threads available to add to the team.
7563   if (system_active < team_curr_active) {
7564     system_active = team_curr_active;
7565   }
7566   retval = __kmp_avail_proc - system_active + team_curr_active;
7567   if (retval > set_nproc) {
7568     retval = set_nproc;
7569   }
7570   if (retval < KMP_MIN_NTH) {
7571     retval = KMP_MIN_NTH;
7572   }
7573 
7574   KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7575   return retval;
7576 } // __kmp_load_balance_nproc()
7577 
7578 #endif /* USE_LOAD_BALANCE */
7579 
7580 /* ------------------------------------------------------------------------ */
7581 
7582 /* NOTE: this is called with the __kmp_init_lock held */
7583 void __kmp_cleanup(void) {
7584   int f;
7585 
7586   KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7587 
7588   if (TCR_4(__kmp_init_parallel)) {
7589 #if KMP_HANDLE_SIGNALS
7590     __kmp_remove_signals();
7591 #endif
7592     TCW_4(__kmp_init_parallel, FALSE);
7593   }
7594 
7595   if (TCR_4(__kmp_init_middle)) {
7596 #if KMP_AFFINITY_SUPPORTED
7597     __kmp_affinity_uninitialize();
7598 #endif /* KMP_AFFINITY_SUPPORTED */
7599     __kmp_cleanup_hierarchy();
7600     TCW_4(__kmp_init_middle, FALSE);
7601   }
7602 
7603   KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7604 
7605   if (__kmp_init_serial) {
7606     __kmp_runtime_destroy();
7607     __kmp_init_serial = FALSE;
7608   }
7609 
7610   __kmp_cleanup_threadprivate_caches();
7611 
7612   for (f = 0; f < __kmp_threads_capacity; f++) {
7613     if (__kmp_root[f] != NULL) {
7614       __kmp_free(__kmp_root[f]);
7615       __kmp_root[f] = NULL;
7616     }
7617   }
7618   __kmp_free(__kmp_threads);
7619   // __kmp_threads and __kmp_root were allocated at once, as single block, so
7620   // there is no need in freeing __kmp_root.
7621   __kmp_threads = NULL;
7622   __kmp_root = NULL;
7623   __kmp_threads_capacity = 0;
7624 
7625 #if KMP_USE_DYNAMIC_LOCK
7626   __kmp_cleanup_indirect_user_locks();
7627 #else
7628   __kmp_cleanup_user_locks();
7629 #endif
7630 
7631 #if KMP_AFFINITY_SUPPORTED
7632   KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7633   __kmp_cpuinfo_file = NULL;
7634 #endif /* KMP_AFFINITY_SUPPORTED */
7635 
7636 #if KMP_USE_ADAPTIVE_LOCKS
7637 #if KMP_DEBUG_ADAPTIVE_LOCKS
7638   __kmp_print_speculative_stats();
7639 #endif
7640 #endif
7641   KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7642   __kmp_nested_nth.nth = NULL;
7643   __kmp_nested_nth.size = 0;
7644   __kmp_nested_nth.used = 0;
7645   KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7646   __kmp_nested_proc_bind.bind_types = NULL;
7647   __kmp_nested_proc_bind.size = 0;
7648   __kmp_nested_proc_bind.used = 0;
7649   if (__kmp_affinity_format) {
7650     KMP_INTERNAL_FREE(__kmp_affinity_format);
7651     __kmp_affinity_format = NULL;
7652   }
7653 
7654   __kmp_i18n_catclose();
7655 
7656 #if KMP_USE_HIER_SCHED
7657   __kmp_hier_scheds.deallocate();
7658 #endif
7659 
7660 #if KMP_STATS_ENABLED
7661   __kmp_stats_fini();
7662 #endif
7663 
7664   KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7665 }
7666 
7667 /* ------------------------------------------------------------------------ */
7668 
7669 int __kmp_ignore_mppbeg(void) {
7670   char *env;
7671 
7672   if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7673     if (__kmp_str_match_false(env))
7674       return FALSE;
7675   }
  // By default __kmpc_begin() is a no-op.
7677   return TRUE;
7678 }
7679 
7680 int __kmp_ignore_mppend(void) {
7681   char *env;
7682 
7683   if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7684     if (__kmp_str_match_false(env))
7685       return FALSE;
7686   }
  // By default __kmpc_end() is a no-op.
7688   return TRUE;
7689 }
7690 
7691 void __kmp_internal_begin(void) {
7692   int gtid;
7693   kmp_root_t *root;
7694 
7695   /* this is a very important step as it will register new sibling threads
7696      and assign these new uber threads a new gtid */
7697   gtid = __kmp_entry_gtid();
7698   root = __kmp_threads[gtid]->th.th_root;
7699   KMP_ASSERT(KMP_UBER_GTID(gtid));
7700 
7701   if (root->r.r_begin)
7702     return;
7703   __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7704   if (root->r.r_begin) {
7705     __kmp_release_lock(&root->r.r_begin_lock, gtid);
7706     return;
7707   }
7708 
7709   root->r.r_begin = TRUE;
7710 
7711   __kmp_release_lock(&root->r.r_begin_lock, gtid);
7712 }
7713 
7714 /* ------------------------------------------------------------------------ */
7715 
7716 void __kmp_user_set_library(enum library_type arg) {
7717   int gtid;
7718   kmp_root_t *root;
7719   kmp_info_t *thread;
7720 
7721   /* first, make sure we are initialized so we can get our gtid */
7722 
7723   gtid = __kmp_entry_gtid();
7724   thread = __kmp_threads[gtid];
7725 
7726   root = thread->th.th_root;
7727 
7728   KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7729                 library_serial));
7730   if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7731                                   thread */
7732     KMP_WARNING(SetLibraryIncorrectCall);
7733     return;
7734   }
7735 
7736   switch (arg) {
7737   case library_serial:
7738     thread->th.th_set_nproc = 0;
7739     set__nproc(thread, 1);
7740     break;
7741   case library_turnaround:
7742     thread->th.th_set_nproc = 0;
7743     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7744                                            : __kmp_dflt_team_nth_ub);
7745     break;
7746   case library_throughput:
7747     thread->th.th_set_nproc = 0;
7748     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7749                                            : __kmp_dflt_team_nth_ub);
7750     break;
7751   default:
7752     KMP_FATAL(UnknownLibraryType, arg);
7753   }
7754 
7755   __kmp_aux_set_library(arg);
7756 }
7757 
7758 void __kmp_aux_set_stacksize(size_t arg) {
7759   if (!__kmp_init_serial)
7760     __kmp_serial_initialize();
7761 
7762 #if KMP_OS_DARWIN
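  // Round the request up to a 0x1000-byte (4 KiB) boundary. Illustrative
  // example: a request of 0x12345 bytes becomes 0x13000; the overflow check
  // below only skips the final += when the addition would wrap to zero.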
7763   if (arg & (0x1000 - 1)) {
7764     arg &= ~(0x1000 - 1);
7765     if (arg + 0x1000) /* check for overflow if we round up */
7766       arg += 0x1000;
7767   }
7768 #endif
7769   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7770 
7771   /* only change the default stacksize before the first parallel region */
7772   if (!TCR_4(__kmp_init_parallel)) {
7773     size_t value = arg; /* argument is in bytes */
7774 
7775     if (value < __kmp_sys_min_stksize)
7776       value = __kmp_sys_min_stksize;
7777     else if (value > KMP_MAX_STKSIZE)
7778       value = KMP_MAX_STKSIZE;
7779 
7780     __kmp_stksize = value;
7781 
7782     __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
7783   }
7784 
7785   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7786 }
7787 
7788 /* set the behaviour of the runtime library */
7789 /* TODO this can cause some odd behaviour with sibling parallelism... */
7790 void __kmp_aux_set_library(enum library_type arg) {
7791   __kmp_library = arg;
7792 
7793   switch (__kmp_library) {
7794   case library_serial: {
7795     KMP_INFORM(LibraryIsSerial);
7796   } break;
7797   case library_turnaround:
7798     if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
7799       __kmp_use_yield = 2; // only yield when oversubscribed
7800     break;
7801   case library_throughput:
7802     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
7803       __kmp_dflt_blocktime = 200;
7804     break;
7805   default:
7806     KMP_FATAL(UnknownLibraryType, arg);
7807   }
7808 }
7809 
/* Get team information common to all teams-construct APIs */
// Returns NULL if not in a teams construct
7812 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
7813   kmp_info_t *thr = __kmp_entry_thread();
7814   teams_serialized = 0;
7815   if (thr->th.th_teams_microtask) {
7816     kmp_team_t *team = thr->th.th_team;
7817     int tlevel = thr->th.th_teams_level; // the level of the teams construct
7818     int ii = team->t.t_level;
7819     teams_serialized = team->t.t_serialized;
7820     int level = tlevel + 1;
7821     KMP_DEBUG_ASSERT(ii >= tlevel);
7822     while (ii > level) {
7823       for (teams_serialized = team->t.t_serialized;
7824            (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
7825       }
7826       if (team->t.t_serialized && (!teams_serialized)) {
7827         team = team->t.t_parent;
7828         continue;
7829       }
7830       if (ii > level) {
7831         team = team->t.t_parent;
7832         ii--;
7833       }
7834     }
7835     return team;
7836   }
7837   return NULL;
7838 }
7839 
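// These two helpers provide the values reported by omp_get_team_num() and
// omp_get_num_teams(); when there is no enclosing teams construct, or the
// teams region is serialized, they report team 0 of 1.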
7840 int __kmp_aux_get_team_num() {
7841   int serialized;
7842   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7843   if (team) {
7844     if (serialized > 1) {
7845       return 0; // teams region is serialized ( 1 team of 1 thread ).
7846     } else {
7847       return team->t.t_master_tid;
7848     }
7849   }
7850   return 0;
7851 }
7852 
7853 int __kmp_aux_get_num_teams() {
7854   int serialized;
7855   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7856   if (team) {
7857     if (serialized > 1) {
7858       return 1;
7859     } else {
7860       return team->t.t_parent->t.t_nproc;
7861     }
7862   }
7863   return 1;
7864 }
7865 
7866 /* ------------------------------------------------------------------------ */
7867 
7868 /*
7869  * Affinity Format Parser
7870  *
7871  * Field is in form of: %[[[0].]size]type
7872  * % and type are required (%% means print a literal '%')
7873  * type is either single char or long name surrounded by {},
7874  * e.g., N or {num_threads}
7875  * 0 => leading zeros
7876  * . => right justified when size is specified
7877  * by default output is left justified
7878  * size is the *minimum* field length
7879  * All other characters are printed as is
7880  *
7881  * Available field types:
 * t {team_num}          - omp_get_team_num()
 * T {num_teams}         - omp_get_num_teams()
 * L {nesting_level}     - omp_get_level()
 * n {thread_num}        - omp_get_thread_num()
 * N {num_threads}       - omp_get_num_threads()
 * a {ancestor_tnum}     - omp_get_ancestor_thread_num(omp_get_level()-1)
 * H {host}              - name of host machine
 * P {process_id}        - process id (integer)
 * i {native_thread_id}  - native thread identifier (integer)
 * A {thread_affinity}   - comma separated list of integers or integer ranges
 *                         (values of affinity mask)
7891  *
7892  * Implementation-specific field types can be added
7893  * If a type is unknown, print "undefined"
7894 */
7895 
7896 // Structure holding the short name, long name, and corresponding data type
7897 // for snprintf.  A table of these will represent the entire valid keyword
7898 // field types.
7899 typedef struct kmp_affinity_format_field_t {
7900   char short_name; // from spec e.g., L -> thread level
7901   const char *long_name; // from spec thread_level -> thread level
7902   char field_format; // data type for snprintf (typically 'd' or 's'
7903   // for integer or string)
7904 } kmp_affinity_format_field_t;
7905 
7906 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
7907 #if KMP_AFFINITY_SUPPORTED
7908     {'A', "thread_affinity", 's'},
7909 #endif
7910     {'t', "team_num", 'd'},
7911     {'T', "num_teams", 'd'},
7912     {'L', "nesting_level", 'd'},
7913     {'n', "thread_num", 'd'},
7914     {'N', "num_threads", 'd'},
7915     {'a', "ancestor_tnum", 'd'},
7916     {'H', "host", 's'},
7917     {'P', "process_id", 'd'},
7918     {'i', "native_thread_id", 'd'}};
7919 
7920 // Return the number of characters it takes to hold field
7921 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
7922                                             const char **ptr,
7923                                             kmp_str_buf_t *field_buffer) {
7924   int rc, format_index, field_value;
7925   const char *width_left, *width_right;
7926   bool pad_zeros, right_justify, parse_long_name, found_valid_name;
7927   static const int FORMAT_SIZE = 20;
7928   char format[FORMAT_SIZE] = {0};
7929   char absolute_short_name = 0;
7930 
7931   KMP_DEBUG_ASSERT(gtid >= 0);
7932   KMP_DEBUG_ASSERT(th);
7933   KMP_DEBUG_ASSERT(**ptr == '%');
7934   KMP_DEBUG_ASSERT(field_buffer);
7935 
7936   __kmp_str_buf_clear(field_buffer);
7937 
7938   // Skip the initial %
7939   (*ptr)++;
7940 
7941   // Check for %% first
7942   if (**ptr == '%') {
7943     __kmp_str_buf_cat(field_buffer, "%", 1);
7944     (*ptr)++; // skip over the second %
7945     return 1;
7946   }
7947 
7948   // Parse field modifiers if they are present
7949   pad_zeros = false;
7950   if (**ptr == '0') {
7951     pad_zeros = true;
7952     (*ptr)++; // skip over 0
7953   }
7954   right_justify = false;
7955   if (**ptr == '.') {
7956     right_justify = true;
7957     (*ptr)++; // skip over .
7958   }
7959   // Parse width of field: [width_left, width_right)
7960   width_left = width_right = NULL;
7961   if (**ptr >= '0' && **ptr <= '9') {
7962     width_left = *ptr;
7963     SKIP_DIGITS(*ptr);
7964     width_right = *ptr;
7965   }
7966 
7967   // Create the format for KMP_SNPRINTF based on flags parsed above
7968   format_index = 0;
7969   format[format_index++] = '%';
7970   if (!right_justify)
7971     format[format_index++] = '-';
7972   if (pad_zeros)
7973     format[format_index++] = '0';
7974   if (width_left && width_right) {
7975     int i = 0;
7976     // Only allow 8 digit number widths.
7977     // This also prevents overflowing format variable
7978     while (i < 8 && width_left < width_right) {
7979       format[format_index++] = *width_left;
7980       width_left++;
7981       i++;
7982     }
7983   }
7984 
7985   // Parse a name (long or short)
7986   // Canonicalize the name into absolute_short_name
7987   found_valid_name = false;
7988   parse_long_name = (**ptr == '{');
7989   if (parse_long_name)
7990     (*ptr)++; // skip initial left brace
7991   for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
7992                              sizeof(__kmp_affinity_format_table[0]);
7993        ++i) {
7994     char short_name = __kmp_affinity_format_table[i].short_name;
7995     const char *long_name = __kmp_affinity_format_table[i].long_name;
7996     char field_format = __kmp_affinity_format_table[i].field_format;
7997     if (parse_long_name) {
7998       int length = KMP_STRLEN(long_name);
7999       if (strncmp(*ptr, long_name, length) == 0) {
8000         found_valid_name = true;
8001         (*ptr) += length; // skip the long name
8002       }
8003     } else if (**ptr == short_name) {
8004       found_valid_name = true;
8005       (*ptr)++; // skip the short name
8006     }
8007     if (found_valid_name) {
8008       format[format_index++] = field_format;
8009       format[format_index++] = '\0';
8010       absolute_short_name = short_name;
8011       break;
8012     }
8013   }
8014   if (parse_long_name) {
8015     if (**ptr != '}') {
8016       absolute_short_name = 0;
8017     } else {
8018       (*ptr)++; // skip over the right brace
8019     }
8020   }
8021 
8022   // Attempt to fill the buffer with the requested
8023   // value using snprintf within __kmp_str_buf_print()
8024   switch (absolute_short_name) {
8025   case 't':
8026     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8027     break;
8028   case 'T':
8029     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8030     break;
8031   case 'L':
8032     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8033     break;
8034   case 'n':
8035     rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8036     break;
8037   case 'H': {
8038     static const int BUFFER_SIZE = 256;
8039     char buf[BUFFER_SIZE];
8040     __kmp_expand_host_name(buf, BUFFER_SIZE);
8041     rc = __kmp_str_buf_print(field_buffer, format, buf);
8042   } break;
8043   case 'P':
8044     rc = __kmp_str_buf_print(field_buffer, format, getpid());
8045     break;
8046   case 'i':
8047     rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8048     break;
8049   case 'N':
8050     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8051     break;
8052   case 'a':
8053     field_value =
8054         __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8055     rc = __kmp_str_buf_print(field_buffer, format, field_value);
8056     break;
8057 #if KMP_AFFINITY_SUPPORTED
8058   case 'A': {
8059     kmp_str_buf_t buf;
8060     __kmp_str_buf_init(&buf);
8061     __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8062     rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8063     __kmp_str_buf_free(&buf);
8064   } break;
8065 #endif
8066   default:
    // According to the spec, if an implementation does not have info for the
    // field type, then "undefined" is printed.
8069     rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8070     // Skip the field
8071     if (parse_long_name) {
8072       SKIP_TOKEN(*ptr);
8073       if (**ptr == '}')
8074         (*ptr)++;
8075     } else {
8076       (*ptr)++;
8077     }
8078   }
8079 
8080   KMP_ASSERT(format_index <= FORMAT_SIZE);
8081   return rc;
8082 }
8083 
8084 /*
8085  * Return number of characters needed to hold the affinity string
8086  * (not including null byte character)
8087  * The resultant string is printed to buffer, which the caller can then
8088  * handle afterwards
8089 */
8090 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8091                                   kmp_str_buf_t *buffer) {
8092   const char *parse_ptr;
8093   size_t retval;
8094   const kmp_info_t *th;
8095   kmp_str_buf_t field;
8096 
8097   KMP_DEBUG_ASSERT(buffer);
8098   KMP_DEBUG_ASSERT(gtid >= 0);
8099 
8100   __kmp_str_buf_init(&field);
8101   __kmp_str_buf_clear(buffer);
8102 
8103   th = __kmp_threads[gtid];
8104   retval = 0;
8105 
8106   // If format is NULL or zero-length string, then we use
8107   // affinity-format-var ICV
8108   parse_ptr = format;
8109   if (parse_ptr == NULL || *parse_ptr == '\0') {
8110     parse_ptr = __kmp_affinity_format;
8111   }
8112   KMP_DEBUG_ASSERT(parse_ptr);
8113 
8114   while (*parse_ptr != '\0') {
8115     // Parse a field
8116     if (*parse_ptr == '%') {
8117       // Put field in the buffer
8118       int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8119       __kmp_str_buf_catbuf(buffer, &field);
8120       retval += rc;
8121     } else {
8122       // Put literal character in buffer
8123       __kmp_str_buf_cat(buffer, parse_ptr, 1);
8124       retval++;
8125       parse_ptr++;
8126     }
8127   }
8128   __kmp_str_buf_free(&field);
8129   return retval;
8130 }
8131 
8132 // Displays the affinity string to stdout
8133 void __kmp_aux_display_affinity(int gtid, const char *format) {
8134   kmp_str_buf_t buf;
8135   __kmp_str_buf_init(&buf);
8136   __kmp_aux_capture_affinity(gtid, format, &buf);
8137   __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8138   __kmp_str_buf_free(&buf);
8139 }
8140 
8141 /* ------------------------------------------------------------------------ */
8142 
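// Set the blocktime (argument is in milliseconds). The value is clamped to
// [KMP_MIN_BLOCKTIME, KMP_MAX_BLOCKTIME] and stored in both the current
// team's and the serial team's ICVs, so the setting also applies in
// serialized regions.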
8143 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8144   int blocktime = arg; /* argument is in milliseconds */
8145 #if KMP_USE_MONITOR
8146   int bt_intervals;
8147 #endif
8148   int bt_set;
8149 
8150   __kmp_save_internal_controls(thread);
8151 
8152   /* Normalize and set blocktime for the teams */
8153   if (blocktime < KMP_MIN_BLOCKTIME)
8154     blocktime = KMP_MIN_BLOCKTIME;
8155   else if (blocktime > KMP_MAX_BLOCKTIME)
8156     blocktime = KMP_MAX_BLOCKTIME;
8157 
8158   set__blocktime_team(thread->th.th_team, tid, blocktime);
8159   set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8160 
8161 #if KMP_USE_MONITOR
8162   /* Calculate and set blocktime intervals for the teams */
8163   bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8164 
8165   set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8166   set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8167 #endif
8168 
  /* Record that the blocktime has been explicitly set */
8170   bt_set = TRUE;
8171 
8172   set__bt_set_team(thread->th.th_team, tid, bt_set);
8173   set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8174 #if KMP_USE_MONITOR
8175   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8176                 "bt_intervals=%d, monitor_updates=%d\n",
8177                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8178                 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8179                 __kmp_monitor_wakeups));
8180 #else
8181   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8182                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8183                 thread->th.th_team->t.t_id, tid, blocktime));
8184 #endif
8185 }
8186 
8187 void __kmp_aux_set_defaults(char const *str, int len) {
8188   if (!__kmp_init_serial) {
8189     __kmp_serial_initialize();
8190   }
8191   __kmp_env_initialize(str);
8192 
8193   if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8194     __kmp_env_print();
8195   }
8196 } // __kmp_aux_set_defaults
8197 
8198 /* ------------------------------------------------------------------------ */
8199 /* internal fast reduction routines */
8200 
8201 PACKED_REDUCTION_METHOD_T
8202 __kmp_determine_reduction_method(
8203     ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8204     void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8205     kmp_critical_name *lck) {
8206 
  // Default reduction method: critical construct (lck != NULL), as in the
  // current PAROPT.
  // If (reduce_data != NULL && reduce_func != NULL): the tree-reduction
  // method can be selected by the RTL.
  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE: the atomic reduce method
  // can be selected by the RTL.
  // Finally, it's up to the OpenMP RTL to decide which method to select among
  // those generated by PAROPT.
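  // Example outcomes (illustrative): on x86_64 Linux the team-size cutoff is 4
  // (8 on KNC), so a 16-thread team with compiler-generated tree data gets
  // TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER, a 3-thread team with only the
  // atomic flag set gets atomic_reduce_block, and with neither hint the
  // critical-section default stands.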
8215 
8216   PACKED_REDUCTION_METHOD_T retval;
8217 
8218   int team_size;
8219 
8220   KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
8221   KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8222 
8223 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
8224   ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
8225 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8226 
8227   retval = critical_reduce_block;
8228 
  // An alternative way of getting the team size (with one extra dynamic
  // dereference) is slower.
8230   team_size = __kmp_get_team_num_threads(global_tid);
8231   if (team_size == 1) {
8232 
8233     retval = empty_reduce_block;
8234 
8235   } else {
8236 
8237     int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8238 
8239 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 ||                   \
8240     KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64
8241 
8242 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
8243     KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8244 
8245     int teamsize_cutoff = 4;
8246 
8247 #if KMP_MIC_SUPPORTED
8248     if (__kmp_mic_type != non_mic) {
8249       teamsize_cutoff = 8;
8250     }
8251 #endif
8252     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8253     if (tree_available) {
8254       if (team_size <= teamsize_cutoff) {
8255         if (atomic_available) {
8256           retval = atomic_reduce_block;
8257         }
8258       } else {
8259         retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8260       }
8261     } else if (atomic_available) {
8262       retval = atomic_reduce_block;
8263     }
8264 #else
8265 #error "Unknown or unsupported OS"
8266 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8267        // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8268 
8269 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8270 
8271 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD
8272 
8273     // basic tuning
8274 
8275     if (atomic_available) {
8276       if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8277         retval = atomic_reduce_block;
8278       }
8279     } // otherwise: use critical section
8280 
8281 #elif KMP_OS_DARWIN
8282 
8283     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8284     if (atomic_available && (num_vars <= 3)) {
8285       retval = atomic_reduce_block;
8286     } else if (tree_available) {
8287       if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8288           (reduce_size < (2000 * sizeof(kmp_real64)))) {
8289         retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8290       }
8291     } // otherwise: use critical section
8292 
8293 #else
8294 #error "Unknown or unsupported OS"
8295 #endif
8296 
8297 #else
8298 #error "Unknown or unsupported architecture"
8299 #endif
8300   }
8301 
8302   // KMP_FORCE_REDUCTION
8303 
8304   // If the team is serialized (team_size == 1), ignore the forced reduction
8305   // method and stay with the unsynchronized method (empty_reduce_block)
8306   if (__kmp_force_reduction_method != reduction_method_not_defined &&
8307       team_size != 1) {
8308 
8309     PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8310 
8311     int atomic_available, tree_available;
8312 
8313     switch ((forced_retval = __kmp_force_reduction_method)) {
8314     case critical_reduce_block:
8315       KMP_ASSERT(lck); // lck should be != 0
8316       break;
8317 
8318     case atomic_reduce_block:
8319       atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8320       if (!atomic_available) {
8321         KMP_WARNING(RedMethodNotSupported, "atomic");
8322         forced_retval = critical_reduce_block;
8323       }
8324       break;
8325 
8326     case tree_reduce_block:
8327       tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8328       if (!tree_available) {
8329         KMP_WARNING(RedMethodNotSupported, "tree");
8330         forced_retval = critical_reduce_block;
8331       } else {
8332 #if KMP_FAST_REDUCTION_BARRIER
8333         forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8334 #endif
8335       }
8336       break;
8337 
8338     default:
8339       KMP_ASSERT(0); // "unsupported method specified"
8340     }
8341 
8342     retval = forced_retval;
8343   }
8344 
8345   KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8346 
8347 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
8348 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8349 
8350   return (retval);
8351 }
// This function is for testing the set/get/determine reduce method machinery.
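// The packed value keeps the barrier type in its low 8 bits and the reduction
// method above them, so the right shift by 8 below discards the barrier type
// and yields the method as a small integer.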
8353 kmp_int32 __kmp_get_reduce_method(void) {
8354   return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8355 }
8356 
8357 // Soft pause sets up threads to ignore blocktime and just go to sleep.
8358 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
8359 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8360 
8361 // Hard pause shuts down the runtime completely.  Resume happens naturally when
8362 // OpenMP is used subsequently.
8363 void __kmp_hard_pause() {
8364   __kmp_pause_status = kmp_hard_paused;
8365   __kmp_internal_end_thread(-1);
8366 }
8367 
8368 // Soft resume sets __kmp_pause_status, and wakes up all threads.
8369 void __kmp_resume_if_soft_paused() {
8370   if (__kmp_pause_status == kmp_soft_paused) {
8371     __kmp_pause_status = kmp_not_paused;
8372 
8373     for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8374       kmp_info_t *thread = __kmp_threads[gtid];
8375       if (thread) { // Wake it if sleeping
8376         kmp_flag_64 fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
8377         if (fl.is_sleeping())
8378           fl.resume(gtid);
8379         else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8380           __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8381         } else { // thread holds the lock and may sleep soon
8382           do { // until either the thread sleeps, or we can get the lock
8383             if (fl.is_sleeping()) {
8384               fl.resume(gtid);
8385               break;
8386             } else if (__kmp_try_suspend_mx(thread)) {
8387               __kmp_unlock_suspend_mx(thread);
8388               break;
8389             }
8390           } while (1);
8391         }
8392       }
8393     }
8394   }
8395 }
8396 
8397 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
8398 // TODO: add warning messages
8399 int __kmp_pause_resource(kmp_pause_status_t level) {
8400   if (level == kmp_not_paused) { // requesting resume
8401     if (__kmp_pause_status == kmp_not_paused) {
8402       // error message about runtime not being paused, so can't resume
8403       return 1;
8404     } else {
8405       KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
8406                        __kmp_pause_status == kmp_hard_paused);
8407       __kmp_pause_status = kmp_not_paused;
8408       return 0;
8409     }
8410   } else if (level == kmp_soft_paused) { // requesting soft pause
8411     if (__kmp_pause_status != kmp_not_paused) {
8412       // error message about already being paused
8413       return 1;
8414     } else {
8415       __kmp_soft_pause();
8416       return 0;
8417     }
8418   } else if (level == kmp_hard_paused) { // requesting hard pause
8419     if (__kmp_pause_status != kmp_not_paused) {
8420       // error message about already being paused
8421       return 1;
8422     } else {
8423       __kmp_hard_pause();
8424       return 0;
8425     }
8426   } else {
8427     // error message about invalid level
8428     return 1;
8429   }
8430 }
8431 
8433 void __kmp_omp_display_env(int verbose) {
8434   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8435   if (__kmp_init_serial == 0)
8436     __kmp_do_serial_initialize();
8437   __kmp_display_env_impl(!verbose, verbose);
8438   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8439 }
8440