1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #if KMP_USE_HIER_SCHED
28 #include "kmp_dispatch_hier.h"
29 #endif
30 
31 #if OMPT_SUPPORT
32 #include "ompt-specific.h"
33 #endif
34 
35 /* these are temporary issues to be dealt with */
36 #define KMP_USE_PRCTL 0
37 
38 #if KMP_OS_WINDOWS
39 #include <process.h>
40 #endif
41 
42 #include "tsan_annotations.h"
43 
44 #if defined(KMP_GOMP_COMPAT)
45 char const __kmp_version_alt_comp[] =
46     KMP_VERSION_PREFIX "alternative compiler support: yes";
47 #endif /* defined(KMP_GOMP_COMPAT) */
48 
49 char const __kmp_version_omp_api[] =
50     KMP_VERSION_PREFIX "API version: 5.0 (201611)";
51 
52 #ifdef KMP_DEBUG
53 char const __kmp_version_lock[] =
54     KMP_VERSION_PREFIX "lock type: run time selectable";
55 #endif /* KMP_DEBUG */
56 
57 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
58 
59 /* ------------------------------------------------------------------------ */
60 
61 #if KMP_USE_MONITOR
62 kmp_info_t __kmp_monitor;
63 #endif
64 
65 /* Forward declarations */
66 
67 void __kmp_cleanup(void);
68 
69 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
70                                   int gtid);
71 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
72                                   kmp_internal_control_t *new_icvs,
73                                   ident_t *loc);
74 #if KMP_AFFINITY_SUPPORTED
75 static void __kmp_partition_places(kmp_team_t *team,
76                                    int update_master_only = 0);
77 #endif
78 static void __kmp_do_serial_initialize(void);
79 void __kmp_fork_barrier(int gtid, int tid);
80 void __kmp_join_barrier(int gtid);
81 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
82                           kmp_internal_control_t *new_icvs, ident_t *loc);
83 
84 #ifdef USE_LOAD_BALANCE
85 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
86 #endif
87 
88 static int __kmp_expand_threads(int nNeed);
89 #if KMP_OS_WINDOWS
90 static int __kmp_unregister_root_other_thread(int gtid);
91 #endif
92 static void __kmp_unregister_library(void); // called by __kmp_internal_end()
93 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
94 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
95 
96 /* Calculate the identifier of the current thread */
97 /* fast (and somewhat portable) way to get unique identifier of executing
98    thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
99 int __kmp_get_global_thread_id() {
100   int i;
101   kmp_info_t **other_threads;
102   size_t stack_data;
103   char *stack_addr;
104   size_t stack_size;
105   char *stack_base;
106 
107   KA_TRACE(
108       1000,
109       ("*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
110        __kmp_nth, __kmp_all_nth));
111 
  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
     a parallel region, this returns KMP_GTID_DNE to force serial_initialize by
     the caller. We had to handle KMP_GTID_DNE at all call sites, or else
     guarantee __kmp_init_gtid for this to work. */
116 
117   if (!TCR_4(__kmp_init_gtid))
118     return KMP_GTID_DNE;
119 
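  /* Three lookup strategies, fastest first: gtid_mode >= 3 reads a dedicated
     thread-local variable (KMP_TDATA_GTID builds only); gtid_mode >= 2 uses
     the keyed-TLS value via __kmp_gtid_get_specific(); otherwise fall back to
     matching the current stack address against the recorded stack extents of
     the registered threads. */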
120 #ifdef KMP_TDATA_GTID
121   if (TCR_4(__kmp_gtid_mode) >= 3) {
122     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
123     return __kmp_gtid;
124   }
125 #endif
126   if (TCR_4(__kmp_gtid_mode) >= 2) {
127     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
128     return __kmp_gtid_get_specific();
129   }
130   KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
131 
132   stack_addr = (char *)&stack_data;
133   other_threads = __kmp_threads;
134 
135   /* ATT: The code below is a source of potential bugs due to unsynchronized
136      access to __kmp_threads array. For example:
137      1. Current thread loads other_threads[i] to thr and checks it, it is
138         non-NULL.
139      2. Current thread is suspended by OS.
140      3. Another thread unregisters and finishes (debug versions of free()
141         may fill memory with something like 0xEF).
142      4. Current thread is resumed.
143      5. Current thread reads junk from *thr.
144      TODO: Fix it.  --ln  */
145 
146   for (i = 0; i < __kmp_threads_capacity; i++) {
147 
148     kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
149     if (!thr)
150       continue;
151 
152     stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
153     stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
154 
155     /* stack grows down -- search through all of the active threads */
156 
157     if (stack_addr <= stack_base) {
158       size_t stack_diff = stack_base - stack_addr;
159 
160       if (stack_diff <= stack_size) {
161         /* The only way we can be closer than the allocated */
162         /* stack size is if we are running on this thread. */
163         KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
164         return i;
165       }
166     }
167   }
168 
  /* fall back to keyed TLS (get_specific) to try to determine our gtid */
170   KA_TRACE(1000,
171            ("*** __kmp_get_global_thread_id: internal alg. failed to find "
172             "thread, using TLS\n"));
173   i = __kmp_gtid_get_specific();
174 
175   /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */
176 
  /* if we haven't been assigned a gtid, then return that (negative) code */
178   if (i < 0)
179     return i;
180 
181   /* dynamically updated stack window for uber threads to avoid get_specific
182      call */
183   if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
184     KMP_FATAL(StackOverflow, i);
185   }
186 
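  /* Widen the recorded stack window for this uber thread so that it covers the
     current stack address: move the base up if we are above it, otherwise
     extend the recorded size downward. */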
187   stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
188   if (stack_addr > stack_base) {
189     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
190     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
191             other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
192                 stack_base);
193   } else {
194     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
195             stack_base - stack_addr);
196   }
197 
198   /* Reprint stack bounds for ubermaster since they have been refined */
199   if (__kmp_storage_map) {
200     char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
201     char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
202     __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
203                                  other_threads[i]->th.th_info.ds.ds_stacksize,
204                                  "th_%d stack (refinement)", i);
205   }
206   return i;
207 }
208 
209 int __kmp_get_global_thread_id_reg() {
210   int gtid;
211 
212   if (!__kmp_init_serial) {
213     gtid = KMP_GTID_DNE;
214   } else
215 #ifdef KMP_TDATA_GTID
216       if (TCR_4(__kmp_gtid_mode) >= 3) {
217     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
218     gtid = __kmp_gtid;
219   } else
220 #endif
221       if (TCR_4(__kmp_gtid_mode) >= 2) {
222     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
223     gtid = __kmp_gtid_get_specific();
224   } else {
225     KA_TRACE(1000,
226              ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
227     gtid = __kmp_get_global_thread_id();
228   }
229 
230   /* we must be a new uber master sibling thread */
231   if (gtid == KMP_GTID_DNE) {
232     KA_TRACE(10,
233              ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
234               "Registering a new gtid.\n"));
235     __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
236     if (!__kmp_init_serial) {
237       __kmp_do_serial_initialize();
238       gtid = __kmp_gtid_get_specific();
239     } else {
240       gtid = __kmp_register_root(FALSE);
241     }
242     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
243     /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
244   }
245 
246   KMP_DEBUG_ASSERT(gtid >= 0);
247 
248   return gtid;
249 }
250 
251 /* caller must hold forkjoin_lock */
252 void __kmp_check_stack_overlap(kmp_info_t *th) {
253   int f;
254   char *stack_beg = NULL;
255   char *stack_end = NULL;
256   int gtid;
257 
258   KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
259   if (__kmp_storage_map) {
260     stack_end = (char *)th->th.th_info.ds.ds_stackbase;
261     stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
262 
263     gtid = __kmp_gtid_from_thread(th);
264 
265     if (gtid == KMP_GTID_MONITOR) {
266       __kmp_print_storage_map_gtid(
267           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
268           "th_%s stack (%s)", "mon",
269           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
270     } else {
271       __kmp_print_storage_map_gtid(
272           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
273           "th_%d stack (%s)", gtid,
274           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
275     }
276   }
277 
278   /* No point in checking ubermaster threads since they use refinement and
279    * cannot overlap */
280   gtid = __kmp_gtid_from_thread(th);
281   if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
282     KA_TRACE(10,
283              ("__kmp_check_stack_overlap: performing extensive checking\n"));
284     if (stack_beg == NULL) {
285       stack_end = (char *)th->th.th_info.ds.ds_stackbase;
286       stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
287     }
288 
289     for (f = 0; f < __kmp_threads_capacity; f++) {
290       kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
291 
292       if (f_th && f_th != th) {
293         char *other_stack_end =
294             (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
295         char *other_stack_beg =
296             other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
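        /* The stacks overlap if either end of this thread's stack falls
           strictly inside the other thread's [beg, end) range. */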
297         if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
298             (stack_end > other_stack_beg && stack_end < other_stack_end)) {
299 
300           /* Print the other stack values before the abort */
301           if (__kmp_storage_map)
302             __kmp_print_storage_map_gtid(
303                 -1, other_stack_beg, other_stack_end,
304                 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
305                 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
306 
307           __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
308                       __kmp_msg_null);
309         }
310       }
311     }
312   }
313   KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
314 }
315 
316 /* ------------------------------------------------------------------------ */
317 
318 void __kmp_infinite_loop(void) {
319   static int done = FALSE;
320 
321   while (!done) {
322     KMP_YIELD(TRUE);
323   }
324 }
325 
326 #define MAX_MESSAGE 512
327 
328 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
329                                   char const *format, ...) {
330   char buffer[MAX_MESSAGE];
331   va_list ap;
332 
333   va_start(ap, format);
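  // Build a composite format string: the fixed "%p %p <size>" prefix is
  // expanded here, while the caller's own format and its variadic arguments
  // are expanded by __kmp_vprintf below.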
334   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
335                p2, (unsigned long)size, format);
336   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
337   __kmp_vprintf(kmp_err, buffer, ap);
338 #if KMP_PRINT_DATA_PLACEMENT
339   int node;
340   if (gtid >= 0) {
341     if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
342       if (__kmp_storage_map_verbose) {
343         node = __kmp_get_host_node(p1);
344         if (node < 0) /* doesn't work, so don't try this next time */
345           __kmp_storage_map_verbose = FALSE;
346         else {
347           char *last;
348           int lastNode;
349           int localProc = __kmp_get_cpu_from_gtid(gtid);
350 
351           const int page_size = KMP_GET_PAGE_SIZE();
352 
353           p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
354           p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
355           if (localProc >= 0)
356             __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
357                                  localProc >> 1);
358           else
359             __kmp_printf_no_lock("  GTID %d\n", gtid);
360 #if KMP_USE_PRCTL
361           /* The more elaborate format is disabled for now because of the prctl
362            * hanging bug. */
363           do {
364             last = p1;
365             lastNode = node;
366             /* This loop collates adjacent pages with the same host node. */
367             do {
              p1 = (char *)p1 + page_size;
369             } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
370             __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
371                                  lastNode);
372           } while (p1 <= p2);
373 #else
374           __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
375                                (char *)p1 + (page_size - 1),
376                                __kmp_get_host_node(p1));
377           if (p1 < p2) {
378             __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
379                                  (char *)p2 + (page_size - 1),
380                                  __kmp_get_host_node(p2));
381           }
382 #endif
383         }
384       }
385     } else
386       __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
387   }
388 #endif /* KMP_PRINT_DATA_PLACEMENT */
389   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
390 }
391 
392 void __kmp_warn(char const *format, ...) {
393   char buffer[MAX_MESSAGE];
394   va_list ap;
395 
396   if (__kmp_generate_warnings == kmp_warnings_off) {
397     return;
398   }
399 
400   va_start(ap, format);
401 
402   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
403   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
404   __kmp_vprintf(kmp_err, buffer, ap);
405   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
406 
407   va_end(ap);
408 }
409 
410 void __kmp_abort_process() {
411   // Later threads may stall here, but that's ok because abort() will kill them.
412   __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
413 
414   if (__kmp_debug_buf) {
415     __kmp_dump_debug_buffer();
416   }
417 
418   if (KMP_OS_WINDOWS) {
419     // Let other threads know of abnormal termination and prevent deadlock
420     // if abort happened during library initialization or shutdown
421     __kmp_global.g.g_abort = SIGABRT;
422 
    /* On Windows* OS, abort() by default causes a pop-up error box, which
       stalls nightly testing. Unfortunately, we cannot reliably suppress the
       pop-up error boxes. _set_abort_behavior() works well, but this function
       is not available in VS7 (this is not a problem for the DLL, but it is a
       problem for the static OpenMP RTL). SetErrorMode (and so the timelimit
       utility) does not help, at least in some versions of the MS C RTL.

       The following sequence seems to be the only way to simulate abort() and
       avoid the pop-up error box. */
432     raise(SIGABRT);
433     _exit(3); // Just in case, if signal ignored, exit anyway.
434   } else {
435     abort();
436   }
437 
438   __kmp_infinite_loop();
439   __kmp_release_bootstrap_lock(&__kmp_exit_lock);
440 
441 } // __kmp_abort_process
442 
443 void __kmp_abort_thread(void) {
444   // TODO: Eliminate g_abort global variable and this function.
  // In case of abort, just call abort(); it will kill all the threads.
446   __kmp_infinite_loop();
447 } // __kmp_abort_thread
448 
449 /* Print out the storage map for the major kmp_info_t thread data structures
450    that are allocated together. */
451 
452 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
453   __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
454                                gtid);
455 
456   __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
457                                sizeof(kmp_desc_t), "th_%d.th_info", gtid);
458 
459   __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
460                                sizeof(kmp_local_t), "th_%d.th_local", gtid);
461 
462   __kmp_print_storage_map_gtid(
463       gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
464       sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
465 
466   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
467                                &thr->th.th_bar[bs_plain_barrier + 1],
468                                sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
469                                gtid);
470 
471   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
472                                &thr->th.th_bar[bs_forkjoin_barrier + 1],
473                                sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
474                                gtid);
475 
476 #if KMP_FAST_REDUCTION_BARRIER
477   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
478                                &thr->th.th_bar[bs_reduction_barrier + 1],
479                                sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
480                                gtid);
481 #endif // KMP_FAST_REDUCTION_BARRIER
482 }
483 
484 /* Print out the storage map for the major kmp_team_t team data structures
485    that are allocated together. */
486 
487 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
488                                          int team_id, int num_thr) {
489   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
490   __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
491                                header, team_id);
492 
493   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
494                                &team->t.t_bar[bs_last_barrier],
495                                sizeof(kmp_balign_team_t) * bs_last_barrier,
496                                "%s_%d.t_bar", header, team_id);
497 
498   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
499                                &team->t.t_bar[bs_plain_barrier + 1],
500                                sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
501                                header, team_id);
502 
503   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
504                                &team->t.t_bar[bs_forkjoin_barrier + 1],
505                                sizeof(kmp_balign_team_t),
506                                "%s_%d.t_bar[forkjoin]", header, team_id);
507 
508 #if KMP_FAST_REDUCTION_BARRIER
509   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
510                                &team->t.t_bar[bs_reduction_barrier + 1],
511                                sizeof(kmp_balign_team_t),
512                                "%s_%d.t_bar[reduction]", header, team_id);
513 #endif // KMP_FAST_REDUCTION_BARRIER
514 
515   __kmp_print_storage_map_gtid(
516       -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
517       sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
518 
519   __kmp_print_storage_map_gtid(
520       -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
521       sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
522 
523   __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
524                                &team->t.t_disp_buffer[num_disp_buff],
525                                sizeof(dispatch_shared_info_t) * num_disp_buff,
526                                "%s_%d.t_disp_buffer", header, team_id);
527 }
528 
529 static void __kmp_init_allocator() { __kmp_init_memkind(); }
530 static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
531 
532 /* ------------------------------------------------------------------------ */
533 
534 #if KMP_DYNAMIC_LIB
535 #if KMP_OS_WINDOWS
536 
537 static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
538   // TODO: Change to __kmp_break_bootstrap_lock().
  __kmp_init_bootstrap_lock(lck); // leave the lock in the released state
540 }
541 
542 static void __kmp_reset_locks_on_process_detach(int gtid_req) {
543   int i;
544   int thread_count;
545 
  // PROCESS_DETACH is expected to be called by a thread that executes
  // ProcessExit() or FreeLibrary(). The OS terminates the other threads
  // (except the one calling ProcessExit or FreeLibrary), so it might seem safe
  // to access __kmp_threads[] without taking the forkjoin_lock. In fact,
  // however, some threads can still be alive here, although they are about to
  // be terminated. The threads in the array with ds_thread==0 are the most
  // suspicious, so it may not actually be safe to access __kmp_threads[].
553 
554   // TODO: does it make sense to check __kmp_roots[] ?
555 
556   // Let's check that there are no other alive threads registered with the OMP
557   // lib.
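  // Note that this is a pure busy-wait: we simply re-scan the thread array
  // until every registered thread other than the detaching one reports dead.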
558   while (1) {
559     thread_count = 0;
560     for (i = 0; i < __kmp_threads_capacity; ++i) {
561       if (!__kmp_threads)
562         continue;
563       kmp_info_t *th = __kmp_threads[i];
564       if (th == NULL)
565         continue;
566       int gtid = th->th.th_info.ds.ds_gtid;
567       if (gtid == gtid_req)
568         continue;
569       if (gtid < 0)
570         continue;
571       DWORD exit_val;
572       int alive = __kmp_is_thread_alive(th, &exit_val);
573       if (alive) {
574         ++thread_count;
575       }
576     }
577     if (thread_count == 0)
578       break; // success
579   }
580 
581   // Assume that I'm alone. Now it might be safe to check and reset locks.
582   // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
583   __kmp_reset_lock(&__kmp_forkjoin_lock);
584 #ifdef KMP_DEBUG
585   __kmp_reset_lock(&__kmp_stdio_lock);
586 #endif // KMP_DEBUG
587 }
588 
589 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
590   //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
591 
592   switch (fdwReason) {
593 
594   case DLL_PROCESS_ATTACH:
595     KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
596 
597     return TRUE;
598 
599   case DLL_PROCESS_DETACH:
600     KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
601 
602     if (lpReserved != NULL) {
603       // lpReserved is used for telling the difference:
604       //   lpReserved == NULL when FreeLibrary() was called,
605       //   lpReserved != NULL when the process terminates.
606       // When FreeLibrary() is called, worker threads remain alive. So they will
607       // release the forkjoin lock by themselves. When the process terminates,
608       // worker threads disappear triggering the problem of unreleased forkjoin
609       // lock as described below.
610 
611       // A worker thread can take the forkjoin lock. The problem comes up if
612       // that worker thread becomes dead before it releases the forkjoin lock.
613       // The forkjoin lock remains taken, while the thread executing
614       // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try
615       // to take the forkjoin lock and will always fail, so that the application
616       // will never finish [normally]. This scenario is possible if
      // __kmpc_end() has not been executed. These are not corner cases but
      // rather common ones:
619       // - the main function was compiled by an alternative compiler;
620       // - the main function was compiled by icl but without /Qopenmp
621       //   (application with plugins);
622       // - application terminates by calling C exit(), Fortran CALL EXIT() or
623       //   Fortran STOP.
624       // - alive foreign thread prevented __kmpc_end from doing cleanup.
625       //
626       // This is a hack to work around the problem.
627       // TODO: !!! figure out something better.
628       __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
629     }
630 
631     __kmp_internal_end_library(__kmp_gtid_get_specific());
632 
633     return TRUE;
634 
635   case DLL_THREAD_ATTACH:
636     KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
637 
    /* if we wanted to register new sibling threads as they attach, we would
     * call __kmp_get_gtid() here */
640     return TRUE;
641 
642   case DLL_THREAD_DETACH:
643     KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
644 
645     __kmp_internal_end_thread(__kmp_gtid_get_specific());
646     return TRUE;
647   }
648 
649   return TRUE;
650 }
651 
652 #endif /* KMP_OS_WINDOWS */
653 #endif /* KMP_DYNAMIC_LIB */
654 
655 /* __kmp_parallel_deo -- Wait until it's our turn. */
656 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
657   int gtid = *gtid_ref;
658 #ifdef BUILD_PARALLEL_ORDERED
659   kmp_team_t *team = __kmp_team_from_gtid(gtid);
660 #endif /* BUILD_PARALLEL_ORDERED */
661 
662   if (__kmp_env_consistency_check) {
663     if (__kmp_threads[gtid]->th.th_root->r.r_active)
664 #if KMP_USE_DYNAMIC_LOCK
665       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
666 #else
667       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
668 #endif
669   }
670 #ifdef BUILD_PARALLEL_ORDERED
671   if (!team->t.t_serialized) {
672     KMP_MB();
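    /* Spin until the team's ordered ticket equals this thread's tid, i.e.
       until it is our turn in the ordered construct. */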
673     KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
674              NULL);
675     KMP_MB();
676   }
677 #endif /* BUILD_PARALLEL_ORDERED */
678 }
679 
680 /* __kmp_parallel_dxo -- Signal the next task. */
681 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
682   int gtid = *gtid_ref;
683 #ifdef BUILD_PARALLEL_ORDERED
684   int tid = __kmp_tid_from_gtid(gtid);
685   kmp_team_t *team = __kmp_team_from_gtid(gtid);
686 #endif /* BUILD_PARALLEL_ORDERED */
687 
688   if (__kmp_env_consistency_check) {
689     if (__kmp_threads[gtid]->th.th_root->r.r_active)
690       __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
691   }
692 #ifdef BUILD_PARALLEL_ORDERED
693   if (!team->t.t_serialized) {
694     KMP_MB(); /* Flush all pending memory write invalidates.  */
695 
696     /* use the tid of the next thread in this team */
697     /* TODO replace with general release procedure */
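    /* this store is what releases the next thread's KMP_WAIT in
       __kmp_parallel_deo */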
698     team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
699 
700     KMP_MB(); /* Flush all pending memory write invalidates.  */
701   }
702 #endif /* BUILD_PARALLEL_ORDERED */
703 }
704 
705 /* ------------------------------------------------------------------------ */
706 /* The BARRIER for a SINGLE process section is always explicit   */
707 
708 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
709   int status;
710   kmp_info_t *th;
711   kmp_team_t *team;
712 
713   if (!TCR_4(__kmp_init_parallel))
714     __kmp_parallel_initialize();
715   __kmp_resume_if_soft_paused();
716 
717   th = __kmp_threads[gtid];
718   team = th->th.th_team;
719   status = 0;
720 
721   th->th.th_ident = id_ref;
722 
723   if (team->t.t_serialized) {
724     status = 1;
725   } else {
726     kmp_int32 old_this = th->th.th_local.this_construct;
727 
728     ++th->th.th_local.this_construct;
    /* try to advance the team's construct count to this thread's construct
       count -- success means this thread got the single block */
731     /* TODO: Should this be acquire or release? */
732     if (team->t.t_construct == old_this) {
733       status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
734                                               th->th.th_local.this_construct);
735     }
736 #if USE_ITT_BUILD
737     if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
738         KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
739         team->t.t_active_level ==
740             1) { // Only report metadata by master of active team at level 1
741       __kmp_itt_metadata_single(id_ref);
742     }
743 #endif /* USE_ITT_BUILD */
744   }
745 
746   if (__kmp_env_consistency_check) {
747     if (status && push_ws) {
748       __kmp_push_workshare(gtid, ct_psingle, id_ref);
749     } else {
750       __kmp_check_workshare(gtid, ct_psingle, id_ref);
751     }
752   }
753 #if USE_ITT_BUILD
754   if (status) {
755     __kmp_itt_single_start(gtid);
756   }
757 #endif /* USE_ITT_BUILD */
758   return status;
759 }
760 
761 void __kmp_exit_single(int gtid) {
762 #if USE_ITT_BUILD
763   __kmp_itt_single_end(gtid);
764 #endif /* USE_ITT_BUILD */
765   if (__kmp_env_consistency_check)
766     __kmp_pop_workshare(gtid, ct_psingle, NULL);
767 }
768 
/* Determine if we can go parallel or must use a serialized parallel region and
 * how many threads we can use.
 * set_nthreads is the number of threads requested for the team.
 * Returns 1 if we should serialize or use only one thread,
 * otherwise the number of threads to use.
 * The forkjoin lock is held by the caller. */
775 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
776                                  int master_tid, int set_nthreads,
777                                  int enter_teams) {
778   int capacity;
779   int new_nthreads;
780   KMP_DEBUG_ASSERT(__kmp_init_serial);
781   KMP_DEBUG_ASSERT(root && parent_team);
782   kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
783 
784   // If dyn-var is set, dynamically adjust the number of desired threads,
785   // according to the method specified by dynamic_mode.
786   new_nthreads = set_nthreads;
787   if (!get__dynamic_2(parent_team, master_tid)) {
788     ;
789   }
790 #ifdef USE_LOAD_BALANCE
791   else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
792     new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
793     if (new_nthreads == 1) {
794       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
795                     "reservation to 1 thread\n",
796                     master_tid));
797       return 1;
798     }
799     if (new_nthreads < set_nthreads) {
800       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
801                     "reservation to %d threads\n",
802                     master_tid, new_nthreads));
803     }
804   }
805 #endif /* USE_LOAD_BALANCE */
806   else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
807     new_nthreads = __kmp_avail_proc - __kmp_nth +
808                    (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
809     if (new_nthreads <= 1) {
810       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
811                     "reservation to 1 thread\n",
812                     master_tid));
813       return 1;
814     }
815     if (new_nthreads < set_nthreads) {
816       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
817                     "reservation to %d threads\n",
818                     master_tid, new_nthreads));
819     } else {
820       new_nthreads = set_nthreads;
821     }
822   } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
823     if (set_nthreads > 2) {
824       new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
825       new_nthreads = (new_nthreads % set_nthreads) + 1;
826       if (new_nthreads == 1) {
827         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
828                       "reservation to 1 thread\n",
829                       master_tid));
830         return 1;
831       }
832       if (new_nthreads < set_nthreads) {
833         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
834                       "reservation to %d threads\n",
835                       master_tid, new_nthreads));
836       }
837     }
838   } else {
839     KMP_ASSERT(0);
840   }
841 
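  // In the limit checks below, __kmp_nth already includes either just the
  // master (if the root is active) or the entire hot team (if it is not), so
  // that amount is subtracted when projecting the new total. For example, with
  // __kmp_nth == 4 and an inactive root whose hot team has 4 threads, forking
  // 4 threads again projects a total of 4, not 8.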
842   // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
843   if (__kmp_nth + new_nthreads -
844           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
845       __kmp_max_nth) {
846     int tl_nthreads = __kmp_max_nth - __kmp_nth +
847                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
848     if (tl_nthreads <= 0) {
849       tl_nthreads = 1;
850     }
851 
852     // If dyn-var is false, emit a 1-time warning.
853     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
854       __kmp_reserve_warn = 1;
855       __kmp_msg(kmp_ms_warning,
856                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
857                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
858     }
859     if (tl_nthreads == 1) {
860       KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
861                     "reduced reservation to 1 thread\n",
862                     master_tid));
863       return 1;
864     }
865     KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
866                   "reservation to %d threads\n",
867                   master_tid, tl_nthreads));
868     new_nthreads = tl_nthreads;
869   }
870 
871   // Respect OMP_THREAD_LIMIT
872   int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
873   int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
874   if (cg_nthreads + new_nthreads -
875           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
876       max_cg_threads) {
877     int tl_nthreads = max_cg_threads - cg_nthreads +
878                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
879     if (tl_nthreads <= 0) {
880       tl_nthreads = 1;
881     }
882 
883     // If dyn-var is false, emit a 1-time warning.
884     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
885       __kmp_reserve_warn = 1;
886       __kmp_msg(kmp_ms_warning,
887                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
888                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
889     }
890     if (tl_nthreads == 1) {
891       KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
892                     "reduced reservation to 1 thread\n",
893                     master_tid));
894       return 1;
895     }
896     KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
897                   "reservation to %d threads\n",
898                   master_tid, tl_nthreads));
899     new_nthreads = tl_nthreads;
900   }
901 
902   // Check if the threads array is large enough, or needs expanding.
903   // See comment in __kmp_register_root() about the adjustment if
904   // __kmp_threads[0] == NULL.
905   capacity = __kmp_threads_capacity;
906   if (TCR_PTR(__kmp_threads[0]) == NULL) {
907     --capacity;
908   }
909   if (__kmp_nth + new_nthreads -
910           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
911       capacity) {
912     // Expand the threads array.
913     int slotsRequired = __kmp_nth + new_nthreads -
914                         (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
915                         capacity;
916     int slotsAdded = __kmp_expand_threads(slotsRequired);
917     if (slotsAdded < slotsRequired) {
918       // The threads array was not expanded enough.
919       new_nthreads -= (slotsRequired - slotsAdded);
920       KMP_ASSERT(new_nthreads >= 1);
921 
922       // If dyn-var is false, emit a 1-time warning.
923       if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
924         __kmp_reserve_warn = 1;
925         if (__kmp_tp_cached) {
926           __kmp_msg(kmp_ms_warning,
927                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
928                     KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
929                     KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
930         } else {
931           __kmp_msg(kmp_ms_warning,
932                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
933                     KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
934         }
935       }
936     }
937   }
938 
939 #ifdef KMP_DEBUG
940   if (new_nthreads == 1) {
941     KC_TRACE(10,
942              ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
943               "dead roots and rechecking; requested %d threads\n",
944               __kmp_get_gtid(), set_nthreads));
945   } else {
946     KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
947                   " %d threads\n",
948                   __kmp_get_gtid(), new_nthreads, set_nthreads));
949   }
950 #endif // KMP_DEBUG
951   return new_nthreads;
952 }
953 
/* Allocate threads from the thread pool and assign them to the new team. We are
   assured that there are enough threads available, because we checked on that
   earlier while holding the forkjoin lock. */
957 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
958                                     kmp_info_t *master_th, int master_gtid) {
959   int i;
960   int use_hot_team;
961 
962   KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
963   KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
964   KMP_MB();
965 
966   /* first, let's setup the master thread */
967   master_th->th.th_info.ds.ds_tid = 0;
968   master_th->th.th_team = team;
969   master_th->th.th_team_nproc = team->t.t_nproc;
970   master_th->th.th_team_master = master_th;
971   master_th->th.th_team_serialized = FALSE;
972   master_th->th.th_dispatch = &team->t.t_dispatch[0];
973 
974 /* make sure we are not the optimized hot team */
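/* A "hot" team keeps its worker threads attached between parallel regions, so
   when such a team is reused the thread allocation and initialization below
   can be skipped. */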
975 #if KMP_NESTED_HOT_TEAMS
976   use_hot_team = 0;
977   kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
978   if (hot_teams) { // hot teams array is not allocated if
979     // KMP_HOT_TEAMS_MAX_LEVEL=0
980     int level = team->t.t_active_level - 1; // index in array of hot teams
981     if (master_th->th.th_teams_microtask) { // are we inside the teams?
982       if (master_th->th.th_teams_size.nteams > 1) {
983         ++level; // level was not increased in teams construct for
984         // team_of_masters
985       }
986       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
987           master_th->th.th_teams_level == team->t.t_level) {
988         ++level; // level was not increased in teams construct for
989         // team_of_workers before the parallel
990       } // team->t.t_level will be increased inside parallel
991     }
992     if (level < __kmp_hot_teams_max_level) {
993       if (hot_teams[level].hot_team) {
994         // hot team has already been allocated for given level
995         KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
996         use_hot_team = 1; // the team is ready to use
997       } else {
998         use_hot_team = 0; // AC: threads are not allocated yet
999         hot_teams[level].hot_team = team; // remember new hot team
1000         hot_teams[level].hot_team_nth = team->t.t_nproc;
1001       }
1002     } else {
1003       use_hot_team = 0;
1004     }
1005   }
1006 #else
1007   use_hot_team = team == root->r.r_hot_team;
1008 #endif
1009   if (!use_hot_team) {
1010 
1011     /* install the master thread */
1012     team->t.t_threads[0] = master_th;
1013     __kmp_initialize_info(master_th, team, 0, master_gtid);
1014 
1015     /* now, install the worker threads */
1016     for (i = 1; i < team->t.t_nproc; i++) {
1017 
1018       /* fork or reallocate a new thread and install it in team */
1019       kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1020       team->t.t_threads[i] = thr;
1021       KMP_DEBUG_ASSERT(thr);
1022       KMP_DEBUG_ASSERT(thr->th.th_team == team);
1023       /* align team and thread arrived states */
1024       KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1025                     "T#%d(%d:%d) join =%llu, plain=%llu\n",
1026                     __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1027                     __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1028                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1029                     team->t.t_bar[bs_plain_barrier].b_arrived));
1030       thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1031       thr->th.th_teams_level = master_th->th.th_teams_level;
1032       thr->th.th_teams_size = master_th->th.th_teams_size;
1033       { // Initialize threads' barrier data.
1034         int b;
1035         kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1036         for (b = 0; b < bs_last_barrier; ++b) {
1037           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1038           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1039 #if USE_DEBUGGER
1040           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1041 #endif
1042         }
1043       }
1044     }
1045 
1046 #if KMP_AFFINITY_SUPPORTED
1047     __kmp_partition_places(team);
1048 #endif
1049   }
1050 
1051   if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1052     for (i = 0; i < team->t.t_nproc; i++) {
1053       kmp_info_t *thr = team->t.t_threads[i];
1054       if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1055           thr->th.th_prev_level != team->t.t_level) {
1056         team->t.t_display_affinity = 1;
1057         break;
1058       }
1059     }
1060   }
1061 
1062   KMP_MB();
1063 }
1064 
1065 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
// Propagate any changes to the floating point control registers out to the
// team. We try to avoid unnecessary writes to the relevant cache line in the
// team structure, so we don't make changes unless they are needed.
1069 inline static void propagateFPControl(kmp_team_t *team) {
1070   if (__kmp_inherit_fp_control) {
1071     kmp_int16 x87_fpu_control_word;
1072     kmp_uint32 mxcsr;
1073 
1074     // Get master values of FPU control flags (both X87 and vector)
1075     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1076     __kmp_store_mxcsr(&mxcsr);
1077     mxcsr &= KMP_X86_MXCSR_MASK;
1078 
1079     // There is no point looking at t_fp_control_saved here.
1080     // If it is TRUE, we still have to update the values if they are different
1081     // from those we now have. If it is FALSE we didn't save anything yet, but
1082     // our objective is the same. We have to ensure that the values in the team
1083     // are the same as those we have.
1084     // So, this code achieves what we need whether or not t_fp_control_saved is
1085     // true. By checking whether the value needs updating we avoid unnecessary
1086     // writes that would put the cache-line into a written state, causing all
1087     // threads in the team to have to read it again.
1088     KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1089     KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1090     // Although we don't use this value, other code in the runtime wants to know
1091     // whether it should restore them. So we must ensure it is correct.
1092     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1093   } else {
1094     // Similarly here. Don't write to this cache-line in the team structure
1095     // unless we have to.
1096     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1097   }
1098 }
1099 
1100 // Do the opposite, setting the hardware registers to the updated values from
1101 // the team.
1102 inline static void updateHWFPControl(kmp_team_t *team) {
1103   if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
    // Only reset the fp control regs if they have been changed in the team by
    // the parallel region that we are exiting.
1106     kmp_int16 x87_fpu_control_word;
1107     kmp_uint32 mxcsr;
1108     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1109     __kmp_store_mxcsr(&mxcsr);
1110     mxcsr &= KMP_X86_MXCSR_MASK;
1111 
1112     if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1113       __kmp_clear_x87_fpu_status_word();
1114       __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1115     }
1116 
1117     if (team->t.t_mxcsr != mxcsr) {
1118       __kmp_load_mxcsr(&team->t.t_mxcsr);
1119     }
1120   }
1121 }
1122 #else
1123 #define propagateFPControl(x) ((void)0)
1124 #define updateHWFPControl(x) ((void)0)
1125 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1126 
1127 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1128                                      int realloc); // forward declaration
1129 
/* Run a parallel region that has been serialized, so it runs only in a team
   consisting of the single master thread. */
1132 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1133   kmp_info_t *this_thr;
1134   kmp_team_t *serial_team;
1135 
1136   KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1137 
1138   /* Skip all this code for autopar serialized loops since it results in
1139      unacceptable overhead */
1140   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1141     return;
1142 
1143   if (!TCR_4(__kmp_init_parallel))
1144     __kmp_parallel_initialize();
1145   __kmp_resume_if_soft_paused();
1146 
1147   this_thr = __kmp_threads[global_tid];
1148   serial_team = this_thr->th.th_serial_team;
1149 
1150   /* utilize the serialized team held by this thread */
1151   KMP_DEBUG_ASSERT(serial_team);
1152   KMP_MB();
1153 
1154   if (__kmp_tasking_mode != tskm_immediate_exec) {
1155     KMP_DEBUG_ASSERT(
1156         this_thr->th.th_task_team ==
1157         this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1158     KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1159                      NULL);
1160     KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1161                   "team %p, new task_team = NULL\n",
1162                   global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1163     this_thr->th.th_task_team = NULL;
1164   }
1165 
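  // Resolve the proc-bind for this (serialized) region: a proc-bind-var of
  // false disables binding regardless of any clause, and proc_bind_default
  // (no clause) inherits the current proc-bind-var.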
1166   kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1167   if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1168     proc_bind = proc_bind_false;
1169   } else if (proc_bind == proc_bind_default) {
1170     // No proc_bind clause was specified, so use the current value
1171     // of proc-bind-var for this parallel region.
1172     proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1173   }
1174   // Reset for next parallel region
1175   this_thr->th.th_set_proc_bind = proc_bind_default;
1176 
1177 #if OMPT_SUPPORT
1178   ompt_data_t ompt_parallel_data = ompt_data_none;
1179   ompt_data_t *implicit_task_data;
1180   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1181   if (ompt_enabled.enabled &&
1182       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1183 
1184     ompt_task_info_t *parent_task_info;
1185     parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1186 
1187     parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1188     if (ompt_enabled.ompt_callback_parallel_begin) {
1189       int team_size = 1;
1190 
1191       ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1192           &(parent_task_info->task_data), &(parent_task_info->frame),
1193           &ompt_parallel_data, team_size,
1194           ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1195     }
1196   }
1197 #endif // OMPT_SUPPORT
1198 
1199   if (this_thr->th.th_team != serial_team) {
1200     // Nested level will be an index in the nested nthreads array
1201     int level = this_thr->th.th_team->t.t_level;
1202 
1203     if (serial_team->t.t_serialized) {
      /* this serial team was already used
         TODO increase performance by making these locks more specific */
1206       kmp_team_t *new_team;
1207 
1208       __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1209 
1210       new_team =
1211           __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1212 #if OMPT_SUPPORT
1213                               ompt_parallel_data,
1214 #endif
1215                               proc_bind, &this_thr->th.th_current_task->td_icvs,
1216                               0 USE_NESTED_HOT_ARG(NULL));
1217       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1218       KMP_ASSERT(new_team);
1219 
1220       /* setup new serialized team and install it */
1221       new_team->t.t_threads[0] = this_thr;
1222       new_team->t.t_parent = this_thr->th.th_team;
1223       serial_team = new_team;
1224       this_thr->th.th_serial_team = serial_team;
1225 
1226       KF_TRACE(
1227           10,
1228           ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1229            global_tid, serial_team));
1230 
1231       /* TODO the above breaks the requirement that if we run out of resources,
1232          then we can still guarantee that serialized teams are ok, since we may
1233          need to allocate a new one */
1234     } else {
1235       KF_TRACE(
1236           10,
1237           ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1238            global_tid, serial_team));
1239     }
1240 
1241     /* we have to initialize this serial team */
1242     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1243     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1244     KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1245     serial_team->t.t_ident = loc;
1246     serial_team->t.t_serialized = 1;
1247     serial_team->t.t_nproc = 1;
1248     serial_team->t.t_parent = this_thr->th.th_team;
1249     serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1250     this_thr->th.th_team = serial_team;
1251     serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1252 
    KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1254                   this_thr->th.th_current_task));
1255     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1256     this_thr->th.th_current_task->td_flags.executing = 0;
1257 
1258     __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1259 
1260     /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1261        implicit task for each serialized task represented by
1262        team->t.t_serialized? */
1263     copy_icvs(&this_thr->th.th_current_task->td_icvs,
1264               &this_thr->th.th_current_task->td_parent->td_icvs);
1265 
1266     // Thread value exists in the nested nthreads array for the next nested
1267     // level
1268     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1269       this_thr->th.th_current_task->td_icvs.nproc =
1270           __kmp_nested_nth.nth[level + 1];
1271     }
1272 
1273     if (__kmp_nested_proc_bind.used &&
1274         (level + 1 < __kmp_nested_proc_bind.used)) {
1275       this_thr->th.th_current_task->td_icvs.proc_bind =
1276           __kmp_nested_proc_bind.bind_types[level + 1];
1277     }
1278 
1279 #if USE_DEBUGGER
1280     serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1281 #endif
1282     this_thr->th.th_info.ds.ds_tid = 0;
1283 
1284     /* set thread cache values */
1285     this_thr->th.th_team_nproc = 1;
1286     this_thr->th.th_team_master = this_thr;
1287     this_thr->th.th_team_serialized = 1;
1288 
1289     serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1290     serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1291     serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1292 
1293     propagateFPControl(serial_team);
1294 
1295     /* check if we need to allocate dispatch buffers stack */
1296     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1297     if (!serial_team->t.t_dispatch->th_disp_buffer) {
1298       serial_team->t.t_dispatch->th_disp_buffer =
1299           (dispatch_private_info_t *)__kmp_allocate(
1300               sizeof(dispatch_private_info_t));
1301     }
1302     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1303 
1304     KMP_MB();
1305 
1306   } else {
1307     /* this serialized team is already being used,
1308      * that's fine, just add another nested level */
1309     KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1310     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1311     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1312     ++serial_team->t.t_serialized;
1313     this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1314 
1315     // Nested level will be an index in the nested nthreads array
1316     int level = this_thr->th.th_team->t.t_level;
1317     // Thread value exists in the nested nthreads array for the next nested
1318     // level
1319     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1320       this_thr->th.th_current_task->td_icvs.nproc =
1321           __kmp_nested_nth.nth[level + 1];
1322     }
1323     serial_team->t.t_level++;
1324     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1325                   "of serial team %p to %d\n",
1326                   global_tid, serial_team, serial_team->t.t_level));
1327 
1328     /* allocate/push dispatch buffers stack */
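    /* each nested serialized level gets its own dispatch_private_info_t,
       pushed onto a singly linked list via th_disp_buffer->next */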
1329     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1330     {
1331       dispatch_private_info_t *disp_buffer =
1332           (dispatch_private_info_t *)__kmp_allocate(
1333               sizeof(dispatch_private_info_t));
1334       disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1335       serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1336     }
1337     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1338 
1339     KMP_MB();
1340   }
1341   KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1342 
1343   // Perform the display affinity functionality for
1344   // serialized parallel regions
1345   if (__kmp_display_affinity) {
1346     if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1347         this_thr->th.th_prev_num_threads != 1) {
1348       // NULL means use the affinity-format-var ICV
1349       __kmp_aux_display_affinity(global_tid, NULL);
1350       this_thr->th.th_prev_level = serial_team->t.t_level;
1351       this_thr->th.th_prev_num_threads = 1;
1352     }
1353   }
1354 
1355   if (__kmp_env_consistency_check)
1356     __kmp_push_parallel(global_tid, NULL);
1357 #if OMPT_SUPPORT
1358   serial_team->t.ompt_team_info.master_return_address = codeptr;
1359   if (ompt_enabled.enabled &&
1360       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
        OMPT_GET_FRAME_ADDRESS(0);
1362 
1363     ompt_lw_taskteam_t lw_taskteam;
1364     __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1365                             &ompt_parallel_data, codeptr);
1366 
1367     __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
    // don't use lw_taskteam after linking; its content was swapped
1369 
1370     /* OMPT implicit task begin */
1371     implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
1372     if (ompt_enabled.ompt_callback_implicit_task) {
1373       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1374           ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
          OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
          ompt_task_implicit); // TODO: Can this be ompt_task_initial?
      OMPT_CUR_TASK_INFO(this_thr)->thread_num =
          __kmp_tid_from_gtid(global_tid);
1378     }
1379 
1380     /* OMPT state */
1381     this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
        OMPT_GET_FRAME_ADDRESS(0);
1383   }
1384 #endif
1385 }
1386 
1387 /* most of the work for a fork */
1388 /* return true if we really went parallel, false if serialized */
1389 int __kmp_fork_call(ident_t *loc, int gtid,
1390                     enum fork_context_e call_context, // Intel, GNU, ...
1391                     kmp_int32 argc, microtask_t microtask, launch_t invoker,
1392 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1393 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1394                     va_list *ap
1395 #else
1396                     va_list ap
1397 #endif
1398                     ) {
1399   void **argv;
1400   int i;
1401   int master_tid;
1402   int master_this_cons;
1403   kmp_team_t *team;
1404   kmp_team_t *parent_team;
1405   kmp_info_t *master_th;
1406   kmp_root_t *root;
1407   int nthreads;
1408   int master_active;
1409   int master_set_numthreads;
1410   int level;
1411   int active_level;
1412   int teams_level;
1413 #if KMP_NESTED_HOT_TEAMS
1414   kmp_hot_team_ptr_t **p_hot_teams;
1415 #endif
1416   { // KMP_TIME_BLOCK
1417     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1418     KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1419 
1420     KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1421     if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1422       /* Some systems prefer the stack for the root thread(s) to start with */
1423       /* some gap from the parent stack to prevent false sharing. */
1424       void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1425       /* These 2 lines below are so this does not get optimized out */
1426       if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1427         __kmp_stkpadding += (short)((kmp_int64)dummy);
1428     }
1429 
1430     /* initialize if needed */
1431     KMP_DEBUG_ASSERT(
1432         __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1433     if (!TCR_4(__kmp_init_parallel))
1434       __kmp_parallel_initialize();
1435     __kmp_resume_if_soft_paused();
1436 
1437     /* setup current data */
1438     master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1439     // shutdown
1440     parent_team = master_th->th.th_team;
1441     master_tid = master_th->th.th_info.ds.ds_tid;
1442     master_this_cons = master_th->th.th_local.this_construct;
1443     root = master_th->th.th_root;
1444     master_active = root->r.r_active;
1445     master_set_numthreads = master_th->th.th_set_nproc;
1446 
1447 #if OMPT_SUPPORT
1448     ompt_data_t ompt_parallel_data = ompt_data_none;
1449     ompt_data_t *parent_task_data;
1450     ompt_frame_t *ompt_frame;
1451     ompt_data_t *implicit_task_data;
1452     void *return_address = NULL;
1453 
1454     if (ompt_enabled.enabled) {
1455       __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1456                                     NULL, NULL);
1457       return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1458     }
1459 #endif
1460 
1461     // Nested level will be an index in the nested nthreads array
1462     level = parent_team->t.t_level;
1463     // used to launch non-serial teams even if nested is not allowed
1464     active_level = parent_team->t.t_active_level;
1465     // needed to check nesting inside the teams
1466     teams_level = master_th->th.th_teams_level;
1467 #if KMP_NESTED_HOT_TEAMS
1468     p_hot_teams = &master_th->th.th_hot_teams;
1469     if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1470       *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1471           sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1472       (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1473       // it is either actual or not needed (when active_level > 0)
1474       (*p_hot_teams)[0].hot_team_nth = 1;
1475     }
1476 #endif
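    // Hot teams cache team structures and their worker threads per nesting
    // level so that repeated forks can reuse them instead of re-creating
    // threads; th_hot_teams is indexed by level up to
    // __kmp_hot_teams_max_level, with level 0 seeded from the root's hot team
    // above.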
1477 
1478 #if OMPT_SUPPORT
1479     if (ompt_enabled.enabled) {
1480       if (ompt_enabled.ompt_callback_parallel_begin) {
1481         int team_size = master_set_numthreads
1482                             ? master_set_numthreads
1483                             : get__nproc_2(parent_team, master_tid);
1484         int flags = OMPT_INVOKER(call_context) |
1485                     ((microtask == (microtask_t)__kmp_teams_master)
1486                          ? ompt_parallel_league
1487                          : ompt_parallel_team);
1488         ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1489             parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1490             return_address);
1491       }
1492       master_th->th.ompt_thread_info.state = ompt_state_overhead;
1493     }
1494 #endif
1495 
1496     master_th->th.th_ident = loc;
1497 
1498     if (master_th->th.th_teams_microtask && ap &&
1499         microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1500       // AC: This is start of parallel that is nested inside teams construct.
1501       // The team is actual (hot), all workers are ready at the fork barrier.
1502       // No lock needed to initialize the team a bit, then free workers.
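      // Illustrative user code that reaches this path (a parallel region
      // nested directly inside a teams construct):
      //
      //   #pragma omp teams
      //   {
      //     #pragma omp parallel
      //     { /* each team's master forks its own hot team here */ }
      //   }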
1503       parent_team->t.t_ident = loc;
1504       __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1505       parent_team->t.t_argc = argc;
1506       argv = (void **)parent_team->t.t_argv;
1507       for (i = argc - 1; i >= 0; --i)
1508 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1509 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1510         *argv++ = va_arg(*ap, void *);
1511 #else
1512         *argv++ = va_arg(ap, void *);
1513 #endif
      // Increment our nested depth level, but do not increase serialization
1515       if (parent_team == master_th->th.th_serial_team) {
1516         // AC: we are in serialized parallel
1517         __kmpc_serialized_parallel(loc, gtid);
1518         KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1519 
1520 #if OMPT_SUPPORT
1521         void *dummy;
1522         void **exit_frame_p;
1523 
1524         ompt_lw_taskteam_t lw_taskteam;
1525 
1526         if (ompt_enabled.enabled) {
1527           __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1528                                   &ompt_parallel_data, return_address);
1529           exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1530 
1531           __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
          // don't use lw_taskteam after linking. content was swapped
1533 
1534           /* OMPT implicit task begin */
1535           implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1536           if (ompt_enabled.ompt_callback_implicit_task) {
1537             OMPT_CUR_TASK_INFO(master_th)
1538                 ->thread_num = __kmp_tid_from_gtid(gtid);
1539             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1540                 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1541                 implicit_task_data, 1,
1542                 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1543           }
1544 
1545           /* OMPT state */
1546           master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1547         } else {
1548           exit_frame_p = &dummy;
1549         }
1550 #endif
1551         // AC: need to decrement t_serialized for enquiry functions to work
1552         // correctly, will restore at join time
1553         parent_team->t.t_serialized--;
1554 
1555         {
1556           KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1557           KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1558           __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1559 #if OMPT_SUPPORT
1560                                  ,
1561                                  exit_frame_p
1562 #endif
1563                                  );
1564         }
1565 
1566 #if OMPT_SUPPORT
1567         if (ompt_enabled.enabled) {
1568           *exit_frame_p = NULL;
1569           OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1570           if (ompt_enabled.ompt_callback_implicit_task) {
1571             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1572                 ompt_scope_end, NULL, implicit_task_data, 1,
1573                 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1574           }
1575           ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1576           __ompt_lw_taskteam_unlink(master_th);
1577           if (ompt_enabled.ompt_callback_parallel_end) {
1578             ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1579                 &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1580                 OMPT_INVOKER(call_context) | ompt_parallel_team,
1581                 return_address);
1582           }
1583           master_th->th.ompt_thread_info.state = ompt_state_overhead;
1584         }
1585 #endif
1586         return TRUE;
1587       }
1588 
1589       parent_team->t.t_pkfn = microtask;
1590       parent_team->t.t_invoke = invoker;
1591       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1592       parent_team->t.t_active_level++;
1593       parent_team->t.t_level++;
1594       parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1595 
1596 #if OMPT_SUPPORT
1597       if (ompt_enabled.enabled) {
1598         ompt_lw_taskteam_t lw_taskteam;
1599         __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1600                                 &ompt_parallel_data, return_address);
1601         __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1602       }
1603 #endif
1604 
1605       /* Change number of threads in the team if requested */
1606       if (master_set_numthreads) { // The parallel has num_threads clause
1607         if (master_set_numthreads < master_th->th.th_teams_size.nth) {
          // AC: we can only reduce the number of threads dynamically, not
          // increase it
1609           kmp_info_t **other_threads = parent_team->t.t_threads;
1610           parent_team->t.t_nproc = master_set_numthreads;
1611           for (i = 0; i < master_set_numthreads; ++i) {
1612             other_threads[i]->th.th_team_nproc = master_set_numthreads;
1613           }
1614           // Keep extra threads hot in the team for possible next parallels
1615         }
1616         master_th->th.th_set_nproc = 0;
1617       }
1618 
1619 #if USE_DEBUGGER
1620       if (__kmp_debugging) { // Let debugger override number of threads.
1621         int nth = __kmp_omp_num_threads(loc);
1622         if (nth > 0) { // 0 means debugger doesn't want to change num threads
1623           master_set_numthreads = nth;
1624         }
1625       }
1626 #endif
1627 
1628       KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1629                     "master_th=%p, gtid=%d\n",
1630                     root, parent_team, master_th, gtid));
1631       __kmp_internal_fork(loc, gtid, parent_team);
1632       KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1633                     "master_th=%p, gtid=%d\n",
1634                     root, parent_team, master_th, gtid));
1635 
1636       /* Invoke microtask for MASTER thread */
1637       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1638                     parent_team->t.t_id, parent_team->t.t_pkfn));
1639 
1640       if (!parent_team->t.t_invoke(gtid)) {
1641         KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
1642       }
1643       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1644                     parent_team->t.t_id, parent_team->t.t_pkfn));
1645       KMP_MB(); /* Flush all pending memory write invalidates.  */
1646 
1647       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1648 
1649       return TRUE;
1650     } // Parallel closely nested in teams construct
1651 
1652 #if KMP_DEBUG
1653     if (__kmp_tasking_mode != tskm_immediate_exec) {
1654       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1655                        parent_team->t.t_task_team[master_th->th.th_task_state]);
1656     }
1657 #endif
1658 
1659     if (parent_team->t.t_active_level >=
1660         master_th->th.th_current_task->td_icvs.max_active_levels) {
1661       nthreads = 1;
1662     } else {
1663       int enter_teams = ((ap == NULL && active_level == 0) ||
1664                          (ap && teams_level > 0 && teams_level == level));
1665       nthreads =
1666           master_set_numthreads
1667               ? master_set_numthreads
1668               : get__nproc_2(
1669                     parent_team,
1670                     master_tid); // TODO: get nproc directly from current task
1671 
      // Check whether we need to take the fork/join lock (not needed for a
      // serialized parallel outside of a teams construct). This code was moved
      // here from __kmp_reserve_threads() to speed up nested serialized
      // parallel regions.
1675       if (nthreads > 1) {
1676         if ((get__max_active_levels(master_th) == 1 &&
1677              (root->r.r_in_parallel && !enter_teams)) ||
1678             (__kmp_library == library_serial)) {
1679           KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1680                         " threads\n",
1681                         gtid, nthreads));
1682           nthreads = 1;
1683         }
1684       }
1685       if (nthreads > 1) {
1686         /* determine how many new threads we can use */
1687         __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1688         /* AC: If we execute teams from parallel region (on host), then teams
1689            should be created but each can only have 1 thread if nesting is
1690            disabled. If teams called from serial region, then teams and their
1691            threads should be created regardless of the nesting setting. */
1692         nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
1693                                          nthreads, enter_teams);
1694         if (nthreads == 1) {
1695           // Free lock for single thread execution here; for multi-thread
1696           // execution it will be freed later after team of threads created
1697           // and initialized
1698           __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1699         }
1700       }
1701     }
1702     KMP_DEBUG_ASSERT(nthreads > 0);
1703 
1704     // If we temporarily changed the set number of threads then restore it now
1705     master_th->th.th_set_nproc = 0;
1706 
1707     /* create a serialized parallel region? */
1708     if (nthreads == 1) {
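      // Illustrative ways to reach this serialized path: nthreads-var or a
      // num_threads clause resolved to 1, KMP_LIBRARY=serial was requested,
      // dynamic adjustment in __kmp_reserve_threads() left a single thread,
      // or the parent region already sits at max-active-levels (checked
      // above).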
1709 /* josh todo: hypothetical question: what do we do for OS X*? */
1710 #if KMP_OS_LINUX &&                                                            \
1711     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1712       void *args[argc];
1713 #else
1714       void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1715 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1716           KMP_ARCH_AARCH64) */
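      // The serialized master invokes the microtask directly below, so the
      // argument array only needs to live for this call; a C99 VLA is used
      // where the #if above selects it, and KMP_ALLOCA provides equivalent
      // stack storage elsewhere.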
1717 
1718       KA_TRACE(20,
1719                ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1720 
1721       __kmpc_serialized_parallel(loc, gtid);
1722 
1723       if (call_context == fork_context_intel) {
1724         /* TODO this sucks, use the compiler itself to pass args! :) */
1725         master_th->th.th_serial_team->t.t_ident = loc;
1726         if (!ap) {
1727           // revert change made in __kmpc_serialized_parallel()
1728           master_th->th.th_serial_team->t.t_level--;
1729 // Get args from parent team for teams construct
1730 
1731 #if OMPT_SUPPORT
1732           void *dummy;
1733           void **exit_frame_p;
1734           ompt_task_info_t *task_info;
1735 
1736           ompt_lw_taskteam_t lw_taskteam;
1737 
1738           if (ompt_enabled.enabled) {
1739             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1740                                     &ompt_parallel_data, return_address);
1741 
1742             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
            // don't use lw_taskteam after linking. content was swapped
1744 
1745             task_info = OMPT_CUR_TASK_INFO(master_th);
1746             exit_frame_p = &(task_info->frame.exit_frame.ptr);
1747             if (ompt_enabled.ompt_callback_implicit_task) {
1748               OMPT_CUR_TASK_INFO(master_th)
1749                   ->thread_num = __kmp_tid_from_gtid(gtid);
1750               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1751                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1752                   &(task_info->task_data), 1,
1753                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1754                   ompt_task_implicit);
1755             }
1756 
1757             /* OMPT state */
1758             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1759           } else {
1760             exit_frame_p = &dummy;
1761           }
1762 #endif
1763 
1764           {
1765             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1766             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1767             __kmp_invoke_microtask(microtask, gtid, 0, argc,
1768                                    parent_team->t.t_argv
1769 #if OMPT_SUPPORT
1770                                    ,
1771                                    exit_frame_p
1772 #endif
1773                                    );
1774           }
1775 
1776 #if OMPT_SUPPORT
1777           if (ompt_enabled.enabled) {
1778             *exit_frame_p = NULL;
1779             if (ompt_enabled.ompt_callback_implicit_task) {
1780               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1781                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1782                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1783                   ompt_task_implicit);
1784             }
1785             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1786             __ompt_lw_taskteam_unlink(master_th);
1787             if (ompt_enabled.ompt_callback_parallel_end) {
1788               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1789                   &ompt_parallel_data, parent_task_data,
1790                   OMPT_INVOKER(call_context) | ompt_parallel_team,
1791                   return_address);
1792             }
1793             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1794           }
1795 #endif
1796         } else if (microtask == (microtask_t)__kmp_teams_master) {
1797           KMP_DEBUG_ASSERT(master_th->th.th_team ==
1798                            master_th->th.th_serial_team);
1799           team = master_th->th.th_team;
1800           // team->t.t_pkfn = microtask;
1801           team->t.t_invoke = invoker;
1802           __kmp_alloc_argv_entries(argc, team, TRUE);
1803           team->t.t_argc = argc;
1804           argv = (void **)team->t.t_argv;
1805           if (ap) {
1806             for (i = argc - 1; i >= 0; --i)
1807 // TODO: revert workaround for Intel(R) 64 tracker #96
1808 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1809               *argv++ = va_arg(*ap, void *);
1810 #else
1811               *argv++ = va_arg(ap, void *);
1812 #endif
1813           } else {
1814             for (i = 0; i < argc; ++i)
1815               // Get args from parent team for teams construct
1816               argv[i] = parent_team->t.t_argv[i];
1817           }
1818           // AC: revert change made in __kmpc_serialized_parallel()
1819           //     because initial code in teams should have level=0
1820           team->t.t_level--;
1821           // AC: call special invoker for outer "parallel" of teams construct
1822           invoker(gtid);
1823 #if OMPT_SUPPORT
1824           if (ompt_enabled.enabled) {
1825             ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1826             if (ompt_enabled.ompt_callback_implicit_task) {
1827               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1828                   ompt_scope_end, NULL, &(task_info->task_data), 0,
1829                   OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1830             }
1831             if (ompt_enabled.ompt_callback_parallel_end) {
1832               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1833                   &ompt_parallel_data, parent_task_data,
1834                   OMPT_INVOKER(call_context) | ompt_parallel_league,
1835                   return_address);
1836             }
1837             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1838           }
1839 #endif
1840         } else {
1841           argv = args;
1842           for (i = argc - 1; i >= 0; --i)
1843 // TODO: revert workaround for Intel(R) 64 tracker #96
1844 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1845             *argv++ = va_arg(*ap, void *);
1846 #else
1847             *argv++ = va_arg(ap, void *);
1848 #endif
1849           KMP_MB();
1850 
1851 #if OMPT_SUPPORT
1852           void *dummy;
1853           void **exit_frame_p;
1854           ompt_task_info_t *task_info;
1855 
1856           ompt_lw_taskteam_t lw_taskteam;
1857 
1858           if (ompt_enabled.enabled) {
1859             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1860                                     &ompt_parallel_data, return_address);
1861             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
            // don't use lw_taskteam after linking. content was swapped
1863             task_info = OMPT_CUR_TASK_INFO(master_th);
1864             exit_frame_p = &(task_info->frame.exit_frame.ptr);
1865 
1866             /* OMPT implicit task begin */
1867             implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1868             if (ompt_enabled.ompt_callback_implicit_task) {
1869               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1870                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1871                   implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1872                   ompt_task_implicit);
1873               OMPT_CUR_TASK_INFO(master_th)
1874                   ->thread_num = __kmp_tid_from_gtid(gtid);
1875             }
1876 
1877             /* OMPT state */
1878             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1879           } else {
1880             exit_frame_p = &dummy;
1881           }
1882 #endif
1883 
1884           {
1885             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1886             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1887             __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1888 #if OMPT_SUPPORT
1889                                    ,
1890                                    exit_frame_p
1891 #endif
1892                                    );
1893           }
1894 
1895 #if OMPT_SUPPORT
1896           if (ompt_enabled.enabled) {
1897             *exit_frame_p = NULL;
1898             if (ompt_enabled.ompt_callback_implicit_task) {
1899               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1900                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1901                   OMPT_CUR_TASK_INFO(master_th)->thread_num,
1902                   ompt_task_implicit);
1903             }
1904 
1905             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1906             __ompt_lw_taskteam_unlink(master_th);
1907             if (ompt_enabled.ompt_callback_parallel_end) {
1908               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1909                   &ompt_parallel_data, parent_task_data,
1910                   OMPT_INVOKER(call_context) | ompt_parallel_team,
1911                   return_address);
1912             }
1913             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1914           }
1915 #endif
1916         }
1917       } else if (call_context == fork_context_gnu) {
1918 #if OMPT_SUPPORT
1919         ompt_lw_taskteam_t lwt;
1920         __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1921                                 return_address);
1922 
1923         lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1924         __ompt_lw_taskteam_link(&lwt, master_th, 1);
// don't use lw_taskteam after linking. content was swapped
1926 #endif
1927 
1928         // we were called from GNU native code
1929         KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1930         return FALSE;
1931       } else {
1932         KMP_ASSERT2(call_context < fork_context_last,
1933                     "__kmp_fork_call: unknown fork_context parameter");
1934       }
1935 
1936       KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1937       KMP_MB();
1938       return FALSE;
1939     } // if (nthreads == 1)
1940 
1941     // GEH: only modify the executing flag in the case when not serialized
1942     //      serialized case is handled in kmpc_serialized_parallel
1943     KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1944                   "curtask=%p, curtask_max_aclevel=%d\n",
1945                   parent_team->t.t_active_level, master_th,
1946                   master_th->th.th_current_task,
1947                   master_th->th.th_current_task->td_icvs.max_active_levels));
1948     // TODO: GEH - cannot do this assertion because root thread not set up as
1949     // executing
1950     // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1951     master_th->th.th_current_task->td_flags.executing = 0;
1952 
1953     if (!master_th->th.th_teams_microtask || level > teams_level) {
1954       /* Increment our nested depth level */
1955       KMP_ATOMIC_INC(&root->r.r_in_parallel);
1956     }
1957 
1958     // See if we need to make a copy of the ICVs.
1959     int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1960     if ((level + 1 < __kmp_nested_nth.used) &&
1961         (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1962       nthreads_icv = __kmp_nested_nth.nth[level + 1];
1963     } else {
1964       nthreads_icv = 0; // don't update
1965     }
1966 
1967     // Figure out the proc_bind_policy for the new team.
1968     kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1969     kmp_proc_bind_t proc_bind_icv =
1970         proc_bind_default; // proc_bind_default means don't update
1971     if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1972       proc_bind = proc_bind_false;
1973     } else {
1974       if (proc_bind == proc_bind_default) {
1975         // No proc_bind clause specified; use current proc-bind-var for this
1976         // parallel region
1977         proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1978       }
1979       /* else: The proc_bind policy was specified explicitly on parallel clause.
1980          This overrides proc-bind-var for this parallel region, but does not
1981          change proc-bind-var. */
1982       // Figure the value of proc-bind-var for the child threads.
1983       if ((level + 1 < __kmp_nested_proc_bind.used) &&
1984           (__kmp_nested_proc_bind.bind_types[level + 1] !=
1985            master_th->th.th_current_task->td_icvs.proc_bind)) {
1986         proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1987       }
1988     }
1989 
1990     // Reset for next parallel region
1991     master_th->th.th_set_proc_bind = proc_bind_default;
1992 
1993     if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
1994       kmp_internal_control_t new_icvs;
1995       copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
1996       new_icvs.next = NULL;
1997       if (nthreads_icv > 0) {
1998         new_icvs.nproc = nthreads_icv;
1999       }
2000       if (proc_bind_icv != proc_bind_default) {
2001         new_icvs.proc_bind = proc_bind_icv;
2002       }
2003 
2004       /* allocate a new parallel team */
2005       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2006       team = __kmp_allocate_team(root, nthreads, nthreads,
2007 #if OMPT_SUPPORT
2008                                  ompt_parallel_data,
2009 #endif
2010                                  proc_bind, &new_icvs,
2011                                  argc USE_NESTED_HOT_ARG(master_th));
2012     } else {
2013       /* allocate a new parallel team */
2014       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2015       team = __kmp_allocate_team(root, nthreads, nthreads,
2016 #if OMPT_SUPPORT
2017                                  ompt_parallel_data,
2018 #endif
2019                                  proc_bind,
2020                                  &master_th->th.th_current_task->td_icvs,
2021                                  argc USE_NESTED_HOT_ARG(master_th));
2022     }
2023     KF_TRACE(
2024         10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2025 
2026     /* setup the new team */
2027     KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2028     KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2029     KMP_CHECK_UPDATE(team->t.t_ident, loc);
2030     KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2031     KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2032 #if OMPT_SUPPORT
2033     KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2034                           return_address);
2035 #endif
2036     KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2037     // TODO: parent_team->t.t_level == INT_MAX ???
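    // t_level counts every enclosing parallel region, including serialized
    // ones, while t_active_level counts only regions running with more than
    // one thread (these roughly correspond to omp_get_level() and
    // omp_get_active_level()). Both are bumped for a regular parallel region
    // below, but not at the start of a teams construct.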
2038     if (!master_th->th.th_teams_microtask || level > teams_level) {
2039       int new_level = parent_team->t.t_level + 1;
2040       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2041       new_level = parent_team->t.t_active_level + 1;
2042       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2043     } else {
2044       // AC: Do not increase parallel level at start of the teams construct
2045       int new_level = parent_team->t.t_level;
2046       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2047       new_level = parent_team->t.t_active_level;
2048       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2049     }
2050     kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2051     // set master's schedule as new run-time schedule
2052     KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2053 
2054     KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2055     KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2056 
2057     // Update the floating point rounding in the team if required.
2058     propagateFPControl(team);
2059 
2060     if (__kmp_tasking_mode != tskm_immediate_exec) {
2061       // Set master's task team to team's task team. Unless this is hot team, it
2062       // should be NULL.
2063       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2064                        parent_team->t.t_task_team[master_th->th.th_task_state]);
2065       KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
2066                     "%p, new task_team %p / team %p\n",
2067                     __kmp_gtid_from_thread(master_th),
2068                     master_th->th.th_task_team, parent_team,
2069                     team->t.t_task_team[master_th->th.th_task_state], team));
2070 
2071       if (active_level || master_th->th.th_task_team) {
        // Save the master's task_state on the memo stack (growing it if
        // needed)
2073         KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2074         if (master_th->th.th_task_state_top >=
2075             master_th->th.th_task_state_stack_sz) { // increase size
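          // Grow the memo stack geometrically: allocate a stack of twice the
          // current size, copy the saved task states, zero-fill the new
          // entries, and free the old stack.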
2076           kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2077           kmp_uint8 *old_stack, *new_stack;
2078           kmp_uint32 i;
2079           new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2080           for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2081             new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2082           }
2083           for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2084                ++i) { // zero-init rest of stack
2085             new_stack[i] = 0;
2086           }
2087           old_stack = master_th->th.th_task_state_memo_stack;
2088           master_th->th.th_task_state_memo_stack = new_stack;
2089           master_th->th.th_task_state_stack_sz = new_size;
2090           __kmp_free(old_stack);
2091         }
2092         // Store master's task_state on stack
2093         master_th->th
2094             .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2095             master_th->th.th_task_state;
2096         master_th->th.th_task_state_top++;
2097 #if KMP_NESTED_HOT_TEAMS
2098         if (master_th->th.th_hot_teams &&
2099             active_level < __kmp_hot_teams_max_level &&
2100             team == master_th->th.th_hot_teams[active_level].hot_team) {
2101           // Restore master's nested state if nested hot team
2102           master_th->th.th_task_state =
2103               master_th->th
2104                   .th_task_state_memo_stack[master_th->th.th_task_state_top];
2105         } else {
2106 #endif
2107           master_th->th.th_task_state = 0;
2108 #if KMP_NESTED_HOT_TEAMS
2109         }
2110 #endif
2111       }
2112 #if !KMP_NESTED_HOT_TEAMS
2113       KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2114                        (team == root->r.r_hot_team));
2115 #endif
2116     }
2117 
2118     KA_TRACE(
2119         20,
2120         ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2121          gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2122          team->t.t_nproc));
2123     KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2124                      (team->t.t_master_tid == 0 &&
2125                       (team->t.t_parent == root->r.r_root_team ||
2126                        team->t.t_parent->t.t_serialized)));
2127     KMP_MB();
2128 
2129     /* now, setup the arguments */
2130     argv = (void **)team->t.t_argv;
2131     if (ap) {
2132       for (i = argc - 1; i >= 0; --i) {
2133 // TODO: revert workaround for Intel(R) 64 tracker #96
2134 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
2135         void *new_argv = va_arg(*ap, void *);
2136 #else
2137         void *new_argv = va_arg(ap, void *);
2138 #endif
2139         KMP_CHECK_UPDATE(*argv, new_argv);
2140         argv++;
2141       }
2142     } else {
2143       for (i = 0; i < argc; ++i) {
2144         // Get args from parent team for teams construct
2145         KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2146       }
2147     }
2148 
2149     /* now actually fork the threads */
2150     KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2151     if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2152       root->r.r_active = TRUE;
2153 
2154     __kmp_fork_team_threads(root, team, master_th, gtid);
2155     __kmp_setup_icv_copy(team, nthreads,
2156                          &master_th->th.th_current_task->td_icvs, loc);
2157 
2158 #if OMPT_SUPPORT
2159     master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2160 #endif
2161 
2162     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2163 
2164 #if USE_ITT_BUILD
2165     if (team->t.t_active_level == 1 // only report frames at level 1
2166         && !master_th->th.th_teams_microtask) { // not in teams construct
2167 #if USE_ITT_NOTIFY
2168       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2169           (__kmp_forkjoin_frames_mode == 3 ||
2170            __kmp_forkjoin_frames_mode == 1)) {
2171         kmp_uint64 tmp_time = 0;
2172         if (__itt_get_timestamp_ptr)
2173           tmp_time = __itt_get_timestamp();
2174         // Internal fork - report frame begin
2175         master_th->th.th_frame_time = tmp_time;
2176         if (__kmp_forkjoin_frames_mode == 3)
2177           team->t.t_region_time = tmp_time;
2178       } else
2179 // only one notification scheme (either "submit" or "forking/joined", not both)
2180 #endif /* USE_ITT_NOTIFY */
2181           if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2182               __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2183         // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2184         __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2185       }
2186     }
2187 #endif /* USE_ITT_BUILD */
2188 
2189     /* now go on and do the work */
2190     KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2191     KMP_MB();
2192     KF_TRACE(10,
2193              ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2194               root, team, master_th, gtid));
2195 
2196 #if USE_ITT_BUILD
2197     if (__itt_stack_caller_create_ptr) {
2198       team->t.t_stack_id =
2199           __kmp_itt_stack_caller_create(); // create new stack stitching id
2200       // before entering fork barrier
2201     }
2202 #endif /* USE_ITT_BUILD */
2203 
    // AC: skip __kmp_internal_fork for the teams construct, letting only the
    // master threads execute
2206     if (ap) {
2207       __kmp_internal_fork(loc, gtid, team);
2208       KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2209                     "master_th=%p, gtid=%d\n",
2210                     root, team, master_th, gtid));
2211     }
2212 
2213     if (call_context == fork_context_gnu) {
2214       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2215       return TRUE;
2216     }
2217 
2218     /* Invoke microtask for MASTER thread */
2219     KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2220                   team->t.t_id, team->t.t_pkfn));
2221   } // END of timer KMP_fork_call block
2222 
2223 #if KMP_STATS_ENABLED
2224   // If beginning a teams construct, then change thread state
2225   stats_state_e previous_state = KMP_GET_THREAD_STATE();
2226   if (!ap) {
2227     KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2228   }
2229 #endif
2230 
2231   if (!team->t.t_invoke(gtid)) {
2232     KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
2233   }
2234 
2235 #if KMP_STATS_ENABLED
2236   // If was beginning of a teams construct, then reset thread state
2237   if (!ap) {
2238     KMP_SET_THREAD_STATE(previous_state);
2239   }
2240 #endif
2241 
2242   KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2243                 team->t.t_id, team->t.t_pkfn));
2244   KMP_MB(); /* Flush all pending memory write invalidates.  */
2245 
2246   KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2247 
2248 #if OMPT_SUPPORT
2249   if (ompt_enabled.enabled) {
2250     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2251   }
2252 #endif
2253 
2254   return TRUE;
2255 }
2256 
2257 #if OMPT_SUPPORT
2258 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2259                                             kmp_team_t *team) {
2260   // restore state outside the region
2261   thread->th.ompt_thread_info.state =
2262       ((team->t.t_serialized) ? ompt_state_work_serial
2263                               : ompt_state_work_parallel);
2264 }
2265 
2266 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2267                                    kmp_team_t *team, ompt_data_t *parallel_data,
2268                                    int flags, void *codeptr) {
2269   ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2270   if (ompt_enabled.ompt_callback_parallel_end) {
2271     ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2272         parallel_data, &(task_info->task_data), flags, codeptr);
2273   }
2274 
2275   task_info->frame.enter_frame = ompt_data_none;
2276   __kmp_join_restore_state(thread, team);
2277 }
2278 #endif
2279 
2280 void __kmp_join_call(ident_t *loc, int gtid
2281 #if OMPT_SUPPORT
2282                      ,
2283                      enum fork_context_e fork_context
2284 #endif
2285                      ,
2286                      int exit_teams) {
2287   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2288   kmp_team_t *team;
2289   kmp_team_t *parent_team;
2290   kmp_info_t *master_th;
2291   kmp_root_t *root;
2292   int master_active;
2293 
2294   KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2295 
2296   /* setup current data */
2297   master_th = __kmp_threads[gtid];
2298   root = master_th->th.th_root;
2299   team = master_th->th.th_team;
2300   parent_team = team->t.t_parent;
2301 
2302   master_th->th.th_ident = loc;
2303 
2304 #if OMPT_SUPPORT
2305   void *team_microtask = (void *)team->t.t_pkfn;
2306   if (ompt_enabled.enabled) {
2307     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2308   }
2309 #endif
2310 
2311 #if KMP_DEBUG
2312   if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2313     KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2314                   "th_task_team = %p\n",
2315                   __kmp_gtid_from_thread(master_th), team,
2316                   team->t.t_task_team[master_th->th.th_task_state],
2317                   master_th->th.th_task_team));
2318     KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2319                      team->t.t_task_team[master_th->th.th_task_state]);
2320   }
2321 #endif
2322 
2323   if (team->t.t_serialized) {
2324     if (master_th->th.th_teams_microtask) {
2325       // We are in teams construct
2326       int level = team->t.t_level;
2327       int tlevel = master_th->th.th_teams_level;
2328       if (level == tlevel) {
2329         // AC: we haven't incremented it earlier at start of teams construct,
2330         //     so do it here - at the end of teams construct
2331         team->t.t_level++;
2332       } else if (level == tlevel + 1) {
2333         // AC: we are exiting parallel inside teams, need to increment
2334         // serialization in order to restore it in the next call to
2335         // __kmpc_end_serialized_parallel
2336         team->t.t_serialized++;
2337       }
2338     }
2339     __kmpc_end_serialized_parallel(loc, gtid);
2340 
2341 #if OMPT_SUPPORT
2342     if (ompt_enabled.enabled) {
2343       __kmp_join_restore_state(master_th, parent_team);
2344     }
2345 #endif
2346 
2347     return;
2348   }
2349 
2350   master_active = team->t.t_master_active;
2351 
2352   if (!exit_teams) {
2353     // AC: No barrier for internal teams at exit from teams construct.
2354     //     But there is barrier for external team (league).
2355     __kmp_internal_join(loc, gtid, team);
2356   } else {
    master_th->th.th_task_state =
        0; // AC: no tasking in teams (outside of any parallel region)
2359   }
2360 
2361   KMP_MB();
2362 
2363 #if OMPT_SUPPORT
2364   ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2365   void *codeptr = team->t.ompt_team_info.master_return_address;
2366 #endif
2367 
2368 #if USE_ITT_BUILD
2369   if (__itt_stack_caller_create_ptr) {
2370     __kmp_itt_stack_caller_destroy(
2371         (__itt_caller)team->t
2372             .t_stack_id); // destroy the stack stitching id after join barrier
2373   }
2374 
2375   // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2376   if (team->t.t_active_level == 1 &&
2377       !master_th->th.th_teams_microtask) { /* not in teams construct */
2378     master_th->th.th_ident = loc;
2379     // only one notification scheme (either "submit" or "forking/joined", not
2380     // both)
2381     if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2382         __kmp_forkjoin_frames_mode == 3)
2383       __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2384                              master_th->th.th_frame_time, 0, loc,
2385                              master_th->th.th_team_nproc, 1);
2386     else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2387              !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2388       __kmp_itt_region_joined(gtid);
2389   } // active_level == 1
2390 #endif /* USE_ITT_BUILD */
2391 
2392   if (master_th->th.th_teams_microtask && !exit_teams &&
2393       team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2394       team->t.t_level == master_th->th.th_teams_level + 1) {
// AC: We need to leave the team structure intact at the end of a parallel
// region inside the teams construct, so that the same (hot) team is reused
// by the next parallel region; only adjust the nesting levels
2398 #if OMPT_SUPPORT
2399     ompt_data_t ompt_parallel_data = ompt_data_none;
2400     if (ompt_enabled.enabled) {
2401       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2402       if (ompt_enabled.ompt_callback_implicit_task) {
2403         int ompt_team_size = team->t.t_nproc;
2404         ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2405             ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2406             OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2407       }
2408       task_info->frame.exit_frame = ompt_data_none;
2409       task_info->task_data = ompt_data_none;
2410       ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2411       __ompt_lw_taskteam_unlink(master_th);
2412     }
2413 #endif
2414     /* Decrement our nested depth level */
2415     team->t.t_level--;
2416     team->t.t_active_level--;
2417     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2418 
2419     // Restore number of threads in the team if needed. This code relies on
2420     // the proper adjustment of th_teams_size.nth after the fork in
2421     // __kmp_teams_master on each teams master in the case that
2422     // __kmp_reserve_threads reduced it.
2423     if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2424       int old_num = master_th->th.th_team_nproc;
2425       int new_num = master_th->th.th_teams_size.nth;
2426       kmp_info_t **other_threads = team->t.t_threads;
2427       team->t.t_nproc = new_num;
2428       for (int i = 0; i < old_num; ++i) {
2429         other_threads[i]->th.th_team_nproc = new_num;
2430       }
2431       // Adjust states of non-used threads of the team
2432       for (int i = old_num; i < new_num; ++i) {
2433         // Re-initialize thread's barrier data.
2434         KMP_DEBUG_ASSERT(other_threads[i]);
2435         kmp_balign_t *balign = other_threads[i]->th.th_bar;
2436         for (int b = 0; b < bs_last_barrier; ++b) {
2437           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2438           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2439 #if USE_DEBUGGER
2440           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2441 #endif
2442         }
2443         if (__kmp_tasking_mode != tskm_immediate_exec) {
2444           // Synchronize thread's task state
2445           other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2446         }
2447       }
2448     }
2449 
2450 #if OMPT_SUPPORT
2451     if (ompt_enabled.enabled) {
2452       __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2453                       OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2454     }
2455 #endif
2456 
2457     return;
2458   }
2459 
2460   /* do cleanup and restore the parent team */
2461   master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2462   master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2463 
2464   master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2465 
2466   /* jc: The following lock has instructions with REL and ACQ semantics,
2467      separating the parallel user code called in this parallel region
2468      from the serial user code called after this function returns. */
2469   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2470 
2471   if (!master_th->th.th_teams_microtask ||
2472       team->t.t_level > master_th->th.th_teams_level) {
2473     /* Decrement our nested depth level */
2474     KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2475   }
2476   KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2477 
2478 #if OMPT_SUPPORT
2479   if (ompt_enabled.enabled) {
2480     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2481     if (ompt_enabled.ompt_callback_implicit_task) {
2482       int flags = (team_microtask == (void *)__kmp_teams_master)
2483                       ? ompt_task_initial
2484                       : ompt_task_implicit;
2485       int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2486       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2487           ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2488           OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2489     }
2490     task_info->frame.exit_frame = ompt_data_none;
2491     task_info->task_data = ompt_data_none;
2492   }
2493 #endif
2494 
2495   KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2496                 master_th, team));
2497   __kmp_pop_current_task_from_thread(master_th);
2498 
2499 #if KMP_AFFINITY_SUPPORTED
2500   // Restore master thread's partition.
2501   master_th->th.th_first_place = team->t.t_first_place;
2502   master_th->th.th_last_place = team->t.t_last_place;
2503 #endif // KMP_AFFINITY_SUPPORTED
2504   master_th->th.th_def_allocator = team->t.t_def_allocator;
2505 
2506   updateHWFPControl(team);
2507 
2508   if (root->r.r_active != master_active)
2509     root->r.r_active = master_active;
2510 
2511   __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2512                             master_th)); // this will free worker threads
2513 
2514   /* this race was fun to find. make sure the following is in the critical
2515      region otherwise assertions may fail occasionally since the old team may be
2516      reallocated and the hierarchy appears inconsistent. it is actually safe to
2517      run and won't cause any bugs, but will cause those assertion failures. it's
2518      only one deref&assign so might as well put this in the critical region */
2519   master_th->th.th_team = parent_team;
2520   master_th->th.th_team_nproc = parent_team->t.t_nproc;
2521   master_th->th.th_team_master = parent_team->t.t_threads[0];
2522   master_th->th.th_team_serialized = parent_team->t.t_serialized;
2523 
2524   /* restore serialized team, if need be */
2525   if (parent_team->t.t_serialized &&
2526       parent_team != master_th->th.th_serial_team &&
2527       parent_team != root->r.r_root_team) {
2528     __kmp_free_team(root,
2529                     master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2530     master_th->th.th_serial_team = parent_team;
2531   }
2532 
2533   if (__kmp_tasking_mode != tskm_immediate_exec) {
2534     if (master_th->th.th_task_state_top >
2535         0) { // Restore task state from memo stack
2536       KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2537       // Remember master's state if we re-use this nested hot team
2538       master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2539           master_th->th.th_task_state;
2540       --master_th->th.th_task_state_top; // pop
2541       // Now restore state at this level
2542       master_th->th.th_task_state =
2543           master_th->th
2544               .th_task_state_memo_stack[master_th->th.th_task_state_top];
2545     }
2546     // Copy the task team from the parent team to the master thread
2547     master_th->th.th_task_team =
2548         parent_team->t.t_task_team[master_th->th.th_task_state];
2549     KA_TRACE(20,
2550              ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2551               __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2552               parent_team));
2553   }
2554 
2555   // TODO: GEH - cannot do this assertion because root thread not set up as
2556   // executing
2557   // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2558   master_th->th.th_current_task->td_flags.executing = 1;
2559 
2560   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2561 
2562 #if OMPT_SUPPORT
2563   int flags =
2564       OMPT_INVOKER(fork_context) |
2565       ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2566                                                       : ompt_parallel_team);
2567   if (ompt_enabled.enabled) {
2568     __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2569                     codeptr);
2570   }
2571 #endif
2572 
2573   KMP_MB();
2574   KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2575 }
2576 
2577 /* Check whether we should push an internal control record onto the
2578    serial team stack.  If so, do it.  */
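/* For illustration: a call such as omp_set_num_threads() made from inside a
   nested, serialized parallel region must only affect that nesting level.
   Pushing the current ICVs here, once per serial nesting level, lets the
   runtime restore the enclosing level's settings when the serialized region
   ends. */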
2579 void __kmp_save_internal_controls(kmp_info_t *thread) {
2580 
2581   if (thread->th.th_team != thread->th.th_serial_team) {
2582     return;
2583   }
2584   if (thread->th.th_team->t.t_serialized > 1) {
2585     int push = 0;
2586 
2587     if (thread->th.th_team->t.t_control_stack_top == NULL) {
2588       push = 1;
2589     } else {
2590       if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2591           thread->th.th_team->t.t_serialized) {
2592         push = 1;
2593       }
2594     }
2595     if (push) { /* push a record on the serial team's stack */
2596       kmp_internal_control_t *control =
2597           (kmp_internal_control_t *)__kmp_allocate(
2598               sizeof(kmp_internal_control_t));
2599 
2600       copy_icvs(control, &thread->th.th_current_task->td_icvs);
2601 
2602       control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2603 
2604       control->next = thread->th.th_team->t.t_control_stack_top;
2605       thread->th.th_team->t.t_control_stack_top = control;
2606     }
2607   }
2608 }
2609 
2610 /* Changes set_nproc */
2611 void __kmp_set_num_threads(int new_nth, int gtid) {
2612   kmp_info_t *thread;
2613   kmp_root_t *root;
2614 
2615   KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2616   KMP_DEBUG_ASSERT(__kmp_init_serial);
2617 
2618   if (new_nth < 1)
2619     new_nth = 1;
2620   else if (new_nth > __kmp_max_nth)
2621     new_nth = __kmp_max_nth;
2622 
2623   KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2624   thread = __kmp_threads[gtid];
2625   if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2626     return; // nothing to do
2627 
2628   __kmp_save_internal_controls(thread);
2629 
2630   set__nproc(thread, new_nth);
2631 
2632   // If this omp_set_num_threads() call will cause the hot team size to be
2633   // reduced (in the absence of a num_threads clause), then reduce it now,
2634   // rather than waiting for the next parallel region.
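  // Illustrative example: if the previous parallel region ran on 8 threads
  // and the sequential part then calls omp_set_num_threads(2), the 6 surplus
  // hot team threads are released back to the pool here rather than lingering
  // until the next fork.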
2635   root = thread->th.th_root;
2636   if (__kmp_init_parallel && (!root->r.r_active) &&
2637       (root->r.r_hot_team->t.t_nproc > new_nth)
2638 #if KMP_NESTED_HOT_TEAMS
2639       && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2640 #endif
2641       ) {
2642     kmp_team_t *hot_team = root->r.r_hot_team;
2643     int f;
2644 
2645     __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2646 
2647     // Release the extra threads we don't need any more.
2648     for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2649       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2650       if (__kmp_tasking_mode != tskm_immediate_exec) {
2651         // When decreasing team size, threads no longer in the team should unref
2652         // task team.
2653         hot_team->t.t_threads[f]->th.th_task_team = NULL;
2654       }
2655       __kmp_free_thread(hot_team->t.t_threads[f]);
2656       hot_team->t.t_threads[f] = NULL;
2657     }
2658     hot_team->t.t_nproc = new_nth;
2659 #if KMP_NESTED_HOT_TEAMS
2660     if (thread->th.th_hot_teams) {
2661       KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2662       thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2663     }
2664 #endif
2665 
2666     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2667 
2668     // Update the t_nproc field in the threads that are still active.
2669     for (f = 0; f < new_nth; f++) {
2670       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2671       hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2672     }
    // Special flag indicating the size change came from omp_set_num_threads()
2674     hot_team->t.t_size_changed = -1;
2675   }
2676 }
2677 
2678 /* Changes max_active_levels */
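/* Illustrative effect: with max_active_levels set to 1, only the outermost
   parallel region can be active; any deeper region is serialized by the
   t_active_level check in __kmp_fork_call. A value of 0 serializes every
   parallel region. */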
2679 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2680   kmp_info_t *thread;
2681 
2682   KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2683                 "%d = (%d)\n",
2684                 gtid, max_active_levels));
2685   KMP_DEBUG_ASSERT(__kmp_init_serial);
2686 
2687   // validate max_active_levels
2688   if (max_active_levels < 0) {
2689     KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2690     // We ignore this call if the user has specified a negative value.
2691     // The current setting won't be changed. The last valid setting will be
2692     // used. A warning will be issued (if warnings are allowed as controlled by
2693     // the KMP_WARNINGS env var).
2694     KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2695                   "max_active_levels for thread %d = (%d)\n",
2696                   gtid, max_active_levels));
2697     return;
2698   }
2699   if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
    // max_active_levels is OK: it is within the valid range
    // [0; KMP_MAX_ACTIVE_LEVELS_LIMIT].
    // A zero value is allowed (implementation-defined behavior).
2703   } else {
2704     KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2705                 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2706     max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
    // The current upper limit is MAX_INT (implementation-defined behavior).
    // If the input exceeds the upper limit, it is corrected to the upper
    // limit (implementation-defined behavior).
    // In practice, the flow should never get here while the limit is MAX_INT.
2711   }
2712   KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2713                 "max_active_levels for thread %d = (%d)\n",
2714                 gtid, max_active_levels));
2715 
2716   thread = __kmp_threads[gtid];
2717 
2718   __kmp_save_internal_controls(thread);
2719 
2720   set__max_active_levels(thread, max_active_levels);
2721 }
2722 
2723 /* Gets max_active_levels */
2724 int __kmp_get_max_active_levels(int gtid) {
2725   kmp_info_t *thread;
2726 
2727   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2728   KMP_DEBUG_ASSERT(__kmp_init_serial);
2729 
2730   thread = __kmp_threads[gtid];
2731   KMP_DEBUG_ASSERT(thread->th.th_current_task);
2732   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2733                 "curtask_maxaclevel=%d\n",
2734                 gtid, thread->th.th_current_task,
2735                 thread->th.th_current_task->td_icvs.max_active_levels));
2736   return thread->th.th_current_task->td_icvs.max_active_levels;
2737 }
2738 
2739 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2740 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2741 
2742 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2743 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2744   kmp_info_t *thread;
2745   kmp_sched_t orig_kind;
2746   //    kmp_team_t *team;
2747 
2748   KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2749                 gtid, (int)kind, chunk));
2750   KMP_DEBUG_ASSERT(__kmp_init_serial);
2751 
2752   // Check if the kind parameter is valid, correct if needed.
2753   // Valid parameters should fit in one of two intervals - standard or extended:
2754   //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2755   // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
2756   orig_kind = kind;
2757   kind = __kmp_sched_without_mods(kind);
2758 
2759   if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2760       (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2761     // TODO: Hint needs attention in case we change the default schedule.
2762     __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2763               KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2764               __kmp_msg_null);
2765     kind = kmp_sched_default;
2766     chunk = 0; // ignore chunk value in case of bad kind
2767   }
2768 
2769   thread = __kmp_threads[gtid];
2770 
2771   __kmp_save_internal_controls(thread);
2772 
2773   if (kind < kmp_sched_upper_std) {
2774     if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
      // differentiate static chunked vs. unchunked: chunk should be invalid
      // to indicate the unchunked schedule (which is the default)
2777       thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2778     } else {
2779       thread->th.th_current_task->td_icvs.sched.r_sched_type =
2780           __kmp_sch_map[kind - kmp_sched_lower - 1];
2781     }
2782   } else {
2783     //    __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2784     //    kmp_sched_lower - 2 ];
2785     thread->th.th_current_task->td_icvs.sched.r_sched_type =
2786         __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2787                       kmp_sched_lower - 2];
2788   }
2789   __kmp_sched_apply_mods_intkind(
2790       orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2791   if (kind == kmp_sched_auto || chunk < 1) {
2792     // ignore parameter chunk for schedule auto
2793     thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2794   } else {
2795     thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2796   }
2797 }
2798 
2799 /* Gets def_sched_var ICV values */
2800 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2801   kmp_info_t *thread;
2802   enum sched_type th_type;
2803 
2804   KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2805   KMP_DEBUG_ASSERT(__kmp_init_serial);
2806 
2807   thread = __kmp_threads[gtid];
2808 
2809   th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2810   switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2811   case kmp_sch_static:
2812   case kmp_sch_static_greedy:
2813   case kmp_sch_static_balanced:
2814     *kind = kmp_sched_static;
2815     __kmp_sched_apply_mods_stdkind(kind, th_type);
    *chunk = 0; // chunk was not set; indicate this fact via a zero value
2817     return;
2818   case kmp_sch_static_chunked:
2819     *kind = kmp_sched_static;
2820     break;
2821   case kmp_sch_dynamic_chunked:
2822     *kind = kmp_sched_dynamic;
2823     break;
2824   case kmp_sch_guided_chunked:
2825   case kmp_sch_guided_iterative_chunked:
2826   case kmp_sch_guided_analytical_chunked:
2827     *kind = kmp_sched_guided;
2828     break;
2829   case kmp_sch_auto:
2830     *kind = kmp_sched_auto;
2831     break;
2832   case kmp_sch_trapezoidal:
2833     *kind = kmp_sched_trapezoidal;
2834     break;
2835 #if KMP_STATIC_STEAL_ENABLED
2836   case kmp_sch_static_steal:
2837     *kind = kmp_sched_static_steal;
2838     break;
2839 #endif
2840   default:
2841     KMP_FATAL(UnknownSchedulingType, th_type);
2842   }
2843 
2844   __kmp_sched_apply_mods_stdkind(kind, th_type);
2845   *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2846 }
2847 
2848 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2849 
2850   int ii, dd;
2851   kmp_team_t *team;
2852   kmp_info_t *thr;
2853 
2854   KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2855   KMP_DEBUG_ASSERT(__kmp_init_serial);
2856 
2857   // validate level
2858   if (level == 0)
2859     return 0;
2860   if (level < 0)
2861     return -1;
2862   thr = __kmp_threads[gtid];
2863   team = thr->th.th_team;
2864   ii = team->t.t_level;
2865   if (level > ii)
2866     return -1;
2867 
2868   if (thr->th.th_teams_microtask) {
    // AC: we are in a teams region; multiple nested teams have the same level
2870     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2871     if (level <=
2872         tlevel) { // otherwise usual algorithm works (will not touch the teams)
2873       KMP_DEBUG_ASSERT(ii >= tlevel);
      // AC: since we need to step over the teams league, we artificially
      // increase ii
2876       if (ii == tlevel) {
2877         ii += 2; // three teams have same level
2878       } else {
2879         ii++; // two teams have same level
2880       }
2881     }
2882   }
2883 
2884   if (ii == level)
2885     return __kmp_tid_from_gtid(gtid);
2886 
2887   dd = team->t.t_serialized;
2888   level++;
2889   while (ii > level) {
2890     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2891     }
2892     if ((team->t.t_serialized) && (!dd)) {
2893       team = team->t.t_parent;
2894       continue;
2895     }
2896     if (ii > level) {
2897       team = team->t.t_parent;
2898       dd = team->t.t_serialized;
2899       ii--;
2900     }
2901   }
2902 
2903   return (dd > 1) ? (0) : (team->t.t_master_tid);
2904 }
2905 
2906 int __kmp_get_team_size(int gtid, int level) {
2907 
2908   int ii, dd;
2909   kmp_team_t *team;
2910   kmp_info_t *thr;
2911 
2912   KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2913   KMP_DEBUG_ASSERT(__kmp_init_serial);
2914 
2915   // validate level
2916   if (level == 0)
2917     return 1;
2918   if (level < 0)
2919     return -1;
2920   thr = __kmp_threads[gtid];
2921   team = thr->th.th_team;
2922   ii = team->t.t_level;
2923   if (level > ii)
2924     return -1;
2925 
2926   if (thr->th.th_teams_microtask) {
    // AC: we are in a teams region; multiple nested teams have the same level
2928     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2929     if (level <=
2930         tlevel) { // otherwise usual algorithm works (will not touch the teams)
2931       KMP_DEBUG_ASSERT(ii >= tlevel);
      // AC: since we need to step over the teams league, we artificially
      // increase ii
2934       if (ii == tlevel) {
2935         ii += 2; // three teams have same level
2936       } else {
2937         ii++; // two teams have same level
2938       }
2939     }
2940   }
2941 
2942   while (ii > level) {
2943     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2944     }
2945     if (team->t.t_serialized && (!dd)) {
2946       team = team->t.t_parent;
2947       continue;
2948     }
2949     if (ii > level) {
2950       team = team->t.t_parent;
2951       ii--;
2952     }
2953   }
2954 
2955   return team->t.t_nproc;
2956 }
2957 
2958 kmp_r_sched_t __kmp_get_schedule_global() {
  // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
  // independently, so the updated schedule can be obtained here.
2962 
2963   kmp_r_sched_t r_sched;
2964 
2965   // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
2966   // __kmp_guided. __kmp_sched should keep original value, so that user can set
2967   // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
2968   // different roots (even in OMP 2.5)
2969   enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
2970   enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
2971   if (s == kmp_sch_static) {
2972     // replace STATIC with more detailed schedule (balanced or greedy)
2973     r_sched.r_sched_type = __kmp_static;
2974   } else if (s == kmp_sch_guided_chunked) {
2975     // replace GUIDED with more detailed schedule (iterative or analytical)
2976     r_sched.r_sched_type = __kmp_guided;
2977   } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2978     r_sched.r_sched_type = __kmp_sched;
2979   }
2980   SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
2981 
2982   if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
    // __kmp_chunk may be wrong here (if it was never set)
2984     r_sched.chunk = KMP_DEFAULT_CHUNK;
2985   } else {
2986     r_sched.chunk = __kmp_chunk;
2987   }
2988 
2989   return r_sched;
2990 }
2991 
/* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
2993    at least argc number of *t_argv entries for the requested team. */
2994 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
2995 
2996   KMP_DEBUG_ASSERT(team);
2997   if (!realloc || argc > team->t.t_max_argc) {
2998 
2999     KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3000                    "current entries=%d\n",
3001                    team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3002     /* if previously allocated heap space for args, free them */
3003     if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3004       __kmp_free((void *)team->t.t_argv);
3005 
3006     if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3007       /* use unused space in the cache line for arguments */
3008       team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3009       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3010                      "argv entries\n",
3011                      team->t.t_id, team->t.t_max_argc));
3012       team->t.t_argv = &team->t.t_inline_argv[0];
3013       if (__kmp_storage_map) {
3014         __kmp_print_storage_map_gtid(
3015             -1, &team->t.t_inline_argv[0],
3016             &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3017             (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3018             team->t.t_id);
3019       }
3020     } else {
3021       /* allocate space for arguments in the heap */
3022       team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3023                                ? KMP_MIN_MALLOC_ARGV_ENTRIES
3024                                : 2 * argc;
3025       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3026                      "argv entries\n",
3027                      team->t.t_id, team->t.t_max_argc));
3028       team->t.t_argv =
3029           (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3030       if (__kmp_storage_map) {
3031         __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3032                                      &team->t.t_argv[team->t.t_max_argc],
3033                                      sizeof(void *) * team->t.t_max_argc,
3034                                      "team_%d.t_argv", team->t.t_id);
3035       }
3036     }
3037   }
3038 }
3039 
3040 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3041   int i;
3042   int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3043   team->t.t_threads =
3044       (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3045   team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3046       sizeof(dispatch_shared_info_t) * num_disp_buff);
3047   team->t.t_dispatch =
3048       (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3049   team->t.t_implicit_task_taskdata =
3050       (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3051   team->t.t_max_nproc = max_nth;
3052 
3053   /* setup dispatch buffers */
3054   for (i = 0; i < num_disp_buff; ++i) {
3055     team->t.t_disp_buffer[i].buffer_index = i;
3056     team->t.t_disp_buffer[i].doacross_buf_idx = i;
3057   }
3058 }
3059 
3060 static void __kmp_free_team_arrays(kmp_team_t *team) {
3061   /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3062   int i;
3063   for (i = 0; i < team->t.t_max_nproc; ++i) {
3064     if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3065       __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3066       team->t.t_dispatch[i].th_disp_buffer = NULL;
3067     }
3068   }
3069 #if KMP_USE_HIER_SCHED
3070   __kmp_dispatch_free_hierarchies(team);
3071 #endif
3072   __kmp_free(team->t.t_threads);
3073   __kmp_free(team->t.t_disp_buffer);
3074   __kmp_free(team->t.t_dispatch);
3075   __kmp_free(team->t.t_implicit_task_taskdata);
3076   team->t.t_threads = NULL;
3077   team->t.t_disp_buffer = NULL;
3078   team->t.t_dispatch = NULL;
3079   team->t.t_implicit_task_taskdata = 0;
3080 }
3081 
3082 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3083   kmp_info_t **oldThreads = team->t.t_threads;
3084 
3085   __kmp_free(team->t.t_disp_buffer);
3086   __kmp_free(team->t.t_dispatch);
3087   __kmp_free(team->t.t_implicit_task_taskdata);
3088   __kmp_allocate_team_arrays(team, max_nth);
3089 
3090   KMP_MEMCPY(team->t.t_threads, oldThreads,
3091              team->t.t_nproc * sizeof(kmp_info_t *));
3092 
3093   __kmp_free(oldThreads);
3094 }
3095 
3096 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3097 
3098   kmp_r_sched_t r_sched =
3099       __kmp_get_schedule_global(); // get current state of scheduling globals
3100 
3101   KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3102 
3103   kmp_internal_control_t g_icvs = {
3104     0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3105     (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3106     // adjustment of threads (per thread)
3107     (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3108     // whether blocktime is explicitly set
3109     __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3110 #if KMP_USE_MONITOR
3111     __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3112 // intervals
3113 #endif
3114     __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3115     // next parallel region (per thread)
3116     // (use a max ub on value if __kmp_parallel_initialize not called yet)
3117     __kmp_cg_max_nth, // int thread_limit;
3118     __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3119     // for max_active_levels
3120     r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3121     // {sched,chunk} pair
3122     __kmp_nested_proc_bind.bind_types[0],
3123     __kmp_default_device,
3124     NULL // struct kmp_internal_control *next;
3125   };
3126 
3127   return g_icvs;
3128 }
3129 
3130 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3131 
3132   kmp_internal_control_t gx_icvs;
3133   gx_icvs.serial_nesting_level =
3134       0; // probably =team->t.t_serial like in save_inter_controls
3135   copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3136   gx_icvs.next = NULL;
3137 
3138   return gx_icvs;
3139 }
3140 
3141 static void __kmp_initialize_root(kmp_root_t *root) {
3142   int f;
3143   kmp_team_t *root_team;
3144   kmp_team_t *hot_team;
3145   int hot_team_max_nth;
3146   kmp_r_sched_t r_sched =
3147       __kmp_get_schedule_global(); // get current state of scheduling globals
3148   kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3149   KMP_DEBUG_ASSERT(root);
3150   KMP_ASSERT(!root->r.r_begin);
3151 
3152   /* setup the root state structure */
3153   __kmp_init_lock(&root->r.r_begin_lock);
3154   root->r.r_begin = FALSE;
3155   root->r.r_active = FALSE;
3156   root->r.r_in_parallel = 0;
3157   root->r.r_blocktime = __kmp_dflt_blocktime;
3158 
3159   /* setup the root team for this task */
3160   /* allocate the root team structure */
3161   KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3162 
3163   root_team =
3164       __kmp_allocate_team(root,
3165                           1, // new_nproc
3166                           1, // max_nproc
3167 #if OMPT_SUPPORT
3168                           ompt_data_none, // root parallel id
3169 #endif
3170                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3171                           0 // argc
3172                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3173                           );
3174 #if USE_DEBUGGER
3175   // Non-NULL value should be assigned to make the debugger display the root
3176   // team.
3177   TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3178 #endif
3179 
3180   KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3181 
3182   root->r.r_root_team = root_team;
3183   root_team->t.t_control_stack_top = NULL;
3184 
3185   /* initialize root team */
3186   root_team->t.t_threads[0] = NULL;
3187   root_team->t.t_nproc = 1;
3188   root_team->t.t_serialized = 1;
3189   // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3190   root_team->t.t_sched.sched = r_sched.sched;
3191   KA_TRACE(
3192       20,
3193       ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3194        root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3195 
3196   /* setup the  hot team for this task */
3197   /* allocate the hot team structure */
3198   KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3199 
3200   hot_team =
3201       __kmp_allocate_team(root,
3202                           1, // new_nproc
3203                           __kmp_dflt_team_nth_ub * 2, // max_nproc
3204 #if OMPT_SUPPORT
3205                           ompt_data_none, // root parallel id
3206 #endif
3207                           __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3208                           0 // argc
3209                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3210                           );
3211   KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3212 
3213   root->r.r_hot_team = hot_team;
3214   root_team->t.t_control_stack_top = NULL;
3215 
3216   /* first-time initialization */
3217   hot_team->t.t_parent = root_team;
3218 
3219   /* initialize hot team */
3220   hot_team_max_nth = hot_team->t.t_max_nproc;
3221   for (f = 0; f < hot_team_max_nth; ++f) {
3222     hot_team->t.t_threads[f] = NULL;
3223   }
3224   hot_team->t.t_nproc = 1;
3225   // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3226   hot_team->t.t_sched.sched = r_sched.sched;
3227   hot_team->t.t_size_changed = 0;
3228 }
3229 
3230 #ifdef KMP_DEBUG
3231 
3232 typedef struct kmp_team_list_item {
3233   kmp_team_p const *entry;
3234   struct kmp_team_list_item *next;
3235 } kmp_team_list_item_t;
3236 typedef kmp_team_list_item_t *kmp_team_list_t;
3237 
3238 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3239     kmp_team_list_t list, // List of teams.
3240     kmp_team_p const *team // Team to add.
3241     ) {
3242 
3243   // List must terminate with item where both entry and next are NULL.
3244   // Team is added to the list only once.
3245   // List is sorted in ascending order by team id.
3246   // Team id is *not* a key.
3247 
3248   kmp_team_list_t l;
3249 
3250   KMP_DEBUG_ASSERT(list != NULL);
3251   if (team == NULL) {
3252     return;
3253   }
3254 
3255   __kmp_print_structure_team_accum(list, team->t.t_parent);
3256   __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3257 
3258   // Search list for the team.
3259   l = list;
3260   while (l->next != NULL && l->entry != team) {
3261     l = l->next;
3262   }
3263   if (l->next != NULL) {
3264     return; // Team has been added before, exit.
3265   }
3266 
3267   // Team is not found. Search list again for insertion point.
3268   l = list;
3269   while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3270     l = l->next;
3271   }
3272 
3273   // Insert team.
3274   {
3275     kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3276         sizeof(kmp_team_list_item_t));
3277     *item = *l;
3278     l->entry = team;
3279     l->next = item;
3280   }
3281 }
3282 
3283 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team
3284 
3285                                        ) {
3286   __kmp_printf("%s", title);
3287   if (team != NULL) {
3288     __kmp_printf("%2x %p\n", team->t.t_id, team);
3289   } else {
3290     __kmp_printf(" - (nil)\n");
3291   }
3292 }
3293 
3294 static void __kmp_print_structure_thread(char const *title,
3295                                          kmp_info_p const *thread) {
3296   __kmp_printf("%s", title);
3297   if (thread != NULL) {
3298     __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3299   } else {
3300     __kmp_printf(" - (nil)\n");
3301   }
3302 }
3303 
3304 void __kmp_print_structure(void) {
3305 
3306   kmp_team_list_t list;
3307 
3308   // Initialize list of teams.
3309   list =
3310       (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3311   list->entry = NULL;
3312   list->next = NULL;
3313 
3314   __kmp_printf("\n------------------------------\nGlobal Thread "
3315                "Table\n------------------------------\n");
3316   {
3317     int gtid;
3318     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3319       __kmp_printf("%2d", gtid);
3320       if (__kmp_threads != NULL) {
3321         __kmp_printf(" %p", __kmp_threads[gtid]);
3322       }
3323       if (__kmp_root != NULL) {
3324         __kmp_printf(" %p", __kmp_root[gtid]);
3325       }
3326       __kmp_printf("\n");
3327     }
3328   }
3329 
3330   // Print out __kmp_threads array.
3331   __kmp_printf("\n------------------------------\nThreads\n--------------------"
3332                "----------\n");
3333   if (__kmp_threads != NULL) {
3334     int gtid;
3335     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3336       kmp_info_t const *thread = __kmp_threads[gtid];
3337       if (thread != NULL) {
3338         __kmp_printf("GTID %2d %p:\n", gtid, thread);
3339         __kmp_printf("    Our Root:        %p\n", thread->th.th_root);
3340         __kmp_print_structure_team("    Our Team:     ", thread->th.th_team);
3341         __kmp_print_structure_team("    Serial Team:  ",
3342                                    thread->th.th_serial_team);
3343         __kmp_printf("    Threads:      %2d\n", thread->th.th_team_nproc);
3344         __kmp_print_structure_thread("    Master:       ",
3345                                      thread->th.th_team_master);
3346         __kmp_printf("    Serialized?:  %2d\n", thread->th.th_team_serialized);
3347         __kmp_printf("    Set NProc:    %2d\n", thread->th.th_set_nproc);
3348         __kmp_printf("    Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3349         __kmp_print_structure_thread("    Next in pool: ",
3350                                      thread->th.th_next_pool);
3351         __kmp_printf("\n");
3352         __kmp_print_structure_team_accum(list, thread->th.th_team);
3353         __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3354       }
3355     }
3356   } else {
3357     __kmp_printf("Threads array is not allocated.\n");
3358   }
3359 
3360   // Print out __kmp_root array.
3361   __kmp_printf("\n------------------------------\nUbers\n----------------------"
3362                "--------\n");
3363   if (__kmp_root != NULL) {
3364     int gtid;
3365     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3366       kmp_root_t const *root = __kmp_root[gtid];
3367       if (root != NULL) {
3368         __kmp_printf("GTID %2d %p:\n", gtid, root);
3369         __kmp_print_structure_team("    Root Team:    ", root->r.r_root_team);
3370         __kmp_print_structure_team("    Hot Team:     ", root->r.r_hot_team);
3371         __kmp_print_structure_thread("    Uber Thread:  ",
3372                                      root->r.r_uber_thread);
3373         __kmp_printf("    Active?:      %2d\n", root->r.r_active);
3374         __kmp_printf("    In Parallel:  %2d\n",
3375                      KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3376         __kmp_printf("\n");
3377         __kmp_print_structure_team_accum(list, root->r.r_root_team);
3378         __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3379       }
3380     }
3381   } else {
3382     __kmp_printf("Ubers array is not allocated.\n");
3383   }
3384 
3385   __kmp_printf("\n------------------------------\nTeams\n----------------------"
3386                "--------\n");
3387   while (list->next != NULL) {
3388     kmp_team_p const *team = list->entry;
3389     int i;
3390     __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3391     __kmp_print_structure_team("    Parent Team:      ", team->t.t_parent);
3392     __kmp_printf("    Master TID:       %2d\n", team->t.t_master_tid);
3393     __kmp_printf("    Max threads:      %2d\n", team->t.t_max_nproc);
3394     __kmp_printf("    Levels of serial: %2d\n", team->t.t_serialized);
3395     __kmp_printf("    Number threads:   %2d\n", team->t.t_nproc);
3396     for (i = 0; i < team->t.t_nproc; ++i) {
3397       __kmp_printf("    Thread %2d:      ", i);
3398       __kmp_print_structure_thread("", team->t.t_threads[i]);
3399     }
3400     __kmp_print_structure_team("    Next in pool:     ", team->t.t_next_pool);
3401     __kmp_printf("\n");
3402     list = list->next;
3403   }
3404 
3405   // Print out __kmp_thread_pool and __kmp_team_pool.
3406   __kmp_printf("\n------------------------------\nPools\n----------------------"
3407                "--------\n");
3408   __kmp_print_structure_thread("Thread pool:          ",
3409                                CCAST(kmp_info_t *, __kmp_thread_pool));
3410   __kmp_print_structure_team("Team pool:            ",
3411                              CCAST(kmp_team_t *, __kmp_team_pool));
3412   __kmp_printf("\n");
3413 
3414   // Free team list.
3415   while (list != NULL) {
3416     kmp_team_list_item_t *item = list;
3417     list = list->next;
3418     KMP_INTERNAL_FREE(item);
3419   }
3420 }
3421 
3422 #endif
3423 
3424 //---------------------------------------------------------------------------
3425 //  Stuff for per-thread fast random number generator
3426 //  Table of primes
3427 static const unsigned __kmp_primes[] = {
3428     0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3429     0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3430     0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3431     0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3432     0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3433     0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3434     0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3435     0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3436     0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3437     0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3438     0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3439 
3440 //---------------------------------------------------------------------------
3441 //  __kmp_get_random: Get a random number using a linear congruential method.
3442 unsigned short __kmp_get_random(kmp_info_t *thread) {
3443   unsigned x = thread->th.th_x;
3444   unsigned short r = x >> 16;
3445 
3446   thread->th.th_x = x * thread->th.th_a + 1;
3447 
3448   KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3449                 thread->th.th_info.ds.ds_tid, r));
3450 
3451   return r;
3452 }
3453 //--------------------------------------------------------
3454 // __kmp_init_random: Initialize a random number generator
3455 void __kmp_init_random(kmp_info_t *thread) {
3456   unsigned seed = thread->th.th_info.ds.ds_tid;
3457 
3458   thread->th.th_a =
3459       __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3460   thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3461   KA_TRACE(30,
3462            ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3463 }
3464 
3465 #if KMP_OS_WINDOWS
/* Reclaim array entries for root threads that are already dead; returns the
 * number reclaimed. */
3468 static int __kmp_reclaim_dead_roots(void) {
3469   int i, r = 0;
3470 
3471   for (i = 0; i < __kmp_threads_capacity; ++i) {
3472     if (KMP_UBER_GTID(i) &&
3473         !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3474         !__kmp_root[i]
3475              ->r.r_active) { // AC: reclaim only roots died in non-active state
3476       r += __kmp_unregister_root_other_thread(i);
3477     }
3478   }
3479   return r;
3480 }
3481 #endif
3482 
3483 /* This function attempts to create free entries in __kmp_threads and
3484    __kmp_root, and returns the number of free entries generated.
3485 
3486    For Windows* OS static library, the first mechanism used is to reclaim array
3487    entries for root threads that are already dead.
3488 
   On all platforms, expansion is attempted on the arrays __kmp_threads and
3490    __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3491    capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3492    threadprivate cache array has been created. Synchronization with
3493    __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3494 
3495    After any dead root reclamation, if the clipping value allows array expansion
3496    to result in the generation of a total of nNeed free slots, the function does
3497    that expansion. If not, nothing is done beyond the possible initial root
3498    thread reclamation.
3499 
3500    If any argument is negative, the behavior is undefined. */
3501 static int __kmp_expand_threads(int nNeed) {
3502   int added = 0;
3503   int minimumRequiredCapacity;
3504   int newCapacity;
3505   kmp_info_t **newThreads;
3506   kmp_root_t **newRoot;
3507 
3508 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3509 // resizing __kmp_threads does not need additional protection if foreign
3510 // threads are present
3511 
3512 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3513   /* only for Windows static library */
3514   /* reclaim array entries for root threads that are already dead */
3515   added = __kmp_reclaim_dead_roots();
3516 
3517   if (nNeed) {
3518     nNeed -= added;
3519     if (nNeed < 0)
3520       nNeed = 0;
3521   }
3522 #endif
3523   if (nNeed <= 0)
3524     return added;
3525 
3526   // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3527   // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3528   // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3529   // > __kmp_max_nth in one of two ways:
3530   //
3531   // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
3532   //    may not be reused by another thread, so we may need to increase
3533   //    __kmp_threads_capacity to __kmp_max_nth + 1.
3534   //
3535   // 2) New foreign root(s) are encountered.  We always register new foreign
3536   //    roots. This may cause a smaller # of threads to be allocated at
3537   //    subsequent parallel regions, but the worker threads hang around (and
3538   //    eventually go to sleep) and need slots in the __kmp_threads[] array.
3539   //
3540   // Anyway, that is the reason for moving the check to see if
3541   // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3542   // instead of having it performed here. -BB
3543 
3544   KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3545 
3546   /* compute expansion headroom to check if we can expand */
3547   if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3548     /* possible expansion too small -- give up */
3549     return added;
3550   }
3551   minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3552 
3553   newCapacity = __kmp_threads_capacity;
3554   do {
3555     newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3556                                                           : __kmp_sys_max_nth;
3557   } while (newCapacity < minimumRequiredCapacity);
3558   newThreads = (kmp_info_t **)__kmp_allocate(
3559       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3560   newRoot =
3561       (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3562   KMP_MEMCPY(newThreads, __kmp_threads,
3563              __kmp_threads_capacity * sizeof(kmp_info_t *));
3564   KMP_MEMCPY(newRoot, __kmp_root,
3565              __kmp_threads_capacity * sizeof(kmp_root_t *));
3566 
3567   kmp_info_t **temp_threads = __kmp_threads;
3568   *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3569   *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3570   __kmp_free(temp_threads);
3571   added += newCapacity - __kmp_threads_capacity;
3572   *(volatile int *)&__kmp_threads_capacity = newCapacity;
3573 
3574   if (newCapacity > __kmp_tp_capacity) {
3575     __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3576     if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3577       __kmp_threadprivate_resize_cache(newCapacity);
3578     } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3579       *(volatile int *)&__kmp_tp_capacity = newCapacity;
3580     }
3581     __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3582   }
3583 
3584   return added;
3585 }
3586 
3587 /* Register the current thread as a root thread and obtain our gtid. We must
3588    have the __kmp_initz_lock held at this point. Argument TRUE only if are the
3589    thread that calls from __kmp_do_serial_initialize() */
3590 int __kmp_register_root(int initial_thread) {
3591   kmp_info_t *root_thread;
3592   kmp_root_t *root;
3593   int gtid;
3594   int capacity;
3595   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3596   KA_TRACE(20, ("__kmp_register_root: entered\n"));
3597   KMP_MB();
3598 
3599   /* 2007-03-02:
     If the initial thread did not invoke the OpenMP RTL yet, and this thread
     is not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity"
     condition does not work as expected -- it may return false (meaning there
     is at least one empty slot in the __kmp_threads array), but it is
     possible that the only free slot is #0, which is reserved for the initial
     thread and so cannot be used for this one. The following code works
     around this bug.

     However, the right solution seems to be not to reserve slot #0 for the
     initial thread, because:
     (1) there is no magic in slot #0,
     (2) we cannot detect the initial thread reliably (the first thread that
        does serial initialization may not be a real initial thread).
3612   */
3613   capacity = __kmp_threads_capacity;
3614   if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3615     --capacity;
3616   }
3617 
3618   /* see if there are too many threads */
3619   if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3620     if (__kmp_tp_cached) {
3621       __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3622                   KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3623                   KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3624     } else {
3625       __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3626                   __kmp_msg_null);
3627     }
3628   }
3629 
3630   /* find an available thread slot */
3631   /* Don't reassign the zero slot since we need that to only be used by initial
3632      thread */
3633   for (gtid = (initial_thread ? 0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL;
3634        gtid++)
3635     ;
3636   KA_TRACE(1,
3637            ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3638   KMP_ASSERT(gtid < __kmp_threads_capacity);
3639 
3640   /* update global accounting */
3641   __kmp_all_nth++;
3642   TCW_4(__kmp_nth, __kmp_nth + 1);
3643 
3644   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3645   // numbers of procs, and method #2 (keyed API call) for higher numbers.
3646   if (__kmp_adjust_gtid_mode) {
3647     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3648       if (TCR_4(__kmp_gtid_mode) != 2) {
3649         TCW_4(__kmp_gtid_mode, 2);
3650       }
3651     } else {
3652       if (TCR_4(__kmp_gtid_mode) != 1) {
3653         TCW_4(__kmp_gtid_mode, 1);
3654       }
3655     }
3656   }
3657 
3658 #ifdef KMP_ADJUST_BLOCKTIME
3659   /* Adjust blocktime to zero if necessary            */
3660   /* Middle initialization might not have occurred yet */
3661   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3662     if (__kmp_nth > __kmp_avail_proc) {
3663       __kmp_zero_bt = TRUE;
3664     }
3665   }
3666 #endif /* KMP_ADJUST_BLOCKTIME */
3667 
3668   /* setup this new hierarchy */
3669   if (!(root = __kmp_root[gtid])) {
3670     root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3671     KMP_DEBUG_ASSERT(!root->r.r_root_team);
3672   }
3673 
3674 #if KMP_STATS_ENABLED
3675   // Initialize stats as soon as possible (right after gtid assignment).
3676   __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3677   __kmp_stats_thread_ptr->startLife();
3678   KMP_SET_THREAD_STATE(SERIAL_REGION);
3679   KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3680 #endif
3681   __kmp_initialize_root(root);
3682 
3683   /* setup new root thread structure */
3684   if (root->r.r_uber_thread) {
3685     root_thread = root->r.r_uber_thread;
3686   } else {
3687     root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3688     if (__kmp_storage_map) {
3689       __kmp_print_thread_storage_map(root_thread, gtid);
3690     }
3691     root_thread->th.th_info.ds.ds_gtid = gtid;
3692 #if OMPT_SUPPORT
3693     root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3694 #endif
3695     root_thread->th.th_root = root;
3696     if (__kmp_env_consistency_check) {
3697       root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3698     }
3699 #if USE_FAST_MEMORY
3700     __kmp_initialize_fast_memory(root_thread);
3701 #endif /* USE_FAST_MEMORY */
3702 
3703 #if KMP_USE_BGET
3704     KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3705     __kmp_initialize_bget(root_thread);
3706 #endif
3707     __kmp_init_random(root_thread); // Initialize random number generator
3708   }
3709 
3710   /* setup the serial team held in reserve by the root thread */
3711   if (!root_thread->th.th_serial_team) {
3712     kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3713     KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3714     root_thread->th.th_serial_team = __kmp_allocate_team(
3715         root, 1, 1,
3716 #if OMPT_SUPPORT
3717         ompt_data_none, // root parallel id
3718 #endif
3719         proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3720   }
3721   KMP_ASSERT(root_thread->th.th_serial_team);
3722   KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3723                 root_thread->th.th_serial_team));
3724 
3725   /* drop root_thread into place */
3726   TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3727 
3728   root->r.r_root_team->t.t_threads[0] = root_thread;
3729   root->r.r_hot_team->t.t_threads[0] = root_thread;
3730   root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
  // AC: the team is created in reserve, not for execution (unused for now).
3732   root_thread->th.th_serial_team->t.t_serialized = 0;
3733   root->r.r_uber_thread = root_thread;
3734 
3735   /* initialize the thread, get it ready to go */
3736   __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3737   TCW_4(__kmp_init_gtid, TRUE);
3738 
3739   /* prepare the master thread for get_gtid() */
3740   __kmp_gtid_set_specific(gtid);
3741 
3742 #if USE_ITT_BUILD
3743   __kmp_itt_thread_name(gtid);
3744 #endif /* USE_ITT_BUILD */
3745 
3746 #ifdef KMP_TDATA_GTID
3747   __kmp_gtid = gtid;
3748 #endif
3749   __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3750   KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3751 
3752   KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3753                 "plain=%u\n",
3754                 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3755                 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3756                 KMP_INIT_BARRIER_STATE));
3757   { // Initialize barrier data.
3758     int b;
3759     for (b = 0; b < bs_last_barrier; ++b) {
3760       root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3761 #if USE_DEBUGGER
3762       root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3763 #endif
3764     }
3765   }
3766   KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3767                    KMP_INIT_BARRIER_STATE);
3768 
3769 #if KMP_AFFINITY_SUPPORTED
3770   root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3771   root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3772   root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3773   root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3774   if (TCR_4(__kmp_init_middle)) {
3775     __kmp_affinity_set_init_mask(gtid, TRUE);
3776   }
3777 #endif /* KMP_AFFINITY_SUPPORTED */
3778   root_thread->th.th_def_allocator = __kmp_def_allocator;
3779   root_thread->th.th_prev_level = 0;
3780   root_thread->th.th_prev_num_threads = 1;
3781 
3782   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3783   tmp->cg_root = root_thread;
3784   tmp->cg_thread_limit = __kmp_cg_max_nth;
3785   tmp->cg_nthreads = 1;
3786   KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3787                  " cg_nthreads init to 1\n",
3788                  root_thread, tmp));
3789   tmp->up = NULL;
3790   root_thread->th.th_cg_roots = tmp;
3791 
3792   __kmp_root_counter++;
3793 
3794 #if OMPT_SUPPORT
3795   if (!initial_thread && ompt_enabled.enabled) {
3796 
3797     kmp_info_t *root_thread = ompt_get_thread();
3798 
3799     ompt_set_thread_state(root_thread, ompt_state_overhead);
3800 
3801     if (ompt_enabled.ompt_callback_thread_begin) {
3802       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3803           ompt_thread_initial, __ompt_get_thread_data_internal());
3804     }
3805     ompt_data_t *task_data;
3806     ompt_data_t *parallel_data;
3807     __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
3808     if (ompt_enabled.ompt_callback_implicit_task) {
3809       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3810           ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
3811     }
3812 
3813     ompt_set_thread_state(root_thread, ompt_state_work_serial);
3814   }
3815 #endif
3816 
3817   KMP_MB();
3818   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3819 
3820   return gtid;
3821 }
3822 
3823 #if KMP_NESTED_HOT_TEAMS
3824 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3825                                 const int max_level) {
3826   int i, n, nth;
3827   kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3828   if (!hot_teams || !hot_teams[level].hot_team) {
3829     return 0;
3830   }
3831   KMP_DEBUG_ASSERT(level < max_level);
3832   kmp_team_t *team = hot_teams[level].hot_team;
3833   nth = hot_teams[level].hot_team_nth;
3834   n = nth - 1; // master is not freed
3835   if (level < max_level - 1) {
3836     for (i = 0; i < nth; ++i) {
3837       kmp_info_t *th = team->t.t_threads[i];
3838       n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3839       if (i > 0 && th->th.th_hot_teams) {
3840         __kmp_free(th->th.th_hot_teams);
3841         th->th.th_hot_teams = NULL;
3842       }
3843     }
3844   }
3845   __kmp_free_team(root, team, NULL);
3846   return n;
3847 }
3848 #endif
3849 
// Resets a root thread and clears its root and hot teams.
3851 // Returns the number of __kmp_threads entries directly and indirectly freed.
3852 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3853   kmp_team_t *root_team = root->r.r_root_team;
3854   kmp_team_t *hot_team = root->r.r_hot_team;
3855   int n = hot_team->t.t_nproc;
3856   int i;
3857 
3858   KMP_DEBUG_ASSERT(!root->r.r_active);
3859 
3860   root->r.r_root_team = NULL;
3861   root->r.r_hot_team = NULL;
3862   // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
  // before the call to __kmp_free_team().
3864   __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3865 #if KMP_NESTED_HOT_TEAMS
3866   if (__kmp_hot_teams_max_level >
3867       0) { // need to free nested hot teams and their threads if any
3868     for (i = 0; i < hot_team->t.t_nproc; ++i) {
3869       kmp_info_t *th = hot_team->t.t_threads[i];
3870       if (__kmp_hot_teams_max_level > 1) {
3871         n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3872       }
3873       if (th->th.th_hot_teams) {
3874         __kmp_free(th->th.th_hot_teams);
3875         th->th.th_hot_teams = NULL;
3876       }
3877     }
3878   }
3879 #endif
3880   __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3881 
3882   // Before we can reap the thread, we need to make certain that all other
3883   // threads in the teams that had this root as ancestor have stopped trying to
3884   // steal tasks.
3885   if (__kmp_tasking_mode != tskm_immediate_exec) {
3886     __kmp_wait_to_unref_task_teams();
3887   }
3888 
3889 #if KMP_OS_WINDOWS
3890   /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3891   KA_TRACE(
3892       10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3893            "\n",
3894            (LPVOID) & (root->r.r_uber_thread->th),
3895            root->r.r_uber_thread->th.th_info.ds.ds_thread));
3896   __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3897 #endif /* KMP_OS_WINDOWS */
3898 
3899 #if OMPT_SUPPORT
3900   ompt_data_t *task_data;
3901   ompt_data_t *parallel_data;
3902   __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
3903   if (ompt_enabled.ompt_callback_implicit_task) {
3904     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3905         ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
3906   }
3907   if (ompt_enabled.ompt_callback_thread_end) {
3908     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3909         &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3910   }
3911 #endif
3912 
3913   TCW_4(__kmp_nth,
3914         __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3915   i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
3916   KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
3917                  " to %d\n",
3918                  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
3919                  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
3920   if (i == 1) {
3921     // need to free contention group structure
3922     KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
3923                      root->r.r_uber_thread->th.th_cg_roots->cg_root);
3924     KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
3925     __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
3926     root->r.r_uber_thread->th.th_cg_roots = NULL;
3927   }
3928   __kmp_reap_thread(root->r.r_uber_thread, 1);
3929 
  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
  // it instead of freeing it.
3932   root->r.r_uber_thread = NULL;
3933   /* mark root as no longer in use */
3934   root->r.r_begin = FALSE;
3935 
3936   return n;
3937 }
3938 
3939 void __kmp_unregister_root_current_thread(int gtid) {
3940   KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
  /* This lock should be OK, since unregister_root_current_thread is never
     called during an abort, only during a normal close. Furthermore, if you
     have the forkjoin lock, you should never try to get the initz lock. */
3944   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3945   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
3946     KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
3947                   "exiting T#%d\n",
3948                   gtid));
3949     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3950     return;
3951   }
3952   kmp_root_t *root = __kmp_root[gtid];
3953 
3954   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3955   KMP_ASSERT(KMP_UBER_GTID(gtid));
3956   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3957   KMP_ASSERT(root->r.r_active == FALSE);
3958 
3959   KMP_MB();
3960 
3961   kmp_info_t *thread = __kmp_threads[gtid];
3962   kmp_team_t *team = thread->th.th_team;
3963   kmp_task_team_t *task_team = thread->th.th_task_team;
3964 
3965   // we need to wait for the proxy tasks before finishing the thread
3966   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
3967 #if OMPT_SUPPORT
3968     // the runtime is shutting down so we won't report any events
3969     thread->th.ompt_thread_info.state = ompt_state_undefined;
3970 #endif
3971     __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
3972   }
3973 
3974   __kmp_reset_root(gtid, root);
3975 
3976   /* free up this thread slot */
3977   __kmp_gtid_set_specific(KMP_GTID_DNE);
3978 #ifdef KMP_TDATA_GTID
3979   __kmp_gtid = KMP_GTID_DNE;
3980 #endif
3981 
3982   KMP_MB();
3983   KC_TRACE(10,
3984            ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
3985 
3986   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3987 }
3988 
3989 #if KMP_OS_WINDOWS
3990 /* __kmp_forkjoin_lock must be already held
3991    Unregisters a root thread that is not the current thread.  Returns the number
3992    of __kmp_threads entries freed as a result. */
3993 static int __kmp_unregister_root_other_thread(int gtid) {
3994   kmp_root_t *root = __kmp_root[gtid];
3995   int r;
3996 
3997   KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
3998   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3999   KMP_ASSERT(KMP_UBER_GTID(gtid));
4000   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4001   KMP_ASSERT(root->r.r_active == FALSE);
4002 
4003   r = __kmp_reset_root(gtid, root);
4004   KC_TRACE(10,
4005            ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4006   return r;
4007 }
4008 #endif
4009 
4010 #if KMP_DEBUG
4011 void __kmp_task_info() {
4012 
4013   kmp_int32 gtid = __kmp_entry_gtid();
4014   kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4015   kmp_info_t *this_thr = __kmp_threads[gtid];
4016   kmp_team_t *steam = this_thr->th.th_serial_team;
4017   kmp_team_t *team = this_thr->th.th_team;
4018 
4019   __kmp_printf(
4020       "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4021       "ptask=%p\n",
4022       gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4023       team->t.t_implicit_task_taskdata[tid].td_parent);
4024 }
4025 #endif // KMP_DEBUG
4026 
4027 /* TODO optimize with one big memclr, take out what isn't needed, split
4028    responsibility to workers as much as possible, and delay initialization of
4029    features as much as possible  */
4030 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4031                                   int tid, int gtid) {
4032   /* this_thr->th.th_info.ds.ds_gtid is setup in
4033      kmp_allocate_thread/create_worker.
4034      this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4035   kmp_info_t *master = team->t.t_threads[0];
4036   KMP_DEBUG_ASSERT(this_thr != NULL);
4037   KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4038   KMP_DEBUG_ASSERT(team);
4039   KMP_DEBUG_ASSERT(team->t.t_threads);
4040   KMP_DEBUG_ASSERT(team->t.t_dispatch);
4041   KMP_DEBUG_ASSERT(master);
4042   KMP_DEBUG_ASSERT(master->th.th_root);
4043 
4044   KMP_MB();
4045 
4046   TCW_SYNC_PTR(this_thr->th.th_team, team);
4047 
4048   this_thr->th.th_info.ds.ds_tid = tid;
4049   this_thr->th.th_set_nproc = 0;
4050   if (__kmp_tasking_mode != tskm_immediate_exec)
4051     // When tasking is possible, threads are not safe to reap until they are
4052     // done tasking; this will be set when tasking code is exited in wait
4053     this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4054   else // no tasking --> always safe to reap
4055     this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4056   this_thr->th.th_set_proc_bind = proc_bind_default;
4057 #if KMP_AFFINITY_SUPPORTED
4058   this_thr->th.th_new_place = this_thr->th.th_current_place;
4059 #endif
4060   this_thr->th.th_root = master->th.th_root;
4061 
4062   /* setup the thread's cache of the team structure */
4063   this_thr->th.th_team_nproc = team->t.t_nproc;
4064   this_thr->th.th_team_master = master;
4065   this_thr->th.th_team_serialized = team->t.t_serialized;
4066   TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4067 
4068   KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4069 
4070   KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4071                 tid, gtid, this_thr, this_thr->th.th_current_task));
4072 
4073   __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4074                            team, tid, TRUE);
4075 
4076   KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4077                 tid, gtid, this_thr, this_thr->th.th_current_task));
4078   // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4079   // __kmp_initialize_team()?
4080 
4081   /* TODO no worksharing in speculative threads */
4082   this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4083 
4084   this_thr->th.th_local.this_construct = 0;
4085 
4086   if (!this_thr->th.th_pri_common) {
4087     this_thr->th.th_pri_common =
4088         (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4089     if (__kmp_storage_map) {
4090       __kmp_print_storage_map_gtid(
4091           gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4092           sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4093     }
4094     this_thr->th.th_pri_head = NULL;
4095   }
4096 
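  // Contention-group (CG) bookkeeping: if this worker previously belonged to
  // a different CG, drop its reference to the old cg_root node (freeing the
  // node when this was the last reference), adopt the master's CG root, bump
  // that node's thread count, and inherit the CG's thread_limit ICV.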
4097   if (this_thr != master && // Master's CG root is initialized elsewhere
4098       this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4099     // Make new thread's CG root same as master's
4100     KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4101     kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4102     if (tmp) {
4103       // worker changes CG, need to check if old CG should be freed
4104       int i = tmp->cg_nthreads--;
4105       KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4106                      " on node %p of thread %p to %d\n",
4107                      this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4108       if (i == 1) {
4109         __kmp_free(tmp); // last thread left CG --> free it
4110       }
4111     }
4112     this_thr->th.th_cg_roots = master->th.th_cg_roots;
4113     // Increment new thread's CG root's counter to add the new thread
4114     this_thr->th.th_cg_roots->cg_nthreads++;
4115     KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4116                    " node %p of thread %p to %d\n",
4117                    this_thr, this_thr->th.th_cg_roots,
4118                    this_thr->th.th_cg_roots->cg_root,
4119                    this_thr->th.th_cg_roots->cg_nthreads));
4120     this_thr->th.th_current_task->td_icvs.thread_limit =
4121         this_thr->th.th_cg_roots->cg_thread_limit;
4122   }
4123 
4124   /* Initialize dynamic dispatch */
4125   {
4126     volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4127     // Use team max_nproc since this will never change for the team.
4128     size_t disp_size =
4129         sizeof(dispatch_private_info_t) *
4130         (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
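    // A serial team (t_max_nproc == 1) only ever needs a single dispatch
    // buffer; larger teams keep __kmp_dispatch_num_buffers buffers so that
    // consecutive dynamically scheduled worksharing constructs can rotate
    // through them (indexed via th_disp_index) without an intervening
    // barrier.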
4131     KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4132                   team->t.t_max_nproc));
4133     KMP_ASSERT(dispatch);
4134     KMP_DEBUG_ASSERT(team->t.t_dispatch);
4135     KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4136 
4137     dispatch->th_disp_index = 0;
4138     dispatch->th_doacross_buf_idx = 0;
4139     if (!dispatch->th_disp_buffer) {
4140       dispatch->th_disp_buffer =
4141           (dispatch_private_info_t *)__kmp_allocate(disp_size);
4142 
4143       if (__kmp_storage_map) {
4144         __kmp_print_storage_map_gtid(
4145             gtid, &dispatch->th_disp_buffer[0],
4146             &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4147                                           ? 1
4148                                           : __kmp_dispatch_num_buffers],
4149             disp_size, "th_%d.th_dispatch.th_disp_buffer "
4150                        "(team_%d.t_dispatch[%d].th_disp_buffer)",
4151             gtid, team->t.t_id, gtid);
4152       }
4153     } else {
4154       memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4155     }
4156 
4157     dispatch->th_dispatch_pr_current = 0;
4158     dispatch->th_dispatch_sh_current = 0;
4159 
4160     dispatch->th_deo_fcn = 0; /* ORDERED     */
4161     dispatch->th_dxo_fcn = 0; /* END ORDERED */
4162   }
4163 
4164   this_thr->th.th_next_pool = NULL;
4165 
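  // The task-state memo stack saves and restores th_task_state across nested
  // parallel regions; it starts with 4 entries here and is enlarged elsewhere
  // if nesting goes deeper.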
4166   if (!this_thr->th.th_task_state_memo_stack) {
4167     size_t i;
4168     this_thr->th.th_task_state_memo_stack =
4169         (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4170     this_thr->th.th_task_state_top = 0;
4171     this_thr->th.th_task_state_stack_sz = 4;
4172     for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4173          ++i) // zero init the stack
4174       this_thr->th.th_task_state_memo_stack[i] = 0;
4175   }
4176 
4177   KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4178   KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4179 
4180   KMP_MB();
4181 }
4182 
/* Allocate a new thread for the requesting team. This is only called from
   within a fork/join critical section. We first try to get an available
   thread from the thread pool; if none is available, we fork a new one,
   assuming we are able to create one. That should be assured, as the caller
   is expected to have checked this first. */
4188 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4189                                   int new_tid) {
4190   kmp_team_t *serial_team;
4191   kmp_info_t *new_thr;
4192   int new_gtid;
4193 
4194   KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4195   KMP_DEBUG_ASSERT(root && team);
4196 #if !KMP_NESTED_HOT_TEAMS
4197   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4198 #endif
4199   KMP_MB();
4200 
4201   /* first, try to get one from the thread pool */
4202   if (__kmp_thread_pool) {
4203     new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4204     __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4205     if (new_thr == __kmp_thread_pool_insert_pt) {
4206       __kmp_thread_pool_insert_pt = NULL;
4207     }
4208     TCW_4(new_thr->th.th_in_pool, FALSE);
4209     __kmp_suspend_initialize_thread(new_thr);
4210     __kmp_lock_suspend_mx(new_thr);
4211     if (new_thr->th.th_active_in_pool == TRUE) {
4212       KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4213       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4214       new_thr->th.th_active_in_pool = FALSE;
4215     }
4216     __kmp_unlock_suspend_mx(new_thr);
4217 
4218     KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4219                   __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4220     KMP_ASSERT(!new_thr->th.th_team);
4221     KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4222 
4223     /* setup the thread structure */
4224     __kmp_initialize_info(new_thr, team, new_tid,
4225                           new_thr->th.th_info.ds.ds_gtid);
4226     KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4227 
4228     TCW_4(__kmp_nth, __kmp_nth + 1);
4229 
4230     new_thr->th.th_task_state = 0;
4231     new_thr->th.th_task_state_top = 0;
4232     new_thr->th.th_task_state_stack_sz = 4;
4233 
4234 #ifdef KMP_ADJUST_BLOCKTIME
4235     /* Adjust blocktime back to zero if necessary */
4236     /* Middle initialization might not have occurred yet */
4237     if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4238       if (__kmp_nth > __kmp_avail_proc) {
4239         __kmp_zero_bt = TRUE;
4240       }
4241     }
4242 #endif /* KMP_ADJUST_BLOCKTIME */
4243 
4244 #if KMP_DEBUG
    // If the thread entered the pool via __kmp_free_thread, wait_flag should
    // not equal KMP_BARRIER_PARENT_FLAG.
4247     int b;
4248     kmp_balign_t *balign = new_thr->th.th_bar;
4249     for (b = 0; b < bs_last_barrier; ++b)
4250       KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4251 #endif
4252 
4253     KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4254                   __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4255 
4256     KMP_MB();
4257     return new_thr;
4258   }
4259 
  /* no, we'll fork a new one */
4261   KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4262   KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4263 
4264 #if KMP_USE_MONITOR
4265   // If this is the first worker thread the RTL is creating, then also
4266   // launch the monitor thread.  We try to do this as early as possible.
4267   if (!TCR_4(__kmp_init_monitor)) {
4268     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4269     if (!TCR_4(__kmp_init_monitor)) {
4270       KF_TRACE(10, ("before __kmp_create_monitor\n"));
4271       TCW_4(__kmp_init_monitor, 1);
4272       __kmp_create_monitor(&__kmp_monitor);
4273       KF_TRACE(10, ("after __kmp_create_monitor\n"));
4274 #if KMP_OS_WINDOWS
      // AC: wait until the monitor has started. This is a fix for CQ232808.
      // The reason is that if the library is loaded/unloaded in a loop with
      // small (parallel) work in between, there is a high probability that
      // the monitor thread starts only after the library has shut down. At
      // shutdown it is too late to cope with the problem, because when the
      // master is in DllMain (process detach) the monitor has no chance to
      // start (it is blocked), and the master has no means to inform the
      // monitor that the library has gone, because all the memory the monitor
      // could access is about to be released/reset.
4284       while (TCR_4(__kmp_init_monitor) < 2) {
4285         KMP_YIELD(TRUE);
4286       }
4287       KF_TRACE(10, ("after monitor thread has started\n"));
4288 #endif
4289     }
4290     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4291   }
4292 #endif
4293 
4294   KMP_MB();
4295   for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) {
4296     KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4297   }
4298 
4299   /* allocate space for it. */
4300   new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4301 
4302   TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4303 
4304 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4305   // suppress race conditions detection on synchronization flags in debug mode
4306   // this helps to analyze library internals eliminating false positives
4307   __itt_suppress_mark_range(
4308       __itt_suppress_range, __itt_suppress_threading_errors,
4309       &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4310   __itt_suppress_mark_range(
4311       __itt_suppress_range, __itt_suppress_threading_errors,
4312       &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4313 #if KMP_OS_WINDOWS
4314   __itt_suppress_mark_range(
4315       __itt_suppress_range, __itt_suppress_threading_errors,
4316       &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4317 #else
4318   __itt_suppress_mark_range(__itt_suppress_range,
4319                             __itt_suppress_threading_errors,
4320                             &new_thr->th.th_suspend_init_count,
4321                             sizeof(new_thr->th.th_suspend_init_count));
4322 #endif
4323   // TODO: check if we need to also suppress b_arrived flags
4324   __itt_suppress_mark_range(__itt_suppress_range,
4325                             __itt_suppress_threading_errors,
4326                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4327                             sizeof(new_thr->th.th_bar[0].bb.b_go));
4328   __itt_suppress_mark_range(__itt_suppress_range,
4329                             __itt_suppress_threading_errors,
4330                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4331                             sizeof(new_thr->th.th_bar[1].bb.b_go));
4332   __itt_suppress_mark_range(__itt_suppress_range,
4333                             __itt_suppress_threading_errors,
4334                             CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4335                             sizeof(new_thr->th.th_bar[2].bb.b_go));
4336 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4337   if (__kmp_storage_map) {
4338     __kmp_print_thread_storage_map(new_thr, new_gtid);
4339   }
4340 
4341   // add the reserve serialized team, initialized from the team's master thread
4342   {
4343     kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4344     KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4345     new_thr->th.th_serial_team = serial_team =
4346         (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4347 #if OMPT_SUPPORT
4348                                           ompt_data_none, // root parallel id
4349 #endif
4350                                           proc_bind_default, &r_icvs,
4351                                           0 USE_NESTED_HOT_ARG(NULL));
4352   }
4353   KMP_ASSERT(serial_team);
  serial_team->t.t_serialized = 0; // AC: the team is created in reserve, not
  // for execution (it is unused for now).
4356   serial_team->t.t_threads[0] = new_thr;
4357   KF_TRACE(10,
4358            ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4359             new_thr));
4360 
4361   /* setup the thread structures */
4362   __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4363 
4364 #if USE_FAST_MEMORY
4365   __kmp_initialize_fast_memory(new_thr);
4366 #endif /* USE_FAST_MEMORY */
4367 
4368 #if KMP_USE_BGET
4369   KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4370   __kmp_initialize_bget(new_thr);
4371 #endif
4372 
4373   __kmp_init_random(new_thr); // Initialize random number generator
4374 
4375   /* Initialize these only once when thread is grabbed for a team allocation */
4376   KA_TRACE(20,
4377            ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4378             __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4379 
4380   int b;
4381   kmp_balign_t *balign = new_thr->th.th_bar;
4382   for (b = 0; b < bs_last_barrier; ++b) {
4383     balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4384     balign[b].bb.team = NULL;
4385     balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4386     balign[b].bb.use_oncore_barrier = 0;
4387   }
4388 
4389   new_thr->th.th_spin_here = FALSE;
4390   new_thr->th.th_next_waiting = 0;
4391 #if KMP_OS_UNIX
4392   new_thr->th.th_blocking = false;
4393 #endif
4394 
4395 #if KMP_AFFINITY_SUPPORTED
4396   new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4397   new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4398   new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4399   new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4400 #endif
4401   new_thr->th.th_def_allocator = __kmp_def_allocator;
4402   new_thr->th.th_prev_level = 0;
4403   new_thr->th.th_prev_num_threads = 1;
4404 
4405   TCW_4(new_thr->th.th_in_pool, FALSE);
4406   new_thr->th.th_active_in_pool = FALSE;
4407   TCW_4(new_thr->th.th_active, TRUE);
4408 
4409   /* adjust the global counters */
4410   __kmp_all_nth++;
4411   __kmp_nth++;
4412 
  // if __kmp_adjust_gtid_mode is set, then we use method #1 (stack pointer
  // search) for low numbers of procs, and method #2 (keyed TLS API call) for
  // higher numbers of procs.
4415   if (__kmp_adjust_gtid_mode) {
4416     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4417       if (TCR_4(__kmp_gtid_mode) != 2) {
4418         TCW_4(__kmp_gtid_mode, 2);
4419       }
4420     } else {
4421       if (TCR_4(__kmp_gtid_mode) != 1) {
4422         TCW_4(__kmp_gtid_mode, 1);
4423       }
4424     }
4425   }
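  // Illustration (hypothetical threshold value): if __kmp_tls_gtid_min were
  // 8, then registering the 8th thread would switch gtid lookup from the
  // stack-address search (method #1) to the keyed/TLS method (#2) for all
  // subsequent queries.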
4426 
4427 #ifdef KMP_ADJUST_BLOCKTIME
4428   /* Adjust blocktime back to zero if necessary       */
4429   /* Middle initialization might not have occurred yet */
4430   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4431     if (__kmp_nth > __kmp_avail_proc) {
4432       __kmp_zero_bt = TRUE;
4433     }
4434   }
4435 #endif /* KMP_ADJUST_BLOCKTIME */
4436 
4437   /* actually fork it and create the new worker thread */
4438   KF_TRACE(
4439       10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4440   __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4441   KF_TRACE(10,
4442            ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4443 
4444   KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4445                 new_gtid));
4446   KMP_MB();
4447   return new_thr;
4448 }
4449 
/* Reinitialize team for reuse.
   The hot team code calls this routine at every fork barrier, so the EPCC
   barrier tests are extremely sensitive to changes in it, especially writes
   to the team struct, which cause a cache invalidation in all threads.
   IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4455 static void __kmp_reinitialize_team(kmp_team_t *team,
4456                                     kmp_internal_control_t *new_icvs,
4457                                     ident_t *loc) {
4458   KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4459                 team->t.t_threads[0], team));
4460   KMP_DEBUG_ASSERT(team && new_icvs);
4461   KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4462   KMP_CHECK_UPDATE(team->t.t_ident, loc);
4463 
4464   KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4465   // Copy ICVs to the master thread's implicit taskdata
4466   __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4467   copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4468 
4469   KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4470                 team->t.t_threads[0], team));
4471 }
4472 
4473 /* Initialize the team data structure.
4474    This assumes the t_threads and t_max_nproc are already set.
4475    Also, we don't touch the arguments */
4476 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4477                                   kmp_internal_control_t *new_icvs,
4478                                   ident_t *loc) {
4479   KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4480 
4481   /* verify */
4482   KMP_DEBUG_ASSERT(team);
4483   KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4484   KMP_DEBUG_ASSERT(team->t.t_threads);
4485   KMP_MB();
4486 
4487   team->t.t_master_tid = 0; /* not needed */
4488   /* team->t.t_master_bar;        not needed */
4489   team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4490   team->t.t_nproc = new_nproc;
4491 
4492   /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
4493   team->t.t_next_pool = NULL;
4494   /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4495    * up hot team */
4496 
4497   TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4498   team->t.t_invoke = NULL; /* not needed */
4499 
4500   // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
4501   team->t.t_sched.sched = new_icvs->sched.sched;
4502 
4503 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4504   team->t.t_fp_control_saved = FALSE; /* not needed */
4505   team->t.t_x87_fpu_control_word = 0; /* not needed */
4506   team->t.t_mxcsr = 0; /* not needed */
4507 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4508 
4509   team->t.t_construct = 0;
4510 
4511   team->t.t_ordered.dt.t_value = 0;
4512   team->t.t_master_active = FALSE;
4513 
4514 #ifdef KMP_DEBUG
4515   team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4516 #endif
4517 #if KMP_OS_WINDOWS
4518   team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4519 #endif
4520 
4521   team->t.t_control_stack_top = NULL;
4522 
4523   __kmp_reinitialize_team(team, new_icvs, loc);
4524 
4525   KMP_MB();
4526   KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4527 }
4528 
4529 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
/* Sets the full affinity mask for the calling thread and, if old_mask is
   non-NULL, saves the previous mask in it; no runtime structures are changed. */
4531 static void
4532 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4533   if (KMP_AFFINITY_CAPABLE()) {
4534     int status;
4535     if (old_mask != NULL) {
4536       status = __kmp_get_system_affinity(old_mask, TRUE);
4537       int error = errno;
4538       if (status != 0) {
4539         __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4540                     __kmp_msg_null);
4541       }
4542     }
4543     __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4544   }
4545 }
4546 #endif
4547 
4548 #if KMP_AFFINITY_SUPPORTED
4549 
4550 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
// It calculates the workers' and the master thread's partitions based upon
// the parent thread's partition, and binds each worker to a place in its
// partition.
4553 // The master thread's partition should already include its current binding.
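// When update_master_only is nonzero, only the master thread's (t_threads[0])
// place fields are recomputed; for example, __kmp_allocate_team passes this
// flag when reusing a hot team whose size and proc_bind_spread policy are
// unchanged.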
4554 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4555   // Copy the master thread's place partition to the team struct
4556   kmp_info_t *master_th = team->t.t_threads[0];
4557   KMP_DEBUG_ASSERT(master_th != NULL);
4558   kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4559   int first_place = master_th->th.th_first_place;
4560   int last_place = master_th->th.th_last_place;
4561   int masters_place = master_th->th.th_current_place;
4562   team->t.t_first_place = first_place;
4563   team->t.t_last_place = last_place;
4564 
4565   KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4566                 "bound to place %d partition = [%d,%d]\n",
4567                 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4568                 team->t.t_id, masters_place, first_place, last_place));
4569 
4570   switch (proc_bind) {
4571 
4572   case proc_bind_default:
4573     // serial teams might have the proc_bind policy set to proc_bind_default. It
4574     // doesn't matter, as we don't rebind master thread for any proc_bind policy
4575     KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4576     break;
4577 
4578   case proc_bind_master: {
4579     int f;
4580     int n_th = team->t.t_nproc;
4581     for (f = 1; f < n_th; f++) {
4582       kmp_info_t *th = team->t.t_threads[f];
4583       KMP_DEBUG_ASSERT(th != NULL);
4584       th->th.th_first_place = first_place;
4585       th->th.th_last_place = last_place;
4586       th->th.th_new_place = masters_place;
4587       if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4588           team->t.t_display_affinity != 1) {
4589         team->t.t_display_affinity = 1;
4590       }
4591 
4592       KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
4593                      "partition = [%d,%d]\n",
4594                      __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4595                      f, masters_place, first_place, last_place));
4596     }
4597   } break;
4598 
4599   case proc_bind_close: {
4600     int f;
4601     int n_th = team->t.t_nproc;
4602     int n_places;
4603     if (first_place <= last_place) {
4604       n_places = last_place - first_place + 1;
4605     } else {
4606       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4607     }
4608     if (n_th <= n_places) {
4609       int place = masters_place;
4610       for (f = 1; f < n_th; f++) {
4611         kmp_info_t *th = team->t.t_threads[f];
4612         KMP_DEBUG_ASSERT(th != NULL);
4613 
4614         if (place == last_place) {
4615           place = first_place;
4616         } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4617           place = 0;
4618         } else {
4619           place++;
4620         }
4621         th->th.th_first_place = first_place;
4622         th->th.th_last_place = last_place;
4623         th->th.th_new_place = place;
4624         if (__kmp_display_affinity && place != th->th.th_current_place &&
4625             team->t.t_display_affinity != 1) {
4626           team->t.t_display_affinity = 1;
4627         }
4628 
4629         KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4630                        "partition = [%d,%d]\n",
4631                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4632                        team->t.t_id, f, place, first_place, last_place));
4633       }
4634     } else {
4635       int S, rem, gap, s_count;
4636       S = n_th / n_places;
4637       s_count = 0;
4638       rem = n_th - (S * n_places);
4639       gap = rem > 0 ? n_places / rem : n_places;
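      // Worked example (illustrative values only): n_th = 10 and n_places = 4
      // give S = 2, rem = 2, gap = 2, so the places receive 3, 2, 3 and 2
      // threads respectively, starting from the master's place and wrapping
      // so that 'place' ends back at masters_place.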
4640       int place = masters_place;
4641       int gap_ct = gap;
4642       for (f = 0; f < n_th; f++) {
4643         kmp_info_t *th = team->t.t_threads[f];
4644         KMP_DEBUG_ASSERT(th != NULL);
4645 
4646         th->th.th_first_place = first_place;
4647         th->th.th_last_place = last_place;
4648         th->th.th_new_place = place;
4649         if (__kmp_display_affinity && place != th->th.th_current_place &&
4650             team->t.t_display_affinity != 1) {
4651           team->t.t_display_affinity = 1;
4652         }
4653         s_count++;
4654 
4655         if ((s_count == S) && rem && (gap_ct == gap)) {
          // do nothing; an extra thread will be added to this place on the
          // next iteration
4657         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4658           // we added an extra thread to this place; move to next place
4659           if (place == last_place) {
4660             place = first_place;
4661           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4662             place = 0;
4663           } else {
4664             place++;
4665           }
4666           s_count = 0;
4667           gap_ct = 1;
4668           rem--;
4669         } else if (s_count == S) { // place full; don't add extra
4670           if (place == last_place) {
4671             place = first_place;
4672           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4673             place = 0;
4674           } else {
4675             place++;
4676           }
4677           gap_ct++;
4678           s_count = 0;
4679         }
4680 
4681         KA_TRACE(100,
4682                  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4683                   "partition = [%d,%d]\n",
4684                   __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4685                   th->th.th_new_place, first_place, last_place));
4686       }
4687       KMP_DEBUG_ASSERT(place == masters_place);
4688     }
4689   } break;
4690 
4691   case proc_bind_spread: {
4692     int f;
4693     int n_th = team->t.t_nproc;
4694     int n_places;
4695     int thidx;
4696     if (first_place <= last_place) {
4697       n_places = last_place - first_place + 1;
4698     } else {
4699       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4700     }
4701     if (n_th <= n_places) {
4702       int place = -1;
4703 
4704       if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4705         int S = n_places / n_th;
4706         int s_count, rem, gap, gap_ct;
4707 
4708         place = masters_place;
4709         rem = n_places - n_th * S;
4710         gap = rem ? n_th / rem : 1;
4711         gap_ct = gap;
4712         thidx = n_th;
4713         if (update_master_only == 1)
4714           thidx = 1;
4715         for (f = 0; f < thidx; f++) {
4716           kmp_info_t *th = team->t.t_threads[f];
4717           KMP_DEBUG_ASSERT(th != NULL);
4718 
4719           th->th.th_first_place = place;
4720           th->th.th_new_place = place;
4721           if (__kmp_display_affinity && place != th->th.th_current_place &&
4722               team->t.t_display_affinity != 1) {
4723             team->t.t_display_affinity = 1;
4724           }
4725           s_count = 1;
4726           while (s_count < S) {
4727             if (place == last_place) {
4728               place = first_place;
4729             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4730               place = 0;
4731             } else {
4732               place++;
4733             }
4734             s_count++;
4735           }
4736           if (rem && (gap_ct == gap)) {
4737             if (place == last_place) {
4738               place = first_place;
4739             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4740               place = 0;
4741             } else {
4742               place++;
4743             }
4744             rem--;
4745             gap_ct = 0;
4746           }
4747           th->th.th_last_place = place;
4748           gap_ct++;
4749 
4750           if (place == last_place) {
4751             place = first_place;
4752           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4753             place = 0;
4754           } else {
4755             place++;
4756           }
4757 
4758           KA_TRACE(100,
4759                    ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4760                     "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4761                     __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4762                     f, th->th.th_new_place, th->th.th_first_place,
4763                     th->th.th_last_place, __kmp_affinity_num_masks));
4764         }
4765       } else {
        /* Given a uniform space of available computation places, we can
           create T partitions of roughly round(P/T) places each and put each
           thread into the first place of its partition. */
4769         double current = static_cast<double>(masters_place);
4770         double spacing =
4771             (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
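        // Illustrative example (assumed values): masters_place = 0,
        // n_places = 8, n_th = 4 gives spacing = 2.25 and partitions
        // [0,1], [2,3], [4,5], [6,7], with each thread bound to the first
        // place of its partition.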
4772         int first, last;
4773         kmp_info_t *th;
4774 
4775         thidx = n_th + 1;
4776         if (update_master_only == 1)
4777           thidx = 1;
4778         for (f = 0; f < thidx; f++) {
4779           first = static_cast<int>(current);
4780           last = static_cast<int>(current + spacing) - 1;
4781           KMP_DEBUG_ASSERT(last >= first);
4782           if (first >= n_places) {
4783             if (masters_place) {
4784               first -= n_places;
4785               last -= n_places;
4786               if (first == (masters_place + 1)) {
4787                 KMP_DEBUG_ASSERT(f == n_th);
4788                 first--;
4789               }
4790               if (last == masters_place) {
4791                 KMP_DEBUG_ASSERT(f == (n_th - 1));
4792                 last--;
4793               }
4794             } else {
4795               KMP_DEBUG_ASSERT(f == n_th);
4796               first = 0;
4797               last = 0;
4798             }
4799           }
4800           if (last >= n_places) {
4801             last = (n_places - 1);
4802           }
4803           place = first;
4804           current += spacing;
4805           if (f < n_th) {
4806             KMP_DEBUG_ASSERT(0 <= first);
4807             KMP_DEBUG_ASSERT(n_places > first);
4808             KMP_DEBUG_ASSERT(0 <= last);
4809             KMP_DEBUG_ASSERT(n_places > last);
4810             KMP_DEBUG_ASSERT(last_place >= first_place);
4811             th = team->t.t_threads[f];
4812             KMP_DEBUG_ASSERT(th);
4813             th->th.th_first_place = first;
4814             th->th.th_new_place = place;
4815             th->th.th_last_place = last;
4816             if (__kmp_display_affinity && place != th->th.th_current_place &&
4817                 team->t.t_display_affinity != 1) {
4818               team->t.t_display_affinity = 1;
4819             }
4820             KA_TRACE(100,
4821                      ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4822                       "partition = [%d,%d], spacing = %.4f\n",
4823                       __kmp_gtid_from_thread(team->t.t_threads[f]),
4824                       team->t.t_id, f, th->th.th_new_place,
4825                       th->th.th_first_place, th->th.th_last_place, spacing));
4826           }
4827         }
4828       }
4829       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4830     } else {
4831       int S, rem, gap, s_count;
4832       S = n_th / n_places;
4833       s_count = 0;
4834       rem = n_th - (S * n_places);
4835       gap = rem > 0 ? n_places / rem : n_places;
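      // More threads than places: same packing scheme as the proc_bind_close
      // overflow case above, except each thread's partition collapses to the
      // single place it is assigned to.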
4836       int place = masters_place;
4837       int gap_ct = gap;
4838       thidx = n_th;
4839       if (update_master_only == 1)
4840         thidx = 1;
4841       for (f = 0; f < thidx; f++) {
4842         kmp_info_t *th = team->t.t_threads[f];
4843         KMP_DEBUG_ASSERT(th != NULL);
4844 
4845         th->th.th_first_place = place;
4846         th->th.th_last_place = place;
4847         th->th.th_new_place = place;
4848         if (__kmp_display_affinity && place != th->th.th_current_place &&
4849             team->t.t_display_affinity != 1) {
4850           team->t.t_display_affinity = 1;
4851         }
4852         s_count++;
4853 
4854         if ((s_count == S) && rem && (gap_ct == gap)) {
          // do nothing; an extra thread will be added to this place on the
          // next iteration
4856         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4857           // we added an extra thread to this place; move on to next place
4858           if (place == last_place) {
4859             place = first_place;
4860           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4861             place = 0;
4862           } else {
4863             place++;
4864           }
4865           s_count = 0;
4866           gap_ct = 1;
4867           rem--;
4868         } else if (s_count == S) { // place is full; don't add extra thread
4869           if (place == last_place) {
4870             place = first_place;
4871           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4872             place = 0;
4873           } else {
4874             place++;
4875           }
4876           gap_ct++;
4877           s_count = 0;
4878         }
4879 
4880         KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4881                        "partition = [%d,%d]\n",
4882                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4883                        team->t.t_id, f, th->th.th_new_place,
4884                        th->th.th_first_place, th->th.th_last_place));
4885       }
4886       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4887     }
4888   } break;
4889 
4890   default:
4891     break;
4892   }
4893 
4894   KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4895 }
4896 
4897 #endif // KMP_AFFINITY_SUPPORTED
4898 
/* Allocate a new team data structure to use. Take one off of the free pool if
   available. */
4901 kmp_team_t *
4902 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4903 #if OMPT_SUPPORT
4904                     ompt_data_t ompt_parallel_data,
4905 #endif
4906                     kmp_proc_bind_t new_proc_bind,
4907                     kmp_internal_control_t *new_icvs,
4908                     int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4909   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4910   int f;
4911   kmp_team_t *team;
4912   int use_hot_team = !root->r.r_active;
4913   int level = 0;
4914 
4915   KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4916   KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4917   KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4918   KMP_MB();
4919 
4920 #if KMP_NESTED_HOT_TEAMS
4921   kmp_hot_team_ptr_t *hot_teams;
4922   if (master) {
4923     team = master->th.th_team;
4924     level = team->t.t_active_level;
4925     if (master->th.th_teams_microtask) { // in teams construct?
4926       if (master->th.th_teams_size.nteams > 1 &&
4927           ( // #teams > 1
4928               team->t.t_pkfn ==
4929                   (microtask_t)__kmp_teams_master || // inner fork of the teams
4930               master->th.th_teams_level <
4931                   team->t.t_level)) { // or nested parallel inside the teams
4932         ++level; // not increment if #teams==1, or for outer fork of the teams;
4933         // increment otherwise
4934       }
4935     }
4936     hot_teams = master->th.th_hot_teams;
4937     if (level < __kmp_hot_teams_max_level && hot_teams &&
4938         hot_teams[level]
4939             .hot_team) { // hot team has already been allocated for given level
4940       use_hot_team = 1;
4941     } else {
4942       use_hot_team = 0;
4943     }
4944   }
4945 #endif
4946   // Optimization to use a "hot" team
4947   if (use_hot_team && new_nproc > 1) {
4948     KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
4949 #if KMP_NESTED_HOT_TEAMS
4950     team = hot_teams[level].hot_team;
4951 #else
4952     team = root->r.r_hot_team;
4953 #endif
4954 #if KMP_DEBUG
4955     if (__kmp_tasking_mode != tskm_immediate_exec) {
4956       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
4957                     "task_team[1] = %p before reinit\n",
4958                     team->t.t_task_team[0], team->t.t_task_team[1]));
4959     }
4960 #endif
4961 
4962     // Has the number of threads changed?
4963     /* Let's assume the most common case is that the number of threads is
4964        unchanged, and put that case first. */
4965     if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
4966       KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
4967       // This case can mean that omp_set_num_threads() was called and the hot
4968       // team size was already reduced, so we check the special flag
4969       if (team->t.t_size_changed == -1) {
4970         team->t.t_size_changed = 1;
4971       } else {
4972         KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
4973       }
4974 
4975       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4976       kmp_r_sched_t new_sched = new_icvs->sched;
4977       // set master's schedule as new run-time schedule
4978       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
4979 
4980       __kmp_reinitialize_team(team, new_icvs,
4981                               root->r.r_uber_thread->th.th_ident);
4982 
4983       KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
4984                     team->t.t_threads[0], team));
4985       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
4986 
4987 #if KMP_AFFINITY_SUPPORTED
4988       if ((team->t.t_size_changed == 0) &&
4989           (team->t.t_proc_bind == new_proc_bind)) {
4990         if (new_proc_bind == proc_bind_spread) {
4991           __kmp_partition_places(
4992               team, 1); // add flag to update only master for spread
4993         }
4994         KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
4995                        "proc_bind = %d, partition = [%d,%d]\n",
4996                        team->t.t_id, new_proc_bind, team->t.t_first_place,
4997                        team->t.t_last_place));
4998       } else {
4999         KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5000         __kmp_partition_places(team);
5001       }
5002 #else
5003       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5004 #endif /* KMP_AFFINITY_SUPPORTED */
5005     } else if (team->t.t_nproc > new_nproc) {
5006       KA_TRACE(20,
5007                ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5008                 new_nproc));
5009 
5010       team->t.t_size_changed = 1;
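      // When nested hot teams are enabled, two shrink strategies follow:
      // mode 0 releases the surplus threads back to the thread pool, while
      // mode 1 keeps them in the team as a reserve, switched to wait on their
      // own b_go flag so a later size increase can reuse them without forking
      // new threads.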
5011 #if KMP_NESTED_HOT_TEAMS
5012       if (__kmp_hot_teams_mode == 0) {
        // AC: in this mode the saved number of threads should match the
        // team's value; it can be bigger in mode 1, when the hot team keeps
        // threads in reserve
5015         KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5016         hot_teams[level].hot_team_nth = new_nproc;
5017 #endif // KMP_NESTED_HOT_TEAMS
5018         /* release the extra threads we don't need any more */
5019         for (f = new_nproc; f < team->t.t_nproc; f++) {
5020           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5021           if (__kmp_tasking_mode != tskm_immediate_exec) {
5022             // When decreasing team size, threads no longer in the team should
5023             // unref task team.
5024             team->t.t_threads[f]->th.th_task_team = NULL;
5025           }
5026           __kmp_free_thread(team->t.t_threads[f]);
5027           team->t.t_threads[f] = NULL;
5028         }
5029 #if KMP_NESTED_HOT_TEAMS
5030       } // (__kmp_hot_teams_mode == 0)
5031       else {
5032         // When keeping extra threads in team, switch threads to wait on own
5033         // b_go flag
5034         for (f = new_nproc; f < team->t.t_nproc; ++f) {
5035           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5036           kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5037           for (int b = 0; b < bs_last_barrier; ++b) {
5038             if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5039               balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5040             }
5041             KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5042           }
5043         }
5044       }
5045 #endif // KMP_NESTED_HOT_TEAMS
5046       team->t.t_nproc = new_nproc;
5047       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5048       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5049       __kmp_reinitialize_team(team, new_icvs,
5050                               root->r.r_uber_thread->th.th_ident);
5051 
5052       // Update remaining threads
5053       for (f = 0; f < new_nproc; ++f) {
5054         team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5055       }
5056 
5057       // restore the current task state of the master thread: should be the
5058       // implicit task
5059       KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5060                     team->t.t_threads[0], team));
5061 
5062       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5063 
5064 #ifdef KMP_DEBUG
5065       for (f = 0; f < team->t.t_nproc; f++) {
5066         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5067                          team->t.t_threads[f]->th.th_team_nproc ==
5068                              team->t.t_nproc);
5069       }
5070 #endif
5071 
5072       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5073 #if KMP_AFFINITY_SUPPORTED
5074       __kmp_partition_places(team);
5075 #endif
5076     } else { // team->t.t_nproc < new_nproc
5077 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5078       kmp_affin_mask_t *old_mask;
5079       if (KMP_AFFINITY_CAPABLE()) {
5080         KMP_CPU_ALLOC(old_mask);
5081       }
5082 #endif
5083 
5084       KA_TRACE(20,
5085                ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5086                 new_nproc));
5087 
5088       team->t.t_size_changed = 1;
5089 
5090 #if KMP_NESTED_HOT_TEAMS
5091       int avail_threads = hot_teams[level].hot_team_nth;
5092       if (new_nproc < avail_threads)
5093         avail_threads = new_nproc;
5094       kmp_info_t **other_threads = team->t.t_threads;
5095       for (f = team->t.t_nproc; f < avail_threads; ++f) {
5096         // Adjust barrier data of reserved threads (if any) of the team
5097         // Other data will be set in __kmp_initialize_info() below.
5098         int b;
5099         kmp_balign_t *balign = other_threads[f]->th.th_bar;
5100         for (b = 0; b < bs_last_barrier; ++b) {
5101           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5102           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5103 #if USE_DEBUGGER
5104           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5105 #endif
5106         }
5107       }
5108       if (hot_teams[level].hot_team_nth >= new_nproc) {
        // we have all needed threads in reserve, no need to allocate any;
        // this is only possible in mode 1 (mode 0 cannot have reserved
        // threads)
5111         KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5112         team->t.t_nproc = new_nproc; // just get reserved threads involved
5113       } else {
5114         // we may have some threads in reserve, but not enough
5115         team->t.t_nproc =
5116             hot_teams[level]
5117                 .hot_team_nth; // get reserved threads involved if any
5118         hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5119 #endif // KMP_NESTED_HOT_TEAMS
5120         if (team->t.t_max_nproc < new_nproc) {
5121           /* reallocate larger arrays */
5122           __kmp_reallocate_team_arrays(team, new_nproc);
5123           __kmp_reinitialize_team(team, new_icvs, NULL);
5124         }
5125 
5126 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
        /* Temporarily set the full mask for the master thread before creating
           the workers. The reason is that workers inherit the affinity from
           the master, so if many workers are created quickly on a single
           core, they don't get a chance to set their own affinity for a long
           time. */
5131         __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5132 #endif
5133 
5134         /* allocate new threads for the hot team */
5135         for (f = team->t.t_nproc; f < new_nproc; f++) {
5136           kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5137           KMP_DEBUG_ASSERT(new_worker);
5138           team->t.t_threads[f] = new_worker;
5139 
          KA_TRACE(20,
                   ("__kmp_allocate_team: team %d init T#%d(%d:%d) arrived: "
                    "join=%llu, plain=%llu\n",
                    team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
                    team->t.t_bar[bs_forkjoin_barrier].b_arrived,
                    team->t.t_bar[bs_plain_barrier].b_arrived));
5146 
5147           { // Initialize barrier data for new threads.
5148             int b;
5149             kmp_balign_t *balign = new_worker->th.th_bar;
5150             for (b = 0; b < bs_last_barrier; ++b) {
5151               balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5152               KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5153                                KMP_BARRIER_PARENT_FLAG);
5154 #if USE_DEBUGGER
5155               balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5156 #endif
5157             }
5158           }
5159         }
5160 
5161 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5162         if (KMP_AFFINITY_CAPABLE()) {
5163           /* Restore initial master thread's affinity mask */
5164           __kmp_set_system_affinity(old_mask, TRUE);
5165           KMP_CPU_FREE(old_mask);
5166         }
5167 #endif
5168 #if KMP_NESTED_HOT_TEAMS
5169       } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5170 #endif // KMP_NESTED_HOT_TEAMS
      /* make sure everyone is synchronized */
5172       int old_nproc = team->t.t_nproc; // save old value and use to update only
5173       // new threads below
5174       __kmp_initialize_team(team, new_nproc, new_icvs,
5175                             root->r.r_uber_thread->th.th_ident);
5176 
5177       /* reinitialize the threads */
5178       KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5179       for (f = 0; f < team->t.t_nproc; ++f)
5180         __kmp_initialize_info(team->t.t_threads[f], team, f,
5181                               __kmp_gtid_from_tid(f, team));
5182 
5183       if (level) { // set th_task_state for new threads in nested hot team
5184         // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5185         // only need to set the th_task_state for the new threads. th_task_state
5186         // for master thread will not be accurate until after this in
5187         // __kmp_fork_call(), so we look to the master's memo_stack to get the
5188         // correct value.
5189         for (f = old_nproc; f < team->t.t_nproc; ++f)
5190           team->t.t_threads[f]->th.th_task_state =
5191               team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5192       } else { // set th_task_state for new threads in non-nested hot team
5193         int old_state =
5194             team->t.t_threads[0]->th.th_task_state; // copy master's state
5195         for (f = old_nproc; f < team->t.t_nproc; ++f)
5196           team->t.t_threads[f]->th.th_task_state = old_state;
5197       }
5198 
5199 #ifdef KMP_DEBUG
5200       for (f = 0; f < team->t.t_nproc; ++f) {
5201         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5202                          team->t.t_threads[f]->th.th_team_nproc ==
5203                              team->t.t_nproc);
5204       }
5205 #endif
5206 
5207       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5208 #if KMP_AFFINITY_SUPPORTED
5209       __kmp_partition_places(team);
5210 #endif
5211     } // Check changes in number of threads
5212 
5213     kmp_info_t *master = team->t.t_threads[0];
5214     if (master->th.th_teams_microtask) {
5215       for (f = 1; f < new_nproc; ++f) {
5216         // propagate teams construct specific info to workers
5217         kmp_info_t *thr = team->t.t_threads[f];
5218         thr->th.th_teams_microtask = master->th.th_teams_microtask;
5219         thr->th.th_teams_level = master->th.th_teams_level;
5220         thr->th.th_teams_size = master->th.th_teams_size;
5221       }
5222     }
5223 #if KMP_NESTED_HOT_TEAMS
5224     if (level) {
5225       // Sync barrier state for nested hot teams, not needed for outermost hot
5226       // team.
5227       for (f = 1; f < new_nproc; ++f) {
5228         kmp_info_t *thr = team->t.t_threads[f];
5229         int b;
5230         kmp_balign_t *balign = thr->th.th_bar;
5231         for (b = 0; b < bs_last_barrier; ++b) {
5232           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5233           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5234 #if USE_DEBUGGER
5235           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5236 #endif
5237         }
5238       }
5239     }
5240 #endif // KMP_NESTED_HOT_TEAMS
5241 
5242     /* reallocate space for arguments if necessary */
5243     __kmp_alloc_argv_entries(argc, team, TRUE);
5244     KMP_CHECK_UPDATE(team->t.t_argc, argc);
5245     // The hot team re-uses the previous task team,
5246     // if untouched during the previous release->gather phase.
5247 
5248     KF_TRACE(10, (" hot_team = %p\n", team));
5249 
5250 #if KMP_DEBUG
5251     if (__kmp_tasking_mode != tskm_immediate_exec) {
5252       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5253                     "task_team[1] = %p after reinit\n",
5254                     team->t.t_task_team[0], team->t.t_task_team[1]));
5255     }
5256 #endif
5257 
5258 #if OMPT_SUPPORT
5259     __ompt_team_assign_id(team, ompt_parallel_data);
5260 #endif
5261 
5262     KMP_MB();
5263 
5264     return team;
5265   }
5266 
5267   /* next, let's try to take one from the team pool */
5268   KMP_MB();
5269   for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5270     /* TODO: consider resizing undersized teams instead of reaping them, now
5271        that we have a resizing mechanism */
5272     if (team->t.t_max_nproc >= max_nproc) {
5273       /* take this team from the team pool */
5274       __kmp_team_pool = team->t.t_next_pool;
5275 
5276       /* setup the team for fresh use */
5277       __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5278 
5279       KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5280                     "task_team[1] %p to NULL\n",
5281                     &team->t.t_task_team[0], &team->t.t_task_team[1]));
5282       team->t.t_task_team[0] = NULL;
5283       team->t.t_task_team[1] = NULL;
5284 
5285       /* reallocate space for arguments if necessary */
5286       __kmp_alloc_argv_entries(argc, team, TRUE);
5287       KMP_CHECK_UPDATE(team->t.t_argc, argc);
5288 
5289       KA_TRACE(
5290           20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5291                team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5292       { // Initialize barrier data.
5293         int b;
5294         for (b = 0; b < bs_last_barrier; ++b) {
5295           team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5296 #if USE_DEBUGGER
5297           team->t.t_bar[b].b_master_arrived = 0;
5298           team->t.t_bar[b].b_team_arrived = 0;
5299 #endif
5300         }
5301       }
5302 
5303       team->t.t_proc_bind = new_proc_bind;
5304 
5305       KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5306                     team->t.t_id));
5307 
5308 #if OMPT_SUPPORT
5309       __ompt_team_assign_id(team, ompt_parallel_data);
5310 #endif
5311 
5312       KMP_MB();
5313 
5314       return team;
5315     }
5316 
5317     /* reap team if it is too small, then loop back and check the next one */
    // not sure if this is wise, but it will be redone during the hot-teams
    // rewrite.
5320     /* TODO: Use technique to find the right size hot-team, don't reap them */
5321     team = __kmp_reap_team(team);
5322     __kmp_team_pool = team;
5323   }
5324 
5325   /* nothing available in the pool, no matter, make a new team! */
5326   KMP_MB();
5327   team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5328 
5329   /* and set it up */
5330   team->t.t_max_nproc = max_nproc;
  /* NOTE: for some reason, allocating one big buffer and dividing it up
     seems to really hurt performance on the P4, so let's not use this. */
5333   __kmp_allocate_team_arrays(team, max_nproc);
5334 
5335   KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5336   __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5337 
5338   KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5339                 "%p to NULL\n",
5340                 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5341   team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5342   // memory, no need to duplicate
5343   team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5344   // memory, no need to duplicate
5345 
5346   if (__kmp_storage_map) {
5347     __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5348   }
5349 
5350   /* allocate space for arguments */
5351   __kmp_alloc_argv_entries(argc, team, FALSE);
5352   team->t.t_argc = argc;
5353 
5354   KA_TRACE(20,
5355            ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5356             team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5357   { // Initialize barrier data.
5358     int b;
5359     for (b = 0; b < bs_last_barrier; ++b) {
5360       team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5361 #if USE_DEBUGGER
5362       team->t.t_bar[b].b_master_arrived = 0;
5363       team->t.t_bar[b].b_team_arrived = 0;
5364 #endif
5365     }
5366   }
5367 
5368   team->t.t_proc_bind = new_proc_bind;
5369 
5370 #if OMPT_SUPPORT
5371   __ompt_team_assign_id(team, ompt_parallel_data);
5372   team->t.ompt_serialized_team_info = NULL;
5373 #endif
5374 
5375   KMP_MB();
5376 
5377   KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5378                 team->t.t_id));
5379 
5380   return team;
5381 }
5382 
5383 /* TODO implement hot-teams at all levels */
5384 /* TODO implement lazy thread release on demand (disband request) */
5385 
5386 /* free the team.  return it to the team pool.  release all the threads
5387  * associated with it */
5388 void __kmp_free_team(kmp_root_t *root,
5389                      kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5390   int f;
5391   KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5392                 team->t.t_id));
5393 
5394   /* verify state */
5395   KMP_DEBUG_ASSERT(root);
5396   KMP_DEBUG_ASSERT(team);
5397   KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5398   KMP_DEBUG_ASSERT(team->t.t_threads);
5399 
5400   int use_hot_team = team == root->r.r_hot_team;
5401 #if KMP_NESTED_HOT_TEAMS
5402   int level;
5403   kmp_hot_team_ptr_t *hot_teams;
5404   if (master) {
5405     level = team->t.t_active_level - 1;
5406     if (master->th.th_teams_microtask) { // in teams construct?
5407       if (master->th.th_teams_size.nteams > 1) {
5408         ++level; // level was not increased in teams construct for
5409         // team_of_masters
5410       }
5411       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5412           master->th.th_teams_level == team->t.t_level) {
5413         ++level; // level was not increased in teams construct for
5414         // team_of_workers before the parallel
5415       } // team->t.t_level will be increased inside parallel
5416     }
5417     hot_teams = master->th.th_hot_teams;
5418     if (level < __kmp_hot_teams_max_level) {
5419       KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5420       use_hot_team = 1;
5421     }
5422   }
5423 #endif // KMP_NESTED_HOT_TEAMS
5424 
5425   /* team is done working */
5426   TCW_SYNC_PTR(team->t.t_pkfn,
5427                NULL); // Important for Debugging Support Library.
5428 #if KMP_OS_WINDOWS
5429   team->t.t_copyin_counter = 0; // init counter for possible reuse
5430 #endif
5431   // Do not reset pointer to parent team to NULL for hot teams.
5432 
5433   /* if we are a non-hot team, release our threads */
5434   if (!use_hot_team) {
5435     if (__kmp_tasking_mode != tskm_immediate_exec) {
5436       // Wait for threads to reach reapable state
5437       for (f = 1; f < team->t.t_nproc; ++f) {
5438         KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5439         kmp_info_t *th = team->t.t_threads[f];
5440         volatile kmp_uint32 *state = &th->th.th_reap_state;
5441         while (*state != KMP_SAFE_TO_REAP) {
5442 #if KMP_OS_WINDOWS
5443           // On Windows a thread can be killed at any time, check this
5444           DWORD ecode;
5445           if (!__kmp_is_thread_alive(th, &ecode)) {
5446             *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5447             break;
5448           }
5449 #endif
5450           // first check if thread is sleeping
5451           kmp_flag_64 fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5452           if (fl.is_sleeping())
5453             fl.resume(__kmp_gtid_from_thread(th));
5454           KMP_CPU_PAUSE();
5455         }
5456       }
5457 
5458       // Delete task teams
5459       int tt_idx;
5460       for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5461         kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5462         if (task_team != NULL) {
5463           for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5464             KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5465             team->t.t_threads[f]->th.th_task_team = NULL;
5466           }
5467           KA_TRACE(
5468               20,
5469               ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5470                __kmp_get_gtid(), task_team, team->t.t_id));
5471 #if KMP_NESTED_HOT_TEAMS
5472           __kmp_free_task_team(master, task_team);
5473 #endif
5474           team->t.t_task_team[tt_idx] = NULL;
5475         }
5476       }
5477     }
5478 
5479     // Reset pointer to parent team only for non-hot teams.
5480     team->t.t_parent = NULL;
5481     team->t.t_level = 0;
5482     team->t.t_active_level = 0;
5483 
5484     /* free the worker threads */
5485     for (f = 1; f < team->t.t_nproc; ++f) {
5486       KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5487       __kmp_free_thread(team->t.t_threads[f]);
5488       team->t.t_threads[f] = NULL;
5489     }
5490 
5491     /* put the team back in the team pool */
5492     /* TODO limit size of team pool, call reap_team if pool too large */
5493     team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5494     __kmp_team_pool = (volatile kmp_team_t *)team;
5495   } else { // Check if team was created for the masters in a teams construct
5496     // See if first worker is a CG root
5497     KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5498                      team->t.t_threads[1]->th.th_cg_roots);
5499     if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5500       // Clean up the CG root nodes on workers so that this team can be re-used
5501       for (f = 1; f < team->t.t_nproc; ++f) {
5502         kmp_info_t *thr = team->t.t_threads[f];
5503         KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5504                          thr->th.th_cg_roots->cg_root == thr);
5505         // Pop current CG root off list
5506         kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5507         thr->th.th_cg_roots = tmp->up;
5508         KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5509                        " up to node %p. cg_nthreads was %d\n",
5510                        thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5511         int i = tmp->cg_nthreads--;
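        // Post-decrement: i is the group's thread count before this thread
        // left, so i == 1 means this thread was the last member.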
5512         if (i == 1) {
5513           __kmp_free(tmp); // free CG if we are the last thread in it
5514         }
5515         // Restore current task's thread_limit from CG root
5516         if (thr->th.th_cg_roots)
5517           thr->th.th_current_task->td_icvs.thread_limit =
5518               thr->th.th_cg_roots->cg_thread_limit;
5519       }
5520     }
5521   }
5522 
5523   KMP_MB();
5524 }
5525 
5526 /* reap the team.  destroy it, reclaim all its resources and free its memory */
5527 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5528   kmp_team_t *next_pool = team->t.t_next_pool;
5529 
5530   KMP_DEBUG_ASSERT(team);
5531   KMP_DEBUG_ASSERT(team->t.t_dispatch);
5532   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5533   KMP_DEBUG_ASSERT(team->t.t_threads);
5534   KMP_DEBUG_ASSERT(team->t.t_argv);
5535 
5536   /* TODO clean the threads that are a part of this? */
5537 
5538   /* free stuff */
5539   __kmp_free_team_arrays(team);
5540   if (team->t.t_argv != &team->t.t_inline_argv[0])
5541     __kmp_free((void *)team->t.t_argv);
5542   __kmp_free(team);
5543 
5544   KMP_MB();
5545   return next_pool;
5546 }
5547 
5548 // Free the thread.  Don't reap it, just place it on the pool of available
5549 // threads.
5550 //
5551 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5552 // binding for the affinity mechanism to be useful.
5553 //
5554 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5555 // However, we want to avoid the potential performance problem of always
5556 // scanning through the list to find the correct point at which to insert
5557 // the thread (potential N**2 behavior).  To do this we keep track of the
5558 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5559 // With single-level parallelism, threads will always be added to the tail
5560 // of the list, kept track of by __kmp_thread_pool_insert_pt.  With nested
5561 // parallelism, all bets are off and we may need to scan through the entire
5562 // free list.
5563 //
5564 // This change also has a potentially large performance benefit for some
5565 // applications.  Previously, as threads were freed from the hot team, they
5566 // would be placed back on the free list in inverse order.  If the hot team
5567 // grew back to its original size, then the freed threads would be placed
5568 // back on the hot team in reverse order.  This could cause bad cache
5569 // locality problems on programs where the size of the hot team regularly
5570 // grew and shrank.
5571 //
5572 // Now, for single-level parallelism, the OMP tid is always == gtid.
5573 void __kmp_free_thread(kmp_info_t *this_th) {
5574   int gtid;
5575   kmp_info_t **scan;
5576 
5577   KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5578                 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5579 
5580   KMP_DEBUG_ASSERT(this_th);
5581 
5582   // When moving the thread to the pool, switch it to wait on its own b_go
5583   // flag and an uninitialized (NULL) team.
5584   int b;
5585   kmp_balign_t *balign = this_th->th.th_bar;
5586   for (b = 0; b < bs_last_barrier; ++b) {
5587     if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5588       balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5589     balign[b].bb.team = NULL;
5590     balign[b].bb.leaf_kids = 0;
5591   }
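  // Reset the task state and mark the thread as safe to reap (__kmp_free_team
  // waits for KMP_SAFE_TO_REAP before deleting a team's task teams).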
5592   this_th->th.th_task_state = 0;
5593   this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5594 
5595   /* put thread back on the free pool */
5596   TCW_PTR(this_th->th.th_team, NULL);
5597   TCW_PTR(this_th->th.th_root, NULL);
5598   TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5599 
5600   while (this_th->th.th_cg_roots) {
5601     this_th->th.th_cg_roots->cg_nthreads--;
5602     KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5603                    " %p of thread  %p to %d\n",
5604                    this_th, this_th->th.th_cg_roots,
5605                    this_th->th.th_cg_roots->cg_root,
5606                    this_th->th.th_cg_roots->cg_nthreads));
5607     kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5608     if (tmp->cg_root == this_th) { // Thread is a cg_root
5609       KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5610       KA_TRACE(
5611           5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5612       this_th->th.th_cg_roots = tmp->up;
5613       __kmp_free(tmp);
5614     } else { // Worker thread
5615       if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5616         __kmp_free(tmp);
5617       }
5618       this_th->th.th_cg_roots = NULL;
5619       break;
5620     }
5621   }
5622 
5623   /* If the implicit task assigned to this thread can be used by other
5624    * threads, then multiple threads can share the task data and try to free
5625    * it in __kmp_reap_thread at exit. This duplicate use of the task data
5626    * happens with higher probability when the hot team is disabled, but it
5627    * can occur even when the hot team is enabled. */
5628   __kmp_free_implicit_task(this_th);
5629   this_th->th.th_current_task = NULL;
5630 
5631   // If the __kmp_thread_pool_insert_pt is already past the new insert
5632   // point, then we need to re-scan the entire list.
5633   gtid = this_th->th.th_info.ds.ds_gtid;
5634   if (__kmp_thread_pool_insert_pt != NULL) {
5635     KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5636     if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5637       __kmp_thread_pool_insert_pt = NULL;
5638     }
5639   }
5640 
5641   // Scan down the list to find the place to insert the thread.
5642   // scan is the address of a link in the list, possibly the address of
5643   // __kmp_thread_pool itself.
5644   //
5645   // In the absence of nested parallelism, the for loop will have 0 iterations.
5646   if (__kmp_thread_pool_insert_pt != NULL) {
5647     scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5648   } else {
5649     scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5650   }
5651   for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5652        scan = &((*scan)->th.th_next_pool))
5653     ;
5654 
5655   // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5656   // to its address.
5657   TCW_PTR(this_th->th.th_next_pool, *scan);
5658   __kmp_thread_pool_insert_pt = *scan = this_th;
5659   KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5660                    (this_th->th.th_info.ds.ds_gtid <
5661                     this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5662   TCW_4(this_th->th.th_in_pool, TRUE);
5663   __kmp_suspend_initialize_thread(this_th);
5664   __kmp_lock_suspend_mx(this_th);
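  // If the thread was still active (spinning rather than sleeping) when it was
  // put in the pool, count it among the active pooled threads.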
5665   if (this_th->th.th_active == TRUE) {
5666     KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5667     this_th->th.th_active_in_pool = TRUE;
5668   }
5669 #if KMP_DEBUG
5670   else {
5671     KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5672   }
5673 #endif
5674   __kmp_unlock_suspend_mx(this_th);
5675 
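  // A pooled thread no longer counts toward __kmp_nth; __kmp_all_nth is only
  // decremented later, when the thread is actually reaped.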
5676   TCW_4(__kmp_nth, __kmp_nth - 1);
5677 
5678 #ifdef KMP_ADJUST_BLOCKTIME
5679   /* Adjust blocktime back to user setting or default if necessary */
5680   /* Middle initialization might never have occurred                */
5681   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5682     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5683     if (__kmp_nth <= __kmp_avail_proc) {
5684       __kmp_zero_bt = FALSE;
5685     }
5686   }
5687 #endif /* KMP_ADJUST_BLOCKTIME */
5688 
5689   KMP_MB();
5690 }
5691 
5692 /* ------------------------------------------------------------------------ */
5693 
5694 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5695   int gtid = this_thr->th.th_info.ds.ds_gtid;
5696   /*    void                 *stack_data;*/
5697   kmp_team_t **volatile pteam;
5698 
5699   KMP_MB();
5700   KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5701 
5702   if (__kmp_env_consistency_check) {
5703     this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5704   }
5705 
5706 #if OMPT_SUPPORT
5707   ompt_data_t *thread_data;
5708   if (ompt_enabled.enabled) {
5709     thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5710     *thread_data = ompt_data_none;
5711 
5712     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5713     this_thr->th.ompt_thread_info.wait_id = 0;
5714     this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5715     this_thr->th.ompt_thread_info.parallel_flags = 0;
5716     if (ompt_enabled.ompt_callback_thread_begin) {
5717       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5718           ompt_thread_worker, thread_data);
5719     }
5720     this_thr->th.ompt_thread_info.state = ompt_state_idle;
5721   }
5722 #endif
5723 
5724   /* This is the place where threads wait for work */
5725   while (!TCR_4(__kmp_global.g.g_done)) {
5726     KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5727     KMP_MB();
5728 
5729     /* wait for work to do */
5730     KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5731 
5732     /* No tid yet since not part of a team */
5733     __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5734 
5735 #if OMPT_SUPPORT
5736     if (ompt_enabled.enabled) {
5737       this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5738     }
5739 #endif
5740 
5741     pteam = &this_thr->th.th_team;
5742 
5743     /* have we been allocated? */
5744     if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5745       /* we were just woken up, so run our new task */
5746       if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5747         int rc;
5748         KA_TRACE(20,
5749                  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5750                   gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5751                   (*pteam)->t.t_pkfn));
5752 
5753         updateHWFPControl(*pteam);
5754 
5755 #if OMPT_SUPPORT
5756         if (ompt_enabled.enabled) {
5757           this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5758         }
5759 #endif
5760 
5761         rc = (*pteam)->t.t_invoke(gtid);
5762         KMP_ASSERT(rc);
5763 
5764         KMP_MB();
5765         KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5766                       gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5767                       (*pteam)->t.t_pkfn));
5768       }
5769 #if OMPT_SUPPORT
5770       if (ompt_enabled.enabled) {
5771         /* no frame set while outside task */
5772         __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
5773 
5774         this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5775       }
5776 #endif
5777       /* join barrier after parallel region */
5778       __kmp_join_barrier(gtid);
5779     }
5780   }
5781   TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5782 
5783 #if OMPT_SUPPORT
5784   if (ompt_enabled.ompt_callback_thread_end) {
5785     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5786   }
5787 #endif
5788 
5789   this_thr->th.th_task_team = NULL;
5790   /* run the destructors for the threadprivate data for this thread */
5791   __kmp_common_destroy_gtid(gtid);
5792 
5793   KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5794   KMP_MB();
5795   return this_thr;
5796 }
5797 
5798 /* ------------------------------------------------------------------------ */
5799 
5800 void __kmp_internal_end_dest(void *specific_gtid) {
5801 #if KMP_COMPILER_ICC
5802 #pragma warning(push)
5803 #pragma warning(disable : 810) // conversion from "void *" to "int" may lose
5804 // significant bits
5805 #endif
5806   // Make sure no significant bits are lost
5807   int gtid = (kmp_intptr_t)specific_gtid - 1;
5808 #if KMP_COMPILER_ICC
5809 #pragma warning(pop)
5810 #endif
5811 
5812   KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5813   /* NOTE: the gtid is stored as gtid+1 in the thread-local storage
5814    * because 0 is reserved for the nothing-stored case */
5815 
5816   /* josh: One reason for setting the gtid specific data even when it is being
5817      destroyed by pthread is to allow gtid lookup through thread specific data
5818      (__kmp_gtid_get_specific).  Some of the code, especially stat code,
5819      that gets executed in the call to __kmp_internal_end_thread, actually
5820      gets the gtid through the thread specific data.  Setting it here seems
5821      rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread
5822      to run smoothly.
5823      todo: get rid of this after we remove the dependence on
5824      __kmp_gtid_get_specific  */
5825   if (gtid >= 0 && KMP_UBER_GTID(gtid))
5826     __kmp_gtid_set_specific(gtid);
5827 #ifdef KMP_TDATA_GTID
5828   __kmp_gtid = gtid;
5829 #endif
5830   __kmp_internal_end_thread(gtid);
5831 }
5832 
5833 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5834 
5835 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5836   __kmp_internal_end_atexit();
5837 }
5838 
5839 #endif
5840 
5841 /* [Windows] josh: when the atexit handler is called, there may still be more
5842    than one thread alive */
5843 void __kmp_internal_end_atexit(void) {
5844   KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5845   /* [Windows]
5846      josh: ideally, we want to completely shut down the library in this atexit
5847      handler, but stat code that depends on thread specific data for gtid fails
5848      because that data becomes unavailable at some point during the shutdown, so
5849      we call __kmp_internal_end_thread instead. We should eventually remove the
5850      dependency on __kmp_get_specific_gtid in the stat code and use
5851      __kmp_internal_end_library to cleanly shut down the library.
5852 
5853      // TODO: Can some of this comment about GVS be removed?
5854      I suspect that the offending stat code is executed when the calling thread
5855      tries to clean up a dead root thread's data structures, resulting in GVS
5856      code trying to close the GVS structures for that thread, but since the stat
5857      code uses __kmp_get_specific_gtid to get the gtid with the assumption that
5858      the calling thread is cleaning up itself instead of another thread, it gets
5859      confused. This happens because allowing a thread to unregister and clean up
5860      another thread is a recent modification for addressing an issue.
5861      Based on the current design (20050722), a thread may end up
5862      trying to unregister another thread only if thread death does not trigger
5863      the calling of __kmp_internal_end_thread.  For Linux* OS, there is the
5864      thread specific data destructor function to detect thread death. For
5865      Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
5866      is nothing.  Thus, the workaround is applicable only for Windows static
5867      stat library. */
5868   __kmp_internal_end_library(-1);
5869 #if KMP_OS_WINDOWS
5870   __kmp_close_console();
5871 #endif
5872 }
5873 
5874 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5875   // It is assumed __kmp_forkjoin_lock is acquired.
5876 
5877   int gtid;
5878 
5879   KMP_DEBUG_ASSERT(thread != NULL);
5880 
5881   gtid = thread->th.th_info.ds.ds_gtid;
5882 
5883   if (!is_root) {
5884     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5885       /* Assume the threads are at the fork barrier here */
5886       KA_TRACE(
5887           20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5888                gtid));
5889       /* Need release fence here to prevent seg faults for tree forkjoin barrier
5890        * (GEH) */
5891       ANNOTATE_HAPPENS_BEFORE(thread);
5892       kmp_flag_64 flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
5893       __kmp_release_64(&flag);
5894     }
5895 
5896     // Terminate OS thread.
5897     __kmp_reap_worker(thread);
5898 
5899     // The thread was killed asynchronously.  If it was actively
5900     // spinning in the thread pool, decrement the global count.
5901     //
5902     // There is a small timing hole here - if the worker thread was just waking
5903     // up after sleeping in the pool, had reset its th_active_in_pool flag, but
5904     // had not yet decremented the global counter __kmp_thread_pool_active_nth,
5905     // then the global counter might not get updated.
5906     //
5907     // Currently, this can only happen as the library is unloaded,
5908     // so there are no harmful side effects.
5909     if (thread->th.th_active_in_pool) {
5910       thread->th.th_active_in_pool = FALSE;
5911       KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
5912       KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
5913     }
5914   }
5915 
5916   __kmp_free_implicit_task(thread);
5917 
5918 // Free the fast memory for tasking
5919 #if USE_FAST_MEMORY
5920   __kmp_free_fast_memory(thread);
5921 #endif /* USE_FAST_MEMORY */
5922 
5923   __kmp_suspend_uninitialize_thread(thread);
5924 
5925   KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
5926   TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5927 
5928   --__kmp_all_nth;
5929 // __kmp_nth was decremented when thread is added to the pool.
5930 
5931 #ifdef KMP_ADJUST_BLOCKTIME
5932   /* Adjust blocktime back to user setting or default if necessary */
5933   /* Middle initialization might never have occurred                */
5934   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5935     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5936     if (__kmp_nth <= __kmp_avail_proc) {
5937       __kmp_zero_bt = FALSE;
5938     }
5939   }
5940 #endif /* KMP_ADJUST_BLOCKTIME */
5941 
5942   /* free the memory being used */
5943   if (__kmp_env_consistency_check) {
5944     if (thread->th.th_cons) {
5945       __kmp_free_cons_stack(thread->th.th_cons);
5946       thread->th.th_cons = NULL;
5947     }
5948   }
5949 
5950   if (thread->th.th_pri_common != NULL) {
5951     __kmp_free(thread->th.th_pri_common);
5952     thread->th.th_pri_common = NULL;
5953   }
5954 
5955   if (thread->th.th_task_state_memo_stack != NULL) {
5956     __kmp_free(thread->th.th_task_state_memo_stack);
5957     thread->th.th_task_state_memo_stack = NULL;
5958   }
5959 
5960 #if KMP_USE_BGET
5961   if (thread->th.th_local.bget_data != NULL) {
5962     __kmp_finalize_bget(thread);
5963   }
5964 #endif
5965 
5966 #if KMP_AFFINITY_SUPPORTED
5967   if (thread->th.th_affin_mask != NULL) {
5968     KMP_CPU_FREE(thread->th.th_affin_mask);
5969     thread->th.th_affin_mask = NULL;
5970   }
5971 #endif /* KMP_AFFINITY_SUPPORTED */
5972 
5973 #if KMP_USE_HIER_SCHED
5974   if (thread->th.th_hier_bar_data != NULL) {
5975     __kmp_free(thread->th.th_hier_bar_data);
5976     thread->th.th_hier_bar_data = NULL;
5977   }
5978 #endif
5979 
5980   __kmp_reap_team(thread->th.th_serial_team);
5981   thread->th.th_serial_team = NULL;
5982   __kmp_free(thread);
5983 
5984   KMP_MB();
5985 
5986 } // __kmp_reap_thread
5987 
5988 static void __kmp_internal_end(void) {
5989   int i;
5990 
5991   /* First, unregister the library */
5992   __kmp_unregister_library();
5993 
5994 #if KMP_OS_WINDOWS
5995   /* In Win static library, we can't tell when a root actually dies, so we
5996      reclaim the data structures for any root threads that have died but not
5997      unregistered themselves, in order to shut down cleanly.
5998      In Win dynamic library we also can't tell when a thread dies.  */
5999   __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6000 // dead roots
6001 #endif
6002 
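  // Look for a root that is still active; if one is found (i stops below
  // __kmp_threads_capacity), skip reaping the thread and team pools.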
6003   for (i = 0; i < __kmp_threads_capacity; i++)
6004     if (__kmp_root[i])
6005       if (__kmp_root[i]->r.r_active)
6006         break;
6007   KMP_MB(); /* Flush all pending memory write invalidates.  */
6008   TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6009 
6010   if (i < __kmp_threads_capacity) {
6011 #if KMP_USE_MONITOR
6012     // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6013     KMP_MB(); /* Flush all pending memory write invalidates.  */
6014 
6015     // Need to check that monitor was initialized before reaping it. If we are
6016     // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6017     // __kmp_monitor will appear to contain valid data, but it is only valid in
6018     // the parent process, not the child.
6019     // New behavior (201008): instead of keying off of the flag
6020     // __kmp_init_parallel, the monitor thread creation is keyed off
6021     // of the new flag __kmp_init_monitor.
6022     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6023     if (TCR_4(__kmp_init_monitor)) {
6024       __kmp_reap_monitor(&__kmp_monitor);
6025       TCW_4(__kmp_init_monitor, 0);
6026     }
6027     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6028     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6029 #endif // KMP_USE_MONITOR
6030   } else {
6031 /* TODO move this to cleanup code */
6032 #ifdef KMP_DEBUG
6033     /* make sure that everything has properly ended */
6034     for (i = 0; i < __kmp_threads_capacity; i++) {
6035       if (__kmp_root[i]) {
6036         // KMP_ASSERT( ! KMP_UBER_GTID( i ) );
6037         // AC: there can be uber threads alive here
6038         KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6039       }
6040     }
6041 #endif
6042 
6043     KMP_MB();
6044 
6045     // Reap the worker threads.
6046     // This is valid for now, but be careful if threads are reaped sooner.
6047     while (__kmp_thread_pool != NULL) { // Loop through all threads in the pool.
6048       // Get the next thread from the pool.
6049       kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6050       __kmp_thread_pool = thread->th.th_next_pool;
6051       // Reap it.
6052       KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6053       thread->th.th_next_pool = NULL;
6054       thread->th.th_in_pool = FALSE;
6055       __kmp_reap_thread(thread, 0);
6056     }
6057     __kmp_thread_pool_insert_pt = NULL;
6058 
6059     // Reap teams.
6060     while (__kmp_team_pool != NULL) { // Loop through all teams in the pool.
6061       // Get the next team from the pool.
6062       kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6063       __kmp_team_pool = team->t.t_next_pool;
6064       // Reap it.
6065       team->t.t_next_pool = NULL;
6066       __kmp_reap_team(team);
6067     }
6068 
6069     __kmp_reap_task_teams();
6070 
6071 #if KMP_OS_UNIX
6072     // Threads that are not reaped should not access any resources since they
6073     // are going to be deallocated soon, so the shutdown sequence should wait
6074     // until all threads either exit the final spin-waiting loop or begin
6075     // sleeping after the given blocktime.
6076     for (i = 0; i < __kmp_threads_capacity; i++) {
6077       kmp_info_t *thr = __kmp_threads[i];
6078       while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6079         KMP_CPU_PAUSE();
6080     }
6081 #endif
6082 
6083     for (i = 0; i < __kmp_threads_capacity; ++i) {
6084       // TBD: Add some checking...
6085       // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6086     }
6087 
6088     /* Make sure all threadprivate destructors get run by joining with all
6089        worker threads before resetting this flag */
6090     TCW_SYNC_4(__kmp_init_common, FALSE);
6091 
6092     KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6093     KMP_MB();
6094 
6095 #if KMP_USE_MONITOR
6096     // See note above: One of the possible fixes for CQ138434 / CQ140126
6097     //
6098     // FIXME: push both code fragments down and CSE them?
6099     // push them into __kmp_cleanup() ?
6100     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6101     if (TCR_4(__kmp_init_monitor)) {
6102       __kmp_reap_monitor(&__kmp_monitor);
6103       TCW_4(__kmp_init_monitor, 0);
6104     }
6105     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6106     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6107 #endif
6108   } /* else !__kmp_global.t_active */
6109   TCW_4(__kmp_init_gtid, FALSE);
6110   KMP_MB(); /* Flush all pending memory write invalidates.  */
6111 
6112   __kmp_cleanup();
6113 #if OMPT_SUPPORT
6114   ompt_fini();
6115 #endif
6116 }
6117 
6118 void __kmp_internal_end_library(int gtid_req) {
6119   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6120   /* this shouldn't be a race condition because __kmp_internal_end() is the
6121      only place to clear __kmp_serial_init */
6122   /* we'll check this later too, after we get the lock */
6123   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6124   // redundant, because the next check will work in any case.
6125   if (__kmp_global.g.g_abort) {
6126     KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6127     /* TODO abort? */
6128     return;
6129   }
6130   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6131     KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6132     return;
6133   }
6134 
6135   KMP_MB(); /* Flush all pending memory write invalidates.  */
6136 
6137   /* find out who we are and what we should do */
6138   {
6139     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6140     KA_TRACE(
6141         10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req));
6142     if (gtid == KMP_GTID_SHUTDOWN) {
6143       KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6144                     "already shutdown\n"));
6145       return;
6146     } else if (gtid == KMP_GTID_MONITOR) {
6147       KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6148                     "registered, or system shutdown\n"));
6149       return;
6150     } else if (gtid == KMP_GTID_DNE) {
6151       KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6152                     "shutdown\n"));
6153       /* we don't know who we are, but we may still shut down the library */
6154     } else if (KMP_UBER_GTID(gtid)) {
6155       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6156       if (__kmp_root[gtid]->r.r_active) {
6157         __kmp_global.g.g_abort = -1;
6158         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6159         KA_TRACE(10,
6160                  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6161                   gtid));
6162         return;
6163       } else {
6164         KA_TRACE(
6165             10,
6166             ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6167         __kmp_unregister_root_current_thread(gtid);
6168       }
6169     } else {
6170 /* worker threads may call this function through the atexit handler, if they
6171  * call exit() */
6172 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6173    TODO: do a thorough shutdown instead */
6174 #ifdef DUMP_DEBUG_ON_EXIT
6175       if (__kmp_debug_buf)
6176         __kmp_dump_debug_buffer();
6177 #endif
6178       return;
6179     }
6180   }
6181   /* synchronize the termination process */
6182   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6183 
6184   /* have we already finished */
6185   if (__kmp_global.g.g_abort) {
6186     KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6187     /* TODO abort? */
6188     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6189     return;
6190   }
6191   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6192     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6193     return;
6194   }
6195 
6196   /* We need this lock to enforce mutex between this reading of
6197      __kmp_threads_capacity and the writing by __kmp_register_root.
6198      Alternatively, we can use a counter of roots that is atomically updated by
6199      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6200      __kmp_internal_end_*.  */
6201   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6202 
6203   /* now we can safely conduct the actual termination */
6204   __kmp_internal_end();
6205 
6206   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6207   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6208 
6209   KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6210 
6211 #ifdef DUMP_DEBUG_ON_EXIT
6212   if (__kmp_debug_buf)
6213     __kmp_dump_debug_buffer();
6214 #endif
6215 
6216 #if KMP_OS_WINDOWS
6217   __kmp_close_console();
6218 #endif
6219 
6220   __kmp_fini_allocator();
6221 
6222 } // __kmp_internal_end_library
6223 
6224 void __kmp_internal_end_thread(int gtid_req) {
6225   int i;
6226 
6227   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6228   /* this shouldn't be a race condition because __kmp_internal_end() is the
6229    * only place to clear __kmp_serial_init */
6230   /* we'll check this later too, after we get the lock */
6231   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6232   // redundant, because the next check will work in any case.
6233   if (__kmp_global.g.g_abort) {
6234     KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6235     /* TODO abort? */
6236     return;
6237   }
6238   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6239     KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6240     return;
6241   }
6242 
6243   KMP_MB(); /* Flush all pending memory write invalidates.  */
6244 
6245   /* find out who we are and what we should do */
6246   {
6247     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6248     KA_TRACE(10,
6249              ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req));
6250     if (gtid == KMP_GTID_SHUTDOWN) {
6251       KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6252                     "already shutdown\n"));
6253       return;
6254     } else if (gtid == KMP_GTID_MONITOR) {
6255       KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6256                     "registered, or system shutdown\n"));
6257       return;
6258     } else if (gtid == KMP_GTID_DNE) {
6259       KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6260                     "shutdown\n"));
6261       return;
6262       /* we don't know who we are */
6263     } else if (KMP_UBER_GTID(gtid)) {
6264       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6265       if (__kmp_root[gtid]->r.r_active) {
6266         __kmp_global.g.g_abort = -1;
6267         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6268         KA_TRACE(10,
6269                  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6270                   gtid));
6271         return;
6272       } else {
6273         KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6274                       gtid));
6275         __kmp_unregister_root_current_thread(gtid);
6276       }
6277     } else {
6278       /* just a worker thread, let's leave */
6279       KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6280 
6281       if (gtid >= 0) {
6282         __kmp_threads[gtid]->th.th_task_team = NULL;
6283       }
6284 
6285       KA_TRACE(10,
6286                ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6287                 gtid));
6288       return;
6289     }
6290   }
6291 #if KMP_DYNAMIC_LIB
6292   if (__kmp_pause_status != kmp_hard_paused)
6293   // AC: let's not shut down the dynamic library at the exit of an uber
6294   // thread; it is better to shut down later in the library destructor.
6295   {
6296     KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6297     return;
6298   }
6299 #endif
6300   /* synchronize the termination process */
6301   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6302 
6303   /* have we already finished */
6304   if (__kmp_global.g.g_abort) {
6305     KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6306     /* TODO abort? */
6307     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6308     return;
6309   }
6310   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6311     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6312     return;
6313   }
6314 
6315   /* We need this lock to enforce mutex between this reading of
6316      __kmp_threads_capacity and the writing by __kmp_register_root.
6317      Alternatively, we can use a counter of roots that is atomically updated by
6318      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6319      __kmp_internal_end_*.  */
6320 
6321   /* should we finish the run-time?  are all siblings done? */
6322   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6323 
6324   for (i = 0; i < __kmp_threads_capacity; ++i) {
6325     if (KMP_UBER_GTID(i)) {
6326       KA_TRACE(
6327           10,
6328           ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6329       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6330       __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6331       return;
6332     }
6333   }
6334 
6335   /* now we can safely conduct the actual termination */
6336 
6337   __kmp_internal_end();
6338 
6339   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6340   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6341 
6342   KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6343 
6344 #ifdef DUMP_DEBUG_ON_EXIT
6345   if (__kmp_debug_buf)
6346     __kmp_dump_debug_buffer();
6347 #endif
6348 } // __kmp_internal_end_thread
6349 
6350 // -----------------------------------------------------------------------------
6351 // Library registration stuff.
6352 
6353 static long __kmp_registration_flag = 0;
6354 // Random value used to indicate library initialization.
6355 static char *__kmp_registration_str = NULL;
6356 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6357 
6358 static inline char *__kmp_reg_status_name() {
6359   /* On RHEL 3u5, if linked statically, getpid() returns different values in
6360      each thread. If registration and unregistration happen in different
6361      threads (omp_misc_other_root_exit.cpp test case), the registered_lib_env
6362      env var cannot be found, because its name will contain a different pid. */
6363   return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6364 } // __kmp_reg_status_name
6365 
6366 void __kmp_register_library_startup(void) {
6367 
6368   char *name = __kmp_reg_status_name(); // Name of the environment variable.
6369   int done = 0;
6370   union {
6371     double dtime;
6372     long ltime;
6373   } time;
6374 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6375   __kmp_initialize_system_tick();
6376 #endif
6377   __kmp_read_system_time(&time.dtime);
6378   __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6379   __kmp_registration_str =
6380       __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6381                        __kmp_registration_flag, KMP_LIBRARY_FILE);
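  // The value published in the environment has the form
  // "<flag address>-<flag value>-<library file>". A later instance parses it
  // and checks whether that address is still mapped and still holds the value
  // to decide whether the registering instance is alive.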
6382 
6383   KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6384                 __kmp_registration_str));
6385 
6386   while (!done) {
6387 
6388     char *value = NULL; // Actual value of the environment variable.
6389 
6390     // Set the environment variable, but do not overwrite an existing value.
6391     __kmp_env_set(name, __kmp_registration_str, 0);
6392     // Check that the variable was actually written.
6393     value = __kmp_env_get(name);
6394     if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6395 
6396       done = 1; // Ok, environment variable set successfully, exit the loop.
6397 
6398     } else {
6399 
6400       // Oops. Write failed. Another copy of the OpenMP RTL is in memory.
6401       // Check whether it is alive or dead.
6402       int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6403       char *tail = value;
6404       char *flag_addr_str = NULL;
6405       char *flag_val_str = NULL;
6406       char const *file_name = NULL;
6407       __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6408       __kmp_str_split(tail, '-', &flag_val_str, &tail);
6409       file_name = tail;
6410       if (tail != NULL) {
6411         long *flag_addr = 0;
6412         long flag_val = 0;
6413         KMP_SSCANF(flag_addr_str, "%p", RCAST(void**, &flag_addr));
6414         KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6415         if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6416           // First, check whether environment-encoded address is mapped into
6417           // addr space.
6418           // If so, dereference it to see if it still has the right value.
6419           if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6420             neighbor = 1;
6421           } else {
6422             // If not, then we know the other copy of the library is no longer
6423             // running.
6424             neighbor = 2;
6425           }
6426         }
6427       }
6428       switch (neighbor) {
6429       case 0: // Cannot parse environment variable -- neighbor status unknown.
6430         // Assume it is an incompatible format from a future version of the
6431         // library. Assume the other library is alive.
6432         // WARN( ... ); // TODO: Issue a warning.
6433         file_name = "unknown library";
6434         KMP_FALLTHROUGH();
6435       // Attention! Falling through to the next case is intentional.
6436       case 1: { // Neighbor is alive.
6437         // Check it is allowed.
6438         char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6439         if (!__kmp_str_match_true(duplicate_ok)) {
6440           // That's not allowed. Issue fatal error.
6441           __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6442                       KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6443         }
6444         KMP_INTERNAL_FREE(duplicate_ok);
6445         __kmp_duplicate_library_ok = 1;
6446         done = 1; // Exit the loop.
6447       } break;
6448       case 2: { // Neighbor is dead.
6449         // Clear the variable and try to register library again.
6450         __kmp_env_unset(name);
6451       } break;
6452       default: { KMP_DEBUG_ASSERT(0); } break;
6453       }
6454     }
6455     KMP_INTERNAL_FREE((void *)value);
6456   }
6457   KMP_INTERNAL_FREE((void *)name);
6458 
6459 } // func __kmp_register_library_startup
6460 
6461 void __kmp_unregister_library(void) {
6462 
6463   char *name = __kmp_reg_status_name();
6464   char *value = __kmp_env_get(name);
6465 
6466   KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6467   KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6468   if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6469     // Ok, this is our variable. Delete it.
6470     __kmp_env_unset(name);
6471   }
6472 
6473   KMP_INTERNAL_FREE(__kmp_registration_str);
6474   KMP_INTERNAL_FREE(value);
6475   KMP_INTERNAL_FREE(name);
6476 
6477   __kmp_registration_flag = 0;
6478   __kmp_registration_str = NULL;
6479 
6480 } // __kmp_unregister_library
6481 
6482 // End of Library registration stuff.
6483 // -----------------------------------------------------------------------------
6484 
6485 #if KMP_MIC_SUPPORTED
6486 
6487 static void __kmp_check_mic_type() {
6488   kmp_cpuid_t cpuid_state = {0};
6489   kmp_cpuid_t *cs_p = &cpuid_state;
6490   __kmp_x86_cpuid(1, 0, cs_p);
6491   // We don't support mic1 at the moment
6492   if ((cs_p->eax & 0xff0) == 0xB10) {
6493     __kmp_mic_type = mic2;
6494   } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6495     __kmp_mic_type = mic3;
6496   } else {
6497     __kmp_mic_type = non_mic;
6498   }
6499 }
6500 
6501 #endif /* KMP_MIC_SUPPORTED */
6502 
6503 static void __kmp_do_serial_initialize(void) {
6504   int i, gtid;
6505   int size;
6506 
6507   KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6508 
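  // Sanity-check the fixed-width integer and pointer type sizes the runtime
  // relies on.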
6509   KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6510   KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6511   KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6512   KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6513   KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6514 
6515 #if OMPT_SUPPORT
6516   ompt_pre_init();
6517 #endif
6518 
6519   __kmp_validate_locks();
6520 
6521   /* Initialize internal memory allocator */
6522   __kmp_init_allocator();
6523 
6524   /* Register the library startup via an environment variable and check to see
6525      whether another copy of the library is already registered. */
6526 
6527   __kmp_register_library_startup();
6528 
6529   /* TODO reinitialization of library */
6530   if (TCR_4(__kmp_global.g.g_done)) {
6531     KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6532   }
6533 
6534   __kmp_global.g.g_abort = 0;
6535   TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6536 
6537 /* initialize the locks */
6538 #if KMP_USE_ADAPTIVE_LOCKS
6539 #if KMP_DEBUG_ADAPTIVE_LOCKS
6540   __kmp_init_speculative_stats();
6541 #endif
6542 #endif
6543 #if KMP_STATS_ENABLED
6544   __kmp_stats_init();
6545 #endif
6546   __kmp_init_lock(&__kmp_global_lock);
6547   __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6548   __kmp_init_lock(&__kmp_debug_lock);
6549   __kmp_init_atomic_lock(&__kmp_atomic_lock);
6550   __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6551   __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6552   __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6553   __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6554   __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6555   __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6556   __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6557   __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6558   __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6559   __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6560   __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6561   __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6562   __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6563   __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6564 #if KMP_USE_MONITOR
6565   __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6566 #endif
6567   __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6568 
6569   /* conduct initialization and initial setup of configuration */
6570 
6571   __kmp_runtime_initialize();
6572 
6573 #if KMP_MIC_SUPPORTED
6574   __kmp_check_mic_type();
6575 #endif
6576 
6577 // Some global variable initialization moved here from kmp_env_initialize()
6578 #ifdef KMP_DEBUG
6579   kmp_diag = 0;
6580 #endif
6581   __kmp_abort_delay = 0;
6582 
6583   // From __kmp_init_dflt_team_nth()
6584   /* assume the entire machine will be used */
6585   __kmp_dflt_team_nth_ub = __kmp_xproc;
6586   if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6587     __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6588   }
6589   if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6590     __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6591   }
6592   __kmp_max_nth = __kmp_sys_max_nth;
6593   __kmp_cg_max_nth = __kmp_sys_max_nth;
6594   __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6595   if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6596     __kmp_teams_max_nth = __kmp_sys_max_nth;
6597   }
6598 
6599   // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6600   // part
6601   __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6602 #if KMP_USE_MONITOR
6603   __kmp_monitor_wakeups =
6604       KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6605   __kmp_bt_intervals =
6606       KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6607 #endif
6608   // From "KMP_LIBRARY" part of __kmp_env_initialize()
6609   __kmp_library = library_throughput;
6610   // From KMP_SCHEDULE initialization
6611   __kmp_static = kmp_sch_static_balanced;
6612 // AC: do not use analytical here, because it is non-monotonous
6613 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6614 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6615 // need to repeat assignment
6616 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6617 // bit control and barrier method control parts
6618 #if KMP_FAST_REDUCTION_BARRIER
6619 #define kmp_reduction_barrier_gather_bb ((int)1)
6620 #define kmp_reduction_barrier_release_bb ((int)1)
6621 #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6622 #define kmp_reduction_barrier_release_pat bp_hyper_bar
6623 #endif // KMP_FAST_REDUCTION_BARRIER
6624   for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6625     __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6626     __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6627     __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6628     __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6629 #if KMP_FAST_REDUCTION_BARRIER
6630     if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
6631       // lin_64 ): hyper,1
6632       __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6633       __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6634       __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6635       __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6636     }
6637 #endif // KMP_FAST_REDUCTION_BARRIER
6638   }
6639 #if KMP_FAST_REDUCTION_BARRIER
6640 #undef kmp_reduction_barrier_release_pat
6641 #undef kmp_reduction_barrier_gather_pat
6642 #undef kmp_reduction_barrier_release_bb
6643 #undef kmp_reduction_barrier_gather_bb
6644 #endif // KMP_FAST_REDUCTION_BARRIER
6645 #if KMP_MIC_SUPPORTED
6646   if (__kmp_mic_type == mic2) { // KNC
6647     // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6648     __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6649     __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6650         1; // forkjoin release
6651     __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6652     __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6653   }
6654 #if KMP_FAST_REDUCTION_BARRIER
6655   if (__kmp_mic_type == mic2) { // KNC
6656     __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6657     __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6658   }
6659 #endif // KMP_FAST_REDUCTION_BARRIER
6660 #endif // KMP_MIC_SUPPORTED
6661 
6662 // From KMP_CHECKS initialization
6663 #ifdef KMP_DEBUG
6664   __kmp_env_checks = TRUE; /* development versions have the extra checks */
6665 #else
6666   __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6667 #endif
6668 
6669   // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6670   __kmp_foreign_tp = TRUE;
6671 
6672   __kmp_global.g.g_dynamic = FALSE;
6673   __kmp_global.g.g_dynamic_mode = dynamic_default;
6674 
6675   __kmp_env_initialize(NULL);
6676 
6677 // Print all messages in message catalog for testing purposes.
6678 #ifdef KMP_DEBUG
6679   char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6680   if (__kmp_str_match_true(val)) {
6681     kmp_str_buf_t buffer;
6682     __kmp_str_buf_init(&buffer);
6683     __kmp_i18n_dump_catalog(&buffer);
6684     __kmp_printf("%s", buffer.str);
6685     __kmp_str_buf_free(&buffer);
6686   }
6687   __kmp_env_free(&val);
6688 #endif
6689 
6690   __kmp_threads_capacity =
6691       __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6692   // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6693   __kmp_tp_capacity = __kmp_default_tp_capacity(
6694       __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6695 
6696   // If the library is shut down properly, both pools must be NULL. Just in
6697   // case, set them to NULL -- some memory may leak, but subsequent code will
6698   // work even if pools are not freed.
6699   KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6700   KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6701   KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6702   __kmp_thread_pool = NULL;
6703   __kmp_thread_pool_insert_pt = NULL;
6704   __kmp_team_pool = NULL;
6705 
6706   /* Allocate all of the variable sized records */
6707   /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6708    * expandable */
6709   /* Since allocation is cache-aligned, just add extra padding at the end */
6710   size =
6711       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6712       CACHE_LINE;
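  // __kmp_threads and __kmp_root share this single allocation: the root pointer
  // array starts immediately after the thread pointer array.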
6713   __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6714   __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6715                                sizeof(kmp_info_t *) * __kmp_threads_capacity);
6716 
6717   /* init thread counts */
6718   KMP_DEBUG_ASSERT(__kmp_all_nth ==
6719                    0); // Asserts fail if the library is reinitializing and
6720   KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
6721   __kmp_all_nth = 0;
6722   __kmp_nth = 0;
6723 
6724   /* setup the uber master thread and hierarchy */
6725   gtid = __kmp_register_root(TRUE);
6726   KA_TRACE(10, ("__kmp_do_serial_initialize  T#%d\n", gtid));
6727   KMP_ASSERT(KMP_UBER_GTID(gtid));
6728   KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6729 
6730   KMP_MB(); /* Flush all pending memory write invalidates.  */
6731 
6732   __kmp_common_initialize();
6733 
6734 #if KMP_OS_UNIX
6735   /* invoke the child fork handler */
6736   __kmp_register_atfork();
6737 #endif
6738 
6739 #if !KMP_DYNAMIC_LIB
6740   {
6741     /* Invoke the exit handler when the program finishes, only for static
6742        library. For dynamic library, we already have _fini and DllMain. */
6743     int rc = atexit(__kmp_internal_end_atexit);
6744     if (rc != 0) {
6745       __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6746                   __kmp_msg_null);
6747     }
6748   }
6749 #endif
6750 
6751 #if KMP_HANDLE_SIGNALS
6752 #if KMP_OS_UNIX
6753   /* NOTE: make sure that this is called before the user installs their own
6754      signal handlers so that the user handlers are called first. this way they
6755      can return false, not call our handler, avoid terminating the library, and
6756      continue execution where they left off. */
6757   __kmp_install_signals(FALSE);
6758 #endif /* KMP_OS_UNIX */
6759 #if KMP_OS_WINDOWS
6760   __kmp_install_signals(TRUE);
6761 #endif /* KMP_OS_WINDOWS */
6762 #endif
6763 
6764   /* we have finished the serial initialization */
6765   __kmp_init_counter++;
6766 
6767   __kmp_init_serial = TRUE;
6768 
6769   if (__kmp_settings) {
6770     __kmp_env_print();
6771   }
6772 
6773   if (__kmp_display_env || __kmp_display_env_verbose) {
6774     __kmp_env_print_2();
6775   }
6776 
6777 #if OMPT_SUPPORT
6778   ompt_post_init();
6779 #endif
6780 
6781   KMP_MB();
6782 
6783   KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
6784 }
6785 
6786 void __kmp_serial_initialize(void) {
6787   if (__kmp_init_serial) {
6788     return;
6789   }
6790   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6791   if (__kmp_init_serial) {
6792     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6793     return;
6794   }
6795   __kmp_do_serial_initialize();
6796   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6797 }
6798 
6799 static void __kmp_do_middle_initialize(void) {
6800   int i, j;
6801   int prev_dflt_team_nth;
6802 
6803   if (!__kmp_init_serial) {
6804     __kmp_do_serial_initialize();
6805   }
6806 
6807   KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
6808 
6809   // Save the previous value for the __kmp_dflt_team_nth so that
6810   // we can avoid some reinitialization if it hasn't changed.
6811   prev_dflt_team_nth = __kmp_dflt_team_nth;
6812 
6813 #if KMP_AFFINITY_SUPPORTED
6814   // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6815   // number of cores on the machine.
6816   __kmp_affinity_initialize();
6817 
6818   // Run through the __kmp_threads array and set the affinity mask
6819   // for each root thread that is currently registered with the RTL.
6820   for (i = 0; i < __kmp_threads_capacity; i++) {
6821     if (TCR_PTR(__kmp_threads[i]) != NULL) {
6822       __kmp_affinity_set_init_mask(i, TRUE);
6823     }
6824   }
6825 #endif /* KMP_AFFINITY_SUPPORTED */
6826 
6827   KMP_ASSERT(__kmp_xproc > 0);
6828   if (__kmp_avail_proc == 0) {
6829     __kmp_avail_proc = __kmp_xproc;
6830   }
6831 
6832   // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
6833   // correct them now
6834   j = 0;
6835   while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
6836     __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
6837         __kmp_avail_proc;
6838     j++;
6839   }
6840 
6841   if (__kmp_dflt_team_nth == 0) {
6842 #ifdef KMP_DFLT_NTH_CORES
6843     // Default #threads = #cores
6844     __kmp_dflt_team_nth = __kmp_ncores;
6845     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6846                   "__kmp_ncores (%d)\n",
6847                   __kmp_dflt_team_nth));
6848 #else
6849     // Default #threads = #available OS procs
6850     __kmp_dflt_team_nth = __kmp_avail_proc;
6851     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6852                   "__kmp_avail_proc(%d)\n",
6853                   __kmp_dflt_team_nth));
6854 #endif /* KMP_DFLT_NTH_CORES */
6855   }
6856 
6857   if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
6858     __kmp_dflt_team_nth = KMP_MIN_NTH;
6859   }
6860   if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
6861     __kmp_dflt_team_nth = __kmp_sys_max_nth;
6862   }
6863 
6864   // There's no harm in continuing if the following check fails,
6865   // but it indicates an error in the previous logic.
6866   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
6867 
6868   if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
6869     // Run through the __kmp_threads array and set the num threads icv for each
6870     // root thread that is currently registered with the RTL (which has not
6871     // already explicitly set its nthreads-var with a call to
6872     // omp_set_num_threads()).
6873     for (i = 0; i < __kmp_threads_capacity; i++) {
6874       kmp_info_t *thread = __kmp_threads[i];
6875       if (thread == NULL)
6876         continue;
6877       if (thread->th.th_current_task->td_icvs.nproc != 0)
6878         continue;
6879 
6880       set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
6881     }
6882   }
6883   KA_TRACE(
6884       20,
6885       ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
6886        __kmp_dflt_team_nth));
6887 
6888 #ifdef KMP_ADJUST_BLOCKTIME
6889   /* Adjust blocktime to zero if necessary  now that __kmp_avail_proc is set */
6890   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6891     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6892     if (__kmp_nth > __kmp_avail_proc) {
6893       __kmp_zero_bt = TRUE;
6894     }
6895   }
6896 #endif /* KMP_ADJUST_BLOCKTIME */
6897 
6898   /* we have finished middle initialization */
6899   TCW_SYNC_4(__kmp_init_middle, TRUE);
6900 
6901   KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
6902 }
6903 
6904 void __kmp_middle_initialize(void) {
6905   if (__kmp_init_middle) {
6906     return;
6907   }
6908   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6909   if (__kmp_init_middle) {
6910     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6911     return;
6912   }
6913   __kmp_do_middle_initialize();
6914   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6915 }
6916 
6917 void __kmp_parallel_initialize(void) {
6918   int gtid = __kmp_entry_gtid(); // this might be a new root
6919 
  /* synchronize parallel initialization (for sibling threads) */
6921   if (TCR_4(__kmp_init_parallel))
6922     return;
6923   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6924   if (TCR_4(__kmp_init_parallel)) {
6925     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6926     return;
6927   }
6928 
6929   /* TODO reinitialization after we have already shut down */
6930   if (TCR_4(__kmp_global.g.g_done)) {
6931     KA_TRACE(
6932         10,
6933         ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
6934     __kmp_infinite_loop();
6935   }
6936 
6937   /* jc: The lock __kmp_initz_lock is already held, so calling
6938      __kmp_serial_initialize would cause a deadlock.  So we call
6939      __kmp_do_serial_initialize directly. */
6940   if (!__kmp_init_middle) {
6941     __kmp_do_middle_initialize();
6942   }
6943   __kmp_resume_if_hard_paused();
6944 
6945   /* begin initialization */
6946   KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
6947   KMP_ASSERT(KMP_UBER_GTID(gtid));
6948 
6949 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6950   // Save the FP control regs.
6951   // Worker threads will set theirs to these values at thread startup.
6952   __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
6953   __kmp_store_mxcsr(&__kmp_init_mxcsr);
6954   __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
6955 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
6956 
6957 #if KMP_OS_UNIX
6958 #if KMP_HANDLE_SIGNALS
6959   /*  must be after __kmp_serial_initialize  */
6960   __kmp_install_signals(TRUE);
6961 #endif
6962 #endif
6963 
6964   __kmp_suspend_initialize();
6965 
6966 #if defined(USE_LOAD_BALANCE)
6967   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6968     __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
6969   }
6970 #else
6971   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6972     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
6973   }
6974 #endif
6975 
6976   if (__kmp_version) {
6977     __kmp_print_version_2();
6978   }
6979 
6980   /* we have finished parallel initialization */
6981   TCW_SYNC_4(__kmp_init_parallel, TRUE);
6982 
6983   KMP_MB();
6984   KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
6985 
6986   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6987 }
6988 
6989 /* ------------------------------------------------------------------------ */
6990 
6991 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
6992                                    kmp_team_t *team) {
6993   kmp_disp_t *dispatch;
6994 
6995   KMP_MB();
6996 
6997   /* none of the threads have encountered any constructs, yet. */
6998   this_thr->th.th_local.this_construct = 0;
6999 #if KMP_CACHE_MANAGE
7000   KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7001 #endif /* KMP_CACHE_MANAGE */
7002   dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7003   KMP_DEBUG_ASSERT(dispatch);
7004   KMP_DEBUG_ASSERT(team->t.t_dispatch);
7005   // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7006   // this_thr->th.th_info.ds.ds_tid ] );
7007 
7008   dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7009   dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7010   if (__kmp_env_consistency_check)
7011     __kmp_push_parallel(gtid, team->t.t_ident);
7012 
7013   KMP_MB(); /* Flush all pending memory write invalidates.  */
7014 }
7015 
7016 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7017                                   kmp_team_t *team) {
7018   if (__kmp_env_consistency_check)
7019     __kmp_pop_parallel(gtid, team->t.t_ident);
7020 
7021   __kmp_finish_implicit_task(this_thr);
7022 }
7023 
7024 int __kmp_invoke_task_func(int gtid) {
7025   int rc;
7026   int tid = __kmp_tid_from_gtid(gtid);
7027   kmp_info_t *this_thr = __kmp_threads[gtid];
7028   kmp_team_t *team = this_thr->th.th_team;
7029 
7030   __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7031 #if USE_ITT_BUILD
7032   if (__itt_stack_caller_create_ptr) {
7033     __kmp_itt_stack_callee_enter(
7034         (__itt_caller)
7035             team->t.t_stack_id); // inform ittnotify about entering user's code
7036   }
7037 #endif /* USE_ITT_BUILD */
7038 #if INCLUDE_SSC_MARKS
7039   SSC_MARK_INVOKING();
7040 #endif
7041 
7042 #if OMPT_SUPPORT
7043   void *dummy;
7044   void **exit_frame_p;
7045   ompt_data_t *my_task_data;
7046   ompt_data_t *my_parallel_data;
7047   int ompt_team_size;
7048 
7049   if (ompt_enabled.enabled) {
7050     exit_frame_p = &(
7051         team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_frame.ptr);
7052   } else {
7053     exit_frame_p = &dummy;
7054   }
7055 
7056   my_task_data =
7057       &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7058   my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7059   if (ompt_enabled.ompt_callback_implicit_task) {
7060     ompt_team_size = team->t.t_nproc;
7061     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7062         ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7063         __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7064     OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7065   }
7066 #endif
7067 
7068 #if KMP_STATS_ENABLED
7069   stats_state_e previous_state = KMP_GET_THREAD_STATE();
7070   if (previous_state == stats_state_e::TEAMS_REGION) {
7071     KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7072   } else {
7073     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7074   }
7075   KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7076 #endif
7077 
7078   rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7079                               tid, (int)team->t.t_argc, (void **)team->t.t_argv
7080 #if OMPT_SUPPORT
7081                               ,
7082                               exit_frame_p
7083 #endif
7084                               );
7085 #if OMPT_SUPPORT
7086   *exit_frame_p = NULL;
  this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7088 #endif
7089 
7090 #if KMP_STATS_ENABLED
7091   if (previous_state == stats_state_e::TEAMS_REGION) {
7092     KMP_SET_THREAD_STATE(previous_state);
7093   }
7094   KMP_POP_PARTITIONED_TIMER();
7095 #endif
7096 
7097 #if USE_ITT_BUILD
7098   if (__itt_stack_caller_create_ptr) {
7099     __kmp_itt_stack_callee_leave(
7100         (__itt_caller)
7101             team->t.t_stack_id); // inform ittnotify about leaving user's code
7102   }
7103 #endif /* USE_ITT_BUILD */
7104   __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7105 
7106   return rc;
7107 }
7108 
7109 void __kmp_teams_master(int gtid) {
  // This routine is called by all master threads in the teams construct.
7111   kmp_info_t *thr = __kmp_threads[gtid];
7112   kmp_team_t *team = thr->th.th_team;
7113   ident_t *loc = team->t.t_ident;
7114   thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7115   KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7116   KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7117   KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7118                 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7119 
7120   // This thread is a new CG root.  Set up the proper variables.
7121   kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7122   tmp->cg_root = thr; // Make thr the CG root
7123   // Init to thread limit that was stored when league masters were forked
7124   tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7125   tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7126   KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7127                  " cg_nthreads to 1\n",
7128                  thr, tmp));
7129   tmp->up = thr->th.th_cg_roots;
7130   thr->th.th_cg_roots = tmp;
7131 
// Launch the league of teams now, but do not let workers execute
// (they hang on the fork barrier until the next parallel region)
7134 #if INCLUDE_SSC_MARKS
7135   SSC_MARK_FORKING();
7136 #endif
7137   __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7138                   (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7139                   VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7140 #if INCLUDE_SSC_MARKS
7141   SSC_MARK_JOINING();
7142 #endif
7143   // If the team size was reduced from the limit, set it to the new size
7144   if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7145     thr->th.th_teams_size.nth = thr->th.th_team_nproc;
  // AC: the last parameter "1" eliminates the join barrier, which won't work
  // because worker threads are in a fork barrier waiting for more parallel
  // regions
7148   __kmp_join_call(loc, gtid
7149 #if OMPT_SUPPORT
7150                   ,
7151                   fork_context_intel
7152 #endif
7153                   ,
7154                   1);
7155 }
7156 
7157 int __kmp_invoke_teams_master(int gtid) {
7158   kmp_info_t *this_thr = __kmp_threads[gtid];
7159   kmp_team_t *team = this_thr->th.th_team;
7160 #if KMP_DEBUG
7161   if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7162     KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7163                      (void *)__kmp_teams_master);
7164 #endif
7165   __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7166 #if OMPT_SUPPORT
7167   int tid = __kmp_tid_from_gtid(gtid);
7168   ompt_data_t *task_data =
7169       &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7170   ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7171   if (ompt_enabled.ompt_callback_implicit_task) {
7172     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7173         ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7174         ompt_task_initial);
7175     OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7176   }
7177 #endif
7178   __kmp_teams_master(gtid);
7179 #if OMPT_SUPPORT
7180   this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7181 #endif
7182   __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7183   return 1;
7184 }
7185 
/* This sets the requested number of threads for the next parallel region
   encountered by this team. Since this should be enclosed in the forkjoin
   critical section, it should avoid race conditions with asymmetrical nested
   parallelism. */
7190 
7191 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7192   kmp_info_t *thr = __kmp_threads[gtid];
7193 
7194   if (num_threads > 0)
7195     thr->th.th_set_nproc = num_threads;
7196 }
7197 
/* This sets the requested number of teams for the teams region and/or
   the number of threads for the next parallel region encountered. */
7200 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7201                           int num_threads) {
7202   kmp_info_t *thr = __kmp_threads[gtid];
7203   KMP_DEBUG_ASSERT(num_teams >= 0);
7204   KMP_DEBUG_ASSERT(num_threads >= 0);
7205 
7206   if (num_teams == 0)
7207     num_teams = 1; // default number of teams is 1.
7208   if (num_teams > __kmp_teams_max_nth) { // if too many teams requested?
7209     if (!__kmp_reserve_warn) {
7210       __kmp_reserve_warn = 1;
7211       __kmp_msg(kmp_ms_warning,
7212                 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7213                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7214     }
7215     num_teams = __kmp_teams_max_nth;
7216   }
7217   // Set number of teams (number of threads in the outer "parallel" of the
7218   // teams)
7219   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7220 
7221   // Remember the number of threads for inner parallel regions
7222   if (!TCR_4(__kmp_init_middle))
7223     __kmp_middle_initialize(); // get internal globals calculated
7224   KMP_DEBUG_ASSERT(__kmp_avail_proc);
7225   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7226   if (num_threads == 0) {
7227     num_threads = __kmp_avail_proc / num_teams;
7228     // adjust num_threads w/o warning as it is not user setting
7229     // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7230     // no thread_limit clause specified -  do not change thread-limit-var ICV
7231     if (num_threads > __kmp_dflt_team_nth) {
7232       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7233     }
7234     if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7235       num_threads = thr->th.th_current_task->td_icvs.thread_limit;
    } // prevent the team size from exceeding thread-limit-var
7237     if (num_teams * num_threads > __kmp_teams_max_nth) {
7238       num_threads = __kmp_teams_max_nth / num_teams;
7239     }
7240   } else {
7241     // This thread will be the master of the league masters
7242     // Store new thread limit; old limit is saved in th_cg_roots list
7243     thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7244     // num_threads = min(num_threads, nthreads-var)
7245     if (num_threads > __kmp_dflt_team_nth) {
7246       num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7247     }
7248     if (num_teams * num_threads > __kmp_teams_max_nth) {
7249       int new_threads = __kmp_teams_max_nth / num_teams;
7250       if (!__kmp_reserve_warn) { // user asked for too many threads
7251         __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7252         __kmp_msg(kmp_ms_warning,
7253                   KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7254                   KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7255       }
7256       num_threads = new_threads;
7257     }
7258   }
7259   thr->th.th_teams_size.nth = num_threads;
7260 }
7261 
7262 // Set the proc_bind var to use in the following parallel region.
7263 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7264   kmp_info_t *thr = __kmp_threads[gtid];
7265   thr->th.th_set_proc_bind = proc_bind;
7266 }
7267 
7268 /* Launch the worker threads into the microtask. */
7269 
7270 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7271   kmp_info_t *this_thr = __kmp_threads[gtid];
7272 
7273 #ifdef KMP_DEBUG
7274   int f;
7275 #endif /* KMP_DEBUG */
7276 
7277   KMP_DEBUG_ASSERT(team);
7278   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7279   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7280   KMP_MB(); /* Flush all pending memory write invalidates.  */
7281 
7282   team->t.t_construct = 0; /* no single directives seen yet */
7283   team->t.t_ordered.dt.t_value =
7284       0; /* thread 0 enters the ordered section first */
7285 
7286   /* Reset the identifiers on the dispatch buffer */
7287   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7288   if (team->t.t_max_nproc > 1) {
7289     int i;
7290     for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7291       team->t.t_disp_buffer[i].buffer_index = i;
7292       team->t.t_disp_buffer[i].doacross_buf_idx = i;
7293     }
7294   } else {
7295     team->t.t_disp_buffer[0].buffer_index = 0;
7296     team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7297   }
7298 
7299   KMP_MB(); /* Flush all pending memory write invalidates.  */
7300   KMP_ASSERT(this_thr->th.th_team == team);
7301 
7302 #ifdef KMP_DEBUG
7303   for (f = 0; f < team->t.t_nproc; f++) {
7304     KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7305                      team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7306   }
7307 #endif /* KMP_DEBUG */
7308 
7309   /* release the worker threads so they may begin working */
7310   __kmp_fork_barrier(gtid, 0);
7311 }
7312 
7313 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7314   kmp_info_t *this_thr = __kmp_threads[gtid];
7315 
7316   KMP_DEBUG_ASSERT(team);
7317   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7318   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7319   KMP_MB(); /* Flush all pending memory write invalidates.  */
7320 
7321 /* Join barrier after fork */
7322 
7323 #ifdef KMP_DEBUG
7324   if (__kmp_threads[gtid] &&
7325       __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7326     __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7327                  __kmp_threads[gtid]);
7328     __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7329                  "team->t.t_nproc=%d\n",
7330                  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7331                  team->t.t_nproc);
7332     __kmp_print_structure();
7333   }
7334   KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7335                    __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7336 #endif /* KMP_DEBUG */
7337 
7338   __kmp_join_barrier(gtid); /* wait for everyone */
7339 #if OMPT_SUPPORT
7340   if (ompt_enabled.enabled &&
7341       this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
7342     int ds_tid = this_thr->th.th_info.ds.ds_tid;
7343     ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7344     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
7345 #if OMPT_OPTIONAL
7346     void *codeptr = NULL;
7347     if (KMP_MASTER_TID(ds_tid) &&
7348         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7349          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7350       codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7351 
7352     if (ompt_enabled.ompt_callback_sync_region_wait) {
7353       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7354           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7355           codeptr);
7356     }
7357     if (ompt_enabled.ompt_callback_sync_region) {
7358       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7359           ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7360           codeptr);
7361     }
7362 #endif
7363     if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7364       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_end, NULL, task_data, 0, ds_tid,
          ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7366     }
7367   }
7368 #endif
7369 
7370   KMP_MB(); /* Flush all pending memory write invalidates.  */
7371   KMP_ASSERT(this_thr->th.th_team == team);
7372 }
7373 
7374 /* ------------------------------------------------------------------------ */
7375 
7376 #ifdef USE_LOAD_BALANCE
7377 
// Return the number of worker threads actively spinning in the hot team, if
// we are at the outermost level of parallelism.  Otherwise, return 0.
7380 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7381   int i;
7382   int retval;
7383   kmp_team_t *hot_team;
7384 
7385   if (root->r.r_active) {
7386     return 0;
7387   }
7388   hot_team = root->r.r_hot_team;
7389   if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7390     return hot_team->t.t_nproc - 1; // Don't count master thread
7391   }
7392 
7393   // Skip the master thread - it is accounted for elsewhere.
7394   retval = 0;
7395   for (i = 1; i < hot_team->t.t_nproc; i++) {
7396     if (hot_team->t.t_threads[i]->th.th_active) {
7397       retval++;
7398     }
7399   }
7400   return retval;
7401 }
7402 
7403 // Perform an automatic adjustment to the number of
7404 // threads used by the next parallel region.
7405 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7406   int retval;
7407   int pool_active;
7408   int hot_team_active;
7409   int team_curr_active;
7410   int system_active;
7411 
7412   KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7413                 set_nproc));
7414   KMP_DEBUG_ASSERT(root);
7415   KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7416                        ->th.th_current_task->td_icvs.dynamic == TRUE);
7417   KMP_DEBUG_ASSERT(set_nproc > 1);
7418 
7419   if (set_nproc == 1) {
7420     KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7421     return 1;
7422   }
7423 
7424   // Threads that are active in the thread pool, active in the hot team for this
7425   // particular root (if we are at the outer par level), and the currently
7426   // executing thread (to become the master) are available to add to the new
7427   // team, but are currently contributing to the system load, and must be
7428   // accounted for.
7429   pool_active = __kmp_thread_pool_active_nth;
7430   hot_team_active = __kmp_active_hot_team_nproc(root);
7431   team_curr_active = pool_active + hot_team_active + 1;
7432 
7433   // Check the system load.
7434   system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7435   KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7436                 "hot team active = %d\n",
7437                 system_active, pool_active, hot_team_active));
7438 
7439   if (system_active < 0) {
7440     // There was an error reading the necessary info from /proc, so use the
7441     // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7442     // = dynamic_thread_limit, we shouldn't wind up getting back here.
7443     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7444     KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7445 
7446     // Make this call behave like the thread limit algorithm.
7447     retval = __kmp_avail_proc - __kmp_nth +
7448              (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7449     if (retval > set_nproc) {
7450       retval = set_nproc;
7451     }
7452     if (retval < KMP_MIN_NTH) {
7453       retval = KMP_MIN_NTH;
7454     }
7455 
7456     KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7457                   retval));
7458     return retval;
7459   }
7460 
  // There is a slight delay in the load balance algorithm in detecting new
  // running procs. The real system load at this instant should be at least as
  // large as the #active OMP threads that are available to add to the team.
7464   if (system_active < team_curr_active) {
7465     system_active = team_curr_active;
7466   }
7467   retval = __kmp_avail_proc - system_active + team_curr_active;
7468   if (retval > set_nproc) {
7469     retval = set_nproc;
7470   }
7471   if (retval < KMP_MIN_NTH) {
7472     retval = KMP_MIN_NTH;
7473   }
7474 
7475   KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7476   return retval;
7477 } // __kmp_load_balance_nproc()
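// Worked example for __kmp_load_balance_nproc() above (illustrative values):
// with __kmp_avail_proc == 8, pool_active == 2 and hot_team_active == 1,
// team_curr_active is 4; if __kmp_get_load_balance() reports 6 running procs,
// retval = 8 - 6 + 4 = 6, which is then clamped to set_nproc and to at least
// KMP_MIN_NTH.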
7478 
7479 #endif /* USE_LOAD_BALANCE */
7480 
7481 /* ------------------------------------------------------------------------ */
7482 
7483 /* NOTE: this is called with the __kmp_init_lock held */
7484 void __kmp_cleanup(void) {
7485   int f;
7486 
7487   KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7488 
7489   if (TCR_4(__kmp_init_parallel)) {
7490 #if KMP_HANDLE_SIGNALS
7491     __kmp_remove_signals();
7492 #endif
7493     TCW_4(__kmp_init_parallel, FALSE);
7494   }
7495 
7496   if (TCR_4(__kmp_init_middle)) {
7497 #if KMP_AFFINITY_SUPPORTED
7498     __kmp_affinity_uninitialize();
7499 #endif /* KMP_AFFINITY_SUPPORTED */
7500     __kmp_cleanup_hierarchy();
7501     TCW_4(__kmp_init_middle, FALSE);
7502   }
7503 
7504   KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7505 
7506   if (__kmp_init_serial) {
7507     __kmp_runtime_destroy();
7508     __kmp_init_serial = FALSE;
7509   }
7510 
7511   __kmp_cleanup_threadprivate_caches();
7512 
7513   for (f = 0; f < __kmp_threads_capacity; f++) {
7514     if (__kmp_root[f] != NULL) {
7515       __kmp_free(__kmp_root[f]);
7516       __kmp_root[f] = NULL;
7517     }
7518   }
7519   __kmp_free(__kmp_threads);
  // __kmp_threads and __kmp_root were allocated at once, as a single block, so
  // there is no need to free __kmp_root separately.
7522   __kmp_threads = NULL;
7523   __kmp_root = NULL;
7524   __kmp_threads_capacity = 0;
7525 
7526 #if KMP_USE_DYNAMIC_LOCK
7527   __kmp_cleanup_indirect_user_locks();
7528 #else
7529   __kmp_cleanup_user_locks();
7530 #endif
7531 
7532 #if KMP_AFFINITY_SUPPORTED
7533   KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7534   __kmp_cpuinfo_file = NULL;
7535 #endif /* KMP_AFFINITY_SUPPORTED */
7536 
7537 #if KMP_USE_ADAPTIVE_LOCKS
7538 #if KMP_DEBUG_ADAPTIVE_LOCKS
7539   __kmp_print_speculative_stats();
7540 #endif
7541 #endif
7542   KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7543   __kmp_nested_nth.nth = NULL;
7544   __kmp_nested_nth.size = 0;
7545   __kmp_nested_nth.used = 0;
7546   KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7547   __kmp_nested_proc_bind.bind_types = NULL;
7548   __kmp_nested_proc_bind.size = 0;
7549   __kmp_nested_proc_bind.used = 0;
7550   if (__kmp_affinity_format) {
7551     KMP_INTERNAL_FREE(__kmp_affinity_format);
7552     __kmp_affinity_format = NULL;
7553   }
7554 
7555   __kmp_i18n_catclose();
7556 
7557 #if KMP_USE_HIER_SCHED
7558   __kmp_hier_scheds.deallocate();
7559 #endif
7560 
7561 #if KMP_STATS_ENABLED
7562   __kmp_stats_fini();
7563 #endif
7564 
7565   KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7566 }
7567 
7568 /* ------------------------------------------------------------------------ */
7569 
7570 int __kmp_ignore_mppbeg(void) {
7571   char *env;
7572 
7573   if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7574     if (__kmp_str_match_false(env))
7575       return FALSE;
7576   }
7577   // By default __kmpc_begin() is no-op.
7578   return TRUE;
7579 }
7580 
7581 int __kmp_ignore_mppend(void) {
7582   char *env;
7583 
7584   if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7585     if (__kmp_str_match_false(env))
7586       return FALSE;
7587   }
7588   // By default __kmpc_end() is no-op.
7589   return TRUE;
7590 }
7591 
7592 void __kmp_internal_begin(void) {
7593   int gtid;
7594   kmp_root_t *root;
7595 
  /* This is a very important step, as it will register new sibling threads
     and assign these new uber threads a new gtid. */
7598   gtid = __kmp_entry_gtid();
7599   root = __kmp_threads[gtid]->th.th_root;
7600   KMP_ASSERT(KMP_UBER_GTID(gtid));
7601 
7602   if (root->r.r_begin)
7603     return;
7604   __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7605   if (root->r.r_begin) {
7606     __kmp_release_lock(&root->r.r_begin_lock, gtid);
7607     return;
7608   }
7609 
7610   root->r.r_begin = TRUE;
7611 
7612   __kmp_release_lock(&root->r.r_begin_lock, gtid);
7613 }
7614 
7615 /* ------------------------------------------------------------------------ */
7616 
7617 void __kmp_user_set_library(enum library_type arg) {
7618   int gtid;
7619   kmp_root_t *root;
7620   kmp_info_t *thread;
7621 
7622   /* first, make sure we are initialized so we can get our gtid */
7623 
7624   gtid = __kmp_entry_gtid();
7625   thread = __kmp_threads[gtid];
7626 
7627   root = thread->th.th_root;
7628 
7629   KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7630                 library_serial));
7631   if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7632                                   thread */
7633     KMP_WARNING(SetLibraryIncorrectCall);
7634     return;
7635   }
7636 
7637   switch (arg) {
7638   case library_serial:
7639     thread->th.th_set_nproc = 0;
7640     set__nproc(thread, 1);
7641     break;
7642   case library_turnaround:
7643     thread->th.th_set_nproc = 0;
7644     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7645                                            : __kmp_dflt_team_nth_ub);
7646     break;
7647   case library_throughput:
7648     thread->th.th_set_nproc = 0;
7649     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7650                                            : __kmp_dflt_team_nth_ub);
7651     break;
7652   default:
7653     KMP_FATAL(UnknownLibraryType, arg);
7654   }
7655 
7656   __kmp_aux_set_library(arg);
7657 }
7658 
7659 void __kmp_aux_set_stacksize(size_t arg) {
7660   if (!__kmp_init_serial)
7661     __kmp_serial_initialize();
7662 
7663 #if KMP_OS_DARWIN
7664   if (arg & (0x1000 - 1)) {
7665     arg &= ~(0x1000 - 1);
7666     if (arg + 0x1000) /* check for overflow if we round up */
7667       arg += 0x1000;
7668   }
7669 #endif
7670   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7671 
7672   /* only change the default stacksize before the first parallel region */
7673   if (!TCR_4(__kmp_init_parallel)) {
7674     size_t value = arg; /* argument is in bytes */
7675 
7676     if (value < __kmp_sys_min_stksize)
7677       value = __kmp_sys_min_stksize;
7678     else if (value > KMP_MAX_STKSIZE)
7679       value = KMP_MAX_STKSIZE;
7680 
7681     __kmp_stksize = value;
7682 
7683     __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
7684   }
7685 
7686   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7687 }
7688 
7689 /* set the behaviour of the runtime library */
7690 /* TODO this can cause some odd behaviour with sibling parallelism... */
7691 void __kmp_aux_set_library(enum library_type arg) {
7692   __kmp_library = arg;
7693 
7694   switch (__kmp_library) {
7695   case library_serial: {
7696     KMP_INFORM(LibraryIsSerial);
7697   } break;
7698   case library_turnaround:
7699     if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
7700       __kmp_use_yield = 2; // only yield when oversubscribed
7701     break;
7702   case library_throughput:
7703     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
7704       __kmp_dflt_blocktime = 200;
7705     break;
7706   default:
7707     KMP_FATAL(UnknownLibraryType, arg);
7708   }
7709 }
7710 
/* Get team information common to all team-related APIs */
7712 // Returns NULL if not in teams construct
7713 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
7714   kmp_info_t *thr = __kmp_entry_thread();
7715   teams_serialized = 0;
7716   if (thr->th.th_teams_microtask) {
7717     kmp_team_t *team = thr->th.th_team;
7718     int tlevel = thr->th.th_teams_level; // the level of the teams construct
7719     int ii = team->t.t_level;
7720     teams_serialized = team->t.t_serialized;
7721     int level = tlevel + 1;
7722     KMP_DEBUG_ASSERT(ii >= tlevel);
7723     while (ii > level) {
7724       for (teams_serialized = team->t.t_serialized;
7725            (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
7726       }
7727       if (team->t.t_serialized && (!teams_serialized)) {
7728         team = team->t.t_parent;
7729         continue;
7730       }
7731       if (ii > level) {
7732         team = team->t.t_parent;
7733         ii--;
7734       }
7735     }
7736     return team;
7737   }
7738   return NULL;
7739 }
7740 
7741 int __kmp_aux_get_team_num() {
7742   int serialized;
7743   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7744   if (team) {
7745     if (serialized > 1) {
      return 0; // teams region is serialized (1 team of 1 thread).
7747     } else {
7748       return team->t.t_master_tid;
7749     }
7750   }
7751   return 0;
7752 }
7753 
7754 int __kmp_aux_get_num_teams() {
7755   int serialized;
7756   kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7757   if (team) {
7758     if (serialized > 1) {
7759       return 1;
7760     } else {
7761       return team->t.t_parent->t.t_nproc;
7762     }
7763   }
7764   return 1;
7765 }
7766 
7767 /* ------------------------------------------------------------------------ */
7768 
7769 /*
7770  * Affinity Format Parser
7771  *
7772  * Field is in form of: %[[[0].]size]type
7773  * % and type are required (%% means print a literal '%')
7774  * type is either single char or long name surrounded by {},
7775  * e.g., N or {num_threads}
7776  * 0 => leading zeros
7777  * . => right justified when size is specified
7778  * by default output is left justified
7779  * size is the *minimum* field length
7780  * All other characters are printed as is
7781  *
7782  * Available field types:
 * t {team_num}          - team number (omp_get_team_num())
 * T {num_teams}         - number of teams (omp_get_num_teams())
 * L {nesting_level}     - omp_get_level()
 * n {thread_num}        - omp_get_thread_num()
 * N {num_threads}       - omp_get_num_threads()
 * a {ancestor_tnum}     - omp_get_ancestor_thread_num(omp_get_level()-1)
 * H {host}              - name of host machine
 * P {process_id}        - process id (integer)
 * i {native_thread_id}  - native thread identifier (integer)
 * A {thread_affinity}   - comma separated list of integers or integer ranges
 *                         (values of affinity mask)
7792  *
7793  * Implementation-specific field types can be added
7794  * If a type is unknown, print "undefined"
7795 */
7796 
// Structure holding the short name, long name, and corresponding data type
// for snprintf. A table of these represents the entire set of valid keyword
// field types.
7800 typedef struct kmp_affinity_format_field_t {
  char short_name; // short name from the spec, e.g., 'L' -> nesting level
  const char *long_name; // long name from the spec, e.g., "nesting_level"
  char field_format; // data type for snprintf (typically 'd' or 's'
  // for an integer or string)
7805 } kmp_affinity_format_field_t;
7806 
7807 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
7808 #if KMP_AFFINITY_SUPPORTED
7809     {'A', "thread_affinity", 's'},
7810 #endif
7811     {'t', "team_num", 'd'},
7812     {'T', "num_teams", 'd'},
7813     {'L', "nesting_level", 'd'},
7814     {'n', "thread_num", 'd'},
7815     {'N', "num_threads", 'd'},
7816     {'a', "ancestor_tnum", 'd'},
7817     {'H', "host", 's'},
7818     {'P', "process_id", 'd'},
7819     {'i', "native_thread_id", 'd'}};
7820 
// Return the number of characters it takes to hold the field
7822 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
7823                                             const char **ptr,
7824                                             kmp_str_buf_t *field_buffer) {
7825   int rc, format_index, field_value;
7826   const char *width_left, *width_right;
7827   bool pad_zeros, right_justify, parse_long_name, found_valid_name;
7828   static const int FORMAT_SIZE = 20;
7829   char format[FORMAT_SIZE] = {0};
7830   char absolute_short_name = 0;
7831 
7832   KMP_DEBUG_ASSERT(gtid >= 0);
7833   KMP_DEBUG_ASSERT(th);
7834   KMP_DEBUG_ASSERT(**ptr == '%');
7835   KMP_DEBUG_ASSERT(field_buffer);
7836 
7837   __kmp_str_buf_clear(field_buffer);
7838 
7839   // Skip the initial %
7840   (*ptr)++;
7841 
7842   // Check for %% first
7843   if (**ptr == '%') {
7844     __kmp_str_buf_cat(field_buffer, "%", 1);
7845     (*ptr)++; // skip over the second %
7846     return 1;
7847   }
7848 
7849   // Parse field modifiers if they are present
7850   pad_zeros = false;
7851   if (**ptr == '0') {
7852     pad_zeros = true;
7853     (*ptr)++; // skip over 0
7854   }
7855   right_justify = false;
7856   if (**ptr == '.') {
7857     right_justify = true;
7858     (*ptr)++; // skip over .
7859   }
7860   // Parse width of field: [width_left, width_right)
7861   width_left = width_right = NULL;
7862   if (**ptr >= '0' && **ptr <= '9') {
7863     width_left = *ptr;
7864     SKIP_DIGITS(*ptr);
7865     width_right = *ptr;
7866   }
7867 
7868   // Create the format for KMP_SNPRINTF based on flags parsed above
7869   format_index = 0;
7870   format[format_index++] = '%';
7871   if (!right_justify)
7872     format[format_index++] = '-';
7873   if (pad_zeros)
7874     format[format_index++] = '0';
7875   if (width_left && width_right) {
7876     int i = 0;
    // Only allow 8-digit number widths.
    // This also prevents overflowing the format buffer.
7879     while (i < 8 && width_left < width_right) {
7880       format[format_index++] = *width_left;
7881       width_left++;
7882       i++;
7883     }
7884   }
7885 
7886   // Parse a name (long or short)
7887   // Canonicalize the name into absolute_short_name
7888   found_valid_name = false;
7889   parse_long_name = (**ptr == '{');
7890   if (parse_long_name)
7891     (*ptr)++; // skip initial left brace
7892   for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
7893                              sizeof(__kmp_affinity_format_table[0]);
7894        ++i) {
7895     char short_name = __kmp_affinity_format_table[i].short_name;
7896     const char *long_name = __kmp_affinity_format_table[i].long_name;
7897     char field_format = __kmp_affinity_format_table[i].field_format;
7898     if (parse_long_name) {
7899       int length = KMP_STRLEN(long_name);
7900       if (strncmp(*ptr, long_name, length) == 0) {
7901         found_valid_name = true;
7902         (*ptr) += length; // skip the long name
7903       }
7904     } else if (**ptr == short_name) {
7905       found_valid_name = true;
7906       (*ptr)++; // skip the short name
7907     }
7908     if (found_valid_name) {
7909       format[format_index++] = field_format;
7910       format[format_index++] = '\0';
7911       absolute_short_name = short_name;
7912       break;
7913     }
7914   }
7915   if (parse_long_name) {
7916     if (**ptr != '}') {
7917       absolute_short_name = 0;
7918     } else {
7919       (*ptr)++; // skip over the right brace
7920     }
7921   }
7922 
7923   // Attempt to fill the buffer with the requested
7924   // value using snprintf within __kmp_str_buf_print()
7925   switch (absolute_short_name) {
7926   case 't':
7927     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
7928     break;
7929   case 'T':
7930     rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
7931     break;
7932   case 'L':
7933     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
7934     break;
7935   case 'n':
7936     rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
7937     break;
7938   case 'H': {
7939     static const int BUFFER_SIZE = 256;
7940     char buf[BUFFER_SIZE];
7941     __kmp_expand_host_name(buf, BUFFER_SIZE);
7942     rc = __kmp_str_buf_print(field_buffer, format, buf);
7943   } break;
7944   case 'P':
7945     rc = __kmp_str_buf_print(field_buffer, format, getpid());
7946     break;
7947   case 'i':
7948     rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
7949     break;
7950   case 'N':
7951     rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
7952     break;
7953   case 'a':
7954     field_value =
7955         __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
7956     rc = __kmp_str_buf_print(field_buffer, format, field_value);
7957     break;
7958 #if KMP_AFFINITY_SUPPORTED
7959   case 'A': {
7960     kmp_str_buf_t buf;
7961     __kmp_str_buf_init(&buf);
7962     __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
7963     rc = __kmp_str_buf_print(field_buffer, format, buf.str);
7964     __kmp_str_buf_free(&buf);
7965   } break;
7966 #endif
7967   default:
    // According to the spec, if an implementation does not have info for the
    // field type, then "undefined" is printed.
7970     rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
7971     // Skip the field
7972     if (parse_long_name) {
7973       SKIP_TOKEN(*ptr);
7974       if (**ptr == '}')
7975         (*ptr)++;
7976     } else {
7977       (*ptr)++;
7978     }
7979   }
7980 
7981   KMP_ASSERT(format_index <= FORMAT_SIZE);
7982   return rc;
7983 }
7984 
7985 /*
7986  * Return number of characters needed to hold the affinity string
7987  * (not including null byte character)
7988  * The resultant string is printed to buffer, which the caller can then
7989  * handle afterwards
7990 */
7991 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
7992                                   kmp_str_buf_t *buffer) {
7993   const char *parse_ptr;
7994   size_t retval;
7995   const kmp_info_t *th;
7996   kmp_str_buf_t field;
7997 
7998   KMP_DEBUG_ASSERT(buffer);
7999   KMP_DEBUG_ASSERT(gtid >= 0);
8000 
8001   __kmp_str_buf_init(&field);
8002   __kmp_str_buf_clear(buffer);
8003 
8004   th = __kmp_threads[gtid];
8005   retval = 0;
8006 
  // If format is NULL or a zero-length string, then we use the
  // affinity-format-var ICV.
8009   parse_ptr = format;
8010   if (parse_ptr == NULL || *parse_ptr == '\0') {
8011     parse_ptr = __kmp_affinity_format;
8012   }
8013   KMP_DEBUG_ASSERT(parse_ptr);
8014 
8015   while (*parse_ptr != '\0') {
8016     // Parse a field
8017     if (*parse_ptr == '%') {
8018       // Put field in the buffer
8019       int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8020       __kmp_str_buf_catbuf(buffer, &field);
8021       retval += rc;
8022     } else {
8023       // Put literal character in buffer
8024       __kmp_str_buf_cat(buffer, parse_ptr, 1);
8025       retval++;
8026       parse_ptr++;
8027     }
8028   }
8029   __kmp_str_buf_free(&field);
8030   return retval;
8031 }
8032 
8033 // Displays the affinity string to stdout
8034 void __kmp_aux_display_affinity(int gtid, const char *format) {
8035   kmp_str_buf_t buf;
8036   __kmp_str_buf_init(&buf);
8037   __kmp_aux_capture_affinity(gtid, format, &buf);
8038   __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8039   __kmp_str_buf_free(&buf);
8040 }
8041 
8042 /* ------------------------------------------------------------------------ */
8043 
8044 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8045   int blocktime = arg; /* argument is in milliseconds */
8046 #if KMP_USE_MONITOR
8047   int bt_intervals;
8048 #endif
8049   int bt_set;
8050 
8051   __kmp_save_internal_controls(thread);
8052 
8053   /* Normalize and set blocktime for the teams */
8054   if (blocktime < KMP_MIN_BLOCKTIME)
8055     blocktime = KMP_MIN_BLOCKTIME;
8056   else if (blocktime > KMP_MAX_BLOCKTIME)
8057     blocktime = KMP_MAX_BLOCKTIME;
8058 
8059   set__blocktime_team(thread->th.th_team, tid, blocktime);
8060   set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8061 
8062 #if KMP_USE_MONITOR
8063   /* Calculate and set blocktime intervals for the teams */
8064   bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8065 
8066   set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8067   set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8068 #endif
8069 
  /* Record that blocktime has been explicitly set (bt_set = TRUE) */
8071   bt_set = TRUE;
8072 
8073   set__bt_set_team(thread->th.th_team, tid, bt_set);
8074   set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8075 #if KMP_USE_MONITOR
8076   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8077                 "bt_intervals=%d, monitor_updates=%d\n",
8078                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8079                 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8080                 __kmp_monitor_wakeups));
8081 #else
8082   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8083                 __kmp_gtid_from_tid(tid, thread->th.th_team),
8084                 thread->th.th_team->t.t_id, tid, blocktime));
8085 #endif
8086 }
8087 
8088 void __kmp_aux_set_defaults(char const *str, int len) {
8089   if (!__kmp_init_serial) {
8090     __kmp_serial_initialize();
8091   }
8092   __kmp_env_initialize(str);
8093 
8094   if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8095     __kmp_env_print();
8096   }
8097 } // __kmp_aux_set_defaults
8098 
8099 /* ------------------------------------------------------------------------ */
8100 /* internal fast reduction routines */
8101 
8102 PACKED_REDUCTION_METHOD_T
8103 __kmp_determine_reduction_method(
8104     ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8105     void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8106     kmp_critical_name *lck) {
8107 
  // Default reduction method: critical construct (lck != NULL, like in current
  // PAROPT).
  // If (reduce_data != NULL && reduce_func != NULL): the tree-reduction method
  // can be selected by the RTL.
  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
  // can be selected by the RTL.
  // Finally, it's up to the OpenMP RTL to decide which method to select among
  // those generated by the compiler (PAROPT).
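  // For example (derived from the selection logic below; x86_64 Linux
  // assumed): with team_size == 16 and both the tree and atomic methods
  // generated, the team size exceeds the cutoff (4, or 8 on MIC), so
  // TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER is chosen; a 2-thread team with
  // only the atomic method generated would get atomic_reduce_block instead.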
8116 
8117   PACKED_REDUCTION_METHOD_T retval;
8118 
8119   int team_size;
8120 
8121   KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
8122   KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8123 
8124 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
8125   ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
8126 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8127 
8128   retval = critical_reduce_block;
8129 
  // An alternative way of getting the team size (with one dynamic dereference)
  // is slower.
8131   team_size = __kmp_get_team_num_threads(global_tid);
8132   if (team_size == 1) {
8133 
8134     retval = empty_reduce_block;
8135 
8136   } else {
8137 
8138     int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8139 
8140 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 ||                   \
8141     KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64
8142 
8143 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
8144     KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8145 
8146     int teamsize_cutoff = 4;
8147 
8148 #if KMP_MIC_SUPPORTED
8149     if (__kmp_mic_type != non_mic) {
8150       teamsize_cutoff = 8;
8151     }
8152 #endif
8153     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8154     if (tree_available) {
8155       if (team_size <= teamsize_cutoff) {
8156         if (atomic_available) {
8157           retval = atomic_reduce_block;
8158         }
8159       } else {
8160         retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8161       }
8162     } else if (atomic_available) {
8163       retval = atomic_reduce_block;
8164     }
8165 #else
8166 #error "Unknown or unsupported OS"
8167 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8168        // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8169 
8170 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8171 
8172 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD
8173 
8174     // basic tuning
8175 
8176     if (atomic_available) {
8177       if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8178         retval = atomic_reduce_block;
8179       }
8180     } // otherwise: use critical section
8181 
8182 #elif KMP_OS_DARWIN
8183 
8184     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8185     if (atomic_available && (num_vars <= 3)) {
8186       retval = atomic_reduce_block;
8187     } else if (tree_available) {
8188       if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8189           (reduce_size < (2000 * sizeof(kmp_real64)))) {
8190         retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8191       }
8192     } // otherwise: use critical section
8193 
8194 #else
8195 #error "Unknown or unsupported OS"
8196 #endif
8197 
8198 #else
8199 #error "Unknown or unsupported architecture"
8200 #endif
8201   }
8202 
8203   // KMP_FORCE_REDUCTION
8204 
8205   // If the team is serialized (team_size == 1), ignore the forced reduction
8206   // method and stay with the unsynchronized method (empty_reduce_block)
8207   if (__kmp_force_reduction_method != reduction_method_not_defined &&
8208       team_size != 1) {
8209 
8210     PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8211 
8212     int atomic_available, tree_available;
8213 
8214     switch ((forced_retval = __kmp_force_reduction_method)) {
8215     case critical_reduce_block:
8216       KMP_ASSERT(lck); // lck should be != 0
8217       break;
8218 
8219     case atomic_reduce_block:
8220       atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8221       if (!atomic_available) {
8222         KMP_WARNING(RedMethodNotSupported, "atomic");
8223         forced_retval = critical_reduce_block;
8224       }
8225       break;
8226 
8227     case tree_reduce_block:
8228       tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8229       if (!tree_available) {
8230         KMP_WARNING(RedMethodNotSupported, "tree");
8231         forced_retval = critical_reduce_block;
8232       } else {
8233 #if KMP_FAST_REDUCTION_BARRIER
8234         forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8235 #endif
8236       }
8237       break;
8238 
8239     default:
8240       KMP_ASSERT(0); // "unsupported method specified"
8241     }
8242 
8243     retval = forced_retval;
8244   }
8245 
8246   KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8247 
8248 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
8249 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8250 
8251   return (retval);
8252 }
// This function is for testing the set/get/determine reduce method.
8254 kmp_int32 __kmp_get_reduce_method(void) {
8255   return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8256 }
8257 
8258 // Soft pause sets up threads to ignore blocktime and just go to sleep.
8259 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
8260 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8261 
8262 // Hard pause shuts down the runtime completely.  Resume happens naturally when
8263 // OpenMP is used subsequently.
8264 void __kmp_hard_pause() {
8265   __kmp_pause_status = kmp_hard_paused;
8266   __kmp_internal_end_thread(-1);
8267 }
8268 
8269 // Soft resume sets __kmp_pause_status, and wakes up all threads.
8270 void __kmp_resume_if_soft_paused() {
8271   if (__kmp_pause_status == kmp_soft_paused) {
8272     __kmp_pause_status = kmp_not_paused;
8273 
8274     for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8275       kmp_info_t *thread = __kmp_threads[gtid];
8276       if (thread) { // Wake it if sleeping
8277         kmp_flag_64 fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
8278         if (fl.is_sleeping())
8279           fl.resume(gtid);
8280         else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8281           __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8282         } else { // thread holds the lock and may sleep soon
8283           do { // until either the thread sleeps, or we can get the lock
8284             if (fl.is_sleeping()) {
8285               fl.resume(gtid);
8286               break;
8287             } else if (__kmp_try_suspend_mx(thread)) {
8288               __kmp_unlock_suspend_mx(thread);
8289               break;
8290             }
8291           } while (1);
8292         }
8293       }
8294     }
8295   }
8296 }
8297 
8298 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
8299 // TODO: add warning messages
8300 int __kmp_pause_resource(kmp_pause_status_t level) {
8301   if (level == kmp_not_paused) { // requesting resume
8302     if (__kmp_pause_status == kmp_not_paused) {
8303       // error message about runtime not being paused, so can't resume
8304       return 1;
8305     } else {
8306       KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
8307                        __kmp_pause_status == kmp_hard_paused);
8308       __kmp_pause_status = kmp_not_paused;
8309       return 0;
8310     }
8311   } else if (level == kmp_soft_paused) { // requesting soft pause
8312     if (__kmp_pause_status != kmp_not_paused) {
8313       // error message about already being paused
8314       return 1;
8315     } else {
8316       __kmp_soft_pause();
8317       return 0;
8318     }
8319   } else if (level == kmp_hard_paused) { // requesting hard pause
8320     if (__kmp_pause_status != kmp_not_paused) {
8321       // error message about already being paused
8322       return 1;
8323     } else {
8324       __kmp_hard_pause();
8325       return 0;
8326     }
8327   } else {
8328     // error message about invalid level
8329     return 1;
8330   }
8331 }
8332 
8333 
8334 void __kmp_omp_display_env(int verbose) {
8335   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8336   if (__kmp_init_serial == 0)
8337     __kmp_do_serial_initialize();
8338   __kmp_display_env_impl(!verbose, verbose);
8339   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8340 }
8341