1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 //                     The LLVM Compiler Infrastructure
8 //
9 // This file is dual licensed under the MIT and the University of Illinois Open
10 // Source Licenses. See LICENSE.txt for details.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "kmp.h"
15 #include "kmp_affinity.h"
16 #include "kmp_atomic.h"
17 #include "kmp_environment.h"
18 #include "kmp_error.h"
19 #include "kmp_i18n.h"
20 #include "kmp_io.h"
21 #include "kmp_itt.h"
22 #include "kmp_settings.h"
23 #include "kmp_stats.h"
24 #include "kmp_str.h"
25 #include "kmp_wait_release.h"
26 #include "kmp_wrapper_getpid.h"
27 
28 #if OMPT_SUPPORT
29 #include "ompt-specific.h"
30 #endif
31 
32 /* these are temporary issues to be dealt with */
33 #define KMP_USE_PRCTL 0
34 
35 #if KMP_OS_WINDOWS
36 #include <process.h>
37 #endif
38 
39 #include "tsan_annotations.h"
40 
41 #if defined(KMP_GOMP_COMPAT)
42 char const __kmp_version_alt_comp[] =
43     KMP_VERSION_PREFIX "alternative compiler support: yes";
44 #endif /* defined(KMP_GOMP_COMPAT) */
45 
46 char const __kmp_version_omp_api[] = KMP_VERSION_PREFIX "API version: "
47 #if OMP_50_ENABLED
48                                                         "5.0 (201611)";
49 #elif OMP_45_ENABLED
50                                                         "4.5 (201511)";
51 #elif OMP_40_ENABLED
52                                                         "4.0 (201307)";
53 #else
54                                                         "3.1 (201107)";
55 #endif
56 
57 #ifdef KMP_DEBUG
58 char const __kmp_version_lock[] =
59     KMP_VERSION_PREFIX "lock type: run time selectable";
60 #endif /* KMP_DEBUG */
61 
62 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
63 
64 /* ------------------------------------------------------------------------ */
65 
66 kmp_info_t __kmp_monitor;
67 
68 /* Forward declarations */
69 
70 void __kmp_cleanup(void);
71 
72 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
73                                   int gtid);
74 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
75                                   kmp_internal_control_t *new_icvs,
76                                   ident_t *loc);
77 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
78 static void __kmp_partition_places(kmp_team_t *team,
79                                    int update_master_only = 0);
80 #endif
81 static void __kmp_do_serial_initialize(void);
82 void __kmp_fork_barrier(int gtid, int tid);
83 void __kmp_join_barrier(int gtid);
84 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
85                           kmp_internal_control_t *new_icvs, ident_t *loc);
86 
87 #ifdef USE_LOAD_BALANCE
88 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
89 #endif
90 
91 static int __kmp_expand_threads(int nNeed);
92 #if KMP_OS_WINDOWS
93 static int __kmp_unregister_root_other_thread(int gtid);
94 #endif
95 static void __kmp_unregister_library(void); // called by __kmp_internal_end()
96 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
97 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
98 
99 /* Calculate the identifier of the current thread. */
100 /* A fast (and somewhat portable) way to get a unique identifier for the
101    executing thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
102 int __kmp_get_global_thread_id() {
103   int i;
104   kmp_info_t **other_threads;
105   size_t stack_data;
106   char *stack_addr;
107   size_t stack_size;
108   char *stack_base;
109 
110   KA_TRACE(
111       1000,
112       ("*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
113        __kmp_nth, __kmp_all_nth));
114 
115   /* JPH - to handle the case where __kmpc_end(0) is called immediately before
116      a parallel region, this returns KMP_GTID_DNE to force serial_initialize by
117      the caller. KMP_GTID_DNE must then be handled at all call sites, or else
118      __kmp_init_gtid must be guaranteed, for this to work. */
119 
120   if (!TCR_4(__kmp_init_gtid))
121     return KMP_GTID_DNE;
122 
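  /* Orientation note (inferred from the branches below, not a formal spec):
     __kmp_gtid_mode selects how the gtid is recovered. Mode 3 reads a
     thread-local variable directly (TDATA), mode 2 queries the keyed
     thread-specific API, and lower modes fall back to the stack-address search
     over __kmp_threads[] implemented underneath. */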
123 #ifdef KMP_TDATA_GTID
124   if (TCR_4(__kmp_gtid_mode) >= 3) {
125     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
126     return __kmp_gtid;
127   }
128 #endif
129   if (TCR_4(__kmp_gtid_mode) >= 2) {
130     KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
131     return __kmp_gtid_get_specific();
132   }
133   KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
134 
135   stack_addr = (char *)&stack_data;
136   other_threads = __kmp_threads;
137 
138   /* ATT: The code below is a source of potential bugs due to unsynchronized
139      access to __kmp_threads array. For example:
140      1. Current thread loads other_threads[i] to thr and checks it, it is
141         non-NULL.
142      2. Current thread is suspended by OS.
143      3. Another thread unregisters and finishes (debug versions of free()
144         may fill memory with something like 0xEF).
145      4. Current thread is resumed.
146      5. Current thread reads junk from *thr.
147      TODO: Fix it.  --ln  */
148 
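  /* The search below relies on the stack window recorded for each registered
     thread: ds_stackbase is the high end of thread i's stack and ds_stacksize
     its extent, so (sketching the invariant) the currently executing thread is
     thread i exactly when 0 <= stack_base - &stack_data <= stack_size, because
     a local such as stack_data must live on the executing thread's own
     stack. */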
149   for (i = 0; i < __kmp_threads_capacity; i++) {
150 
151     kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
152     if (!thr)
153       continue;
154 
155     stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
156     stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
157 
158     /* stack grows down -- search through all of the active threads */
159 
160     if (stack_addr <= stack_base) {
161       size_t stack_diff = stack_base - stack_addr;
162 
163       if (stack_diff <= stack_size) {
164         /* The only way we can be closer than the allocated */
165         /* stack size is if we are running on this thread. */
166         KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
167         return i;
168       }
169     }
170   }
171 
172   /* use the thread-specific value to try to determine our gtid */
173   KA_TRACE(1000,
174            ("*** __kmp_get_global_thread_id: internal alg. failed to find "
175             "thread, using TLS\n"));
176   i = __kmp_gtid_get_specific();
177 
178   /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */
179 
180   /* if we haven't been assigned a gtid, return the error code */
181   if (i < 0)
182     return i;
183 
184   /* dynamically updated stack window for uber threads to avoid get_specific
185      call */
186   if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
187     KMP_FATAL(StackOverflow, i);
188   }
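  /* Reaching this point means keyed TLS reports that we are thread i, yet the
     current stack address fell outside the recorded window. If the window is
     not allowed to grow, treat this as a stack overflow; otherwise the window
     is widened below, in whichever direction is needed, to cover the new
     address. */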
189 
190   stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
191   if (stack_addr > stack_base) {
192     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
193     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
194             other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
195                 stack_base);
196   } else {
197     TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
198             stack_base - stack_addr);
199   }
200 
201   /* Reprint stack bounds for ubermaster since they have been refined */
202   if (__kmp_storage_map) {
203     char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
204     char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
205     __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
206                                  other_threads[i]->th.th_info.ds.ds_stacksize,
207                                  "th_%d stack (refinement)", i);
208   }
209   return i;
210 }
211 
212 int __kmp_get_global_thread_id_reg() {
213   int gtid;
214 
215   if (!__kmp_init_serial) {
216     gtid = KMP_GTID_DNE;
217   } else
218 #ifdef KMP_TDATA_GTID
219       if (TCR_4(__kmp_gtid_mode) >= 3) {
220     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
221     gtid = __kmp_gtid;
222   } else
223 #endif
224       if (TCR_4(__kmp_gtid_mode) >= 2) {
225     KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
226     gtid = __kmp_gtid_get_specific();
227   } else {
228     KA_TRACE(1000,
229              ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
230     gtid = __kmp_get_global_thread_id();
231   }
232 
233   /* we must be a new uber master sibling thread */
234   if (gtid == KMP_GTID_DNE) {
235     KA_TRACE(10,
236              ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
237               "Registering a new gtid.\n"));
238     __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
239     if (!__kmp_init_serial) {
240       __kmp_do_serial_initialize();
241       gtid = __kmp_gtid_get_specific();
242     } else {
243       gtid = __kmp_register_root(FALSE);
244     }
245     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
246     /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
247   }
248 
249   KMP_DEBUG_ASSERT(gtid >= 0);
250 
251   return gtid;
252 }
253 
254 /* caller must hold forkjoin_lock */
255 void __kmp_check_stack_overlap(kmp_info_t *th) {
256   int f;
257   char *stack_beg = NULL;
258   char *stack_end = NULL;
259   int gtid;
260 
261   KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
262   if (__kmp_storage_map) {
263     stack_end = (char *)th->th.th_info.ds.ds_stackbase;
264     stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
265 
266     gtid = __kmp_gtid_from_thread(th);
267 
268     if (gtid == KMP_GTID_MONITOR) {
269       __kmp_print_storage_map_gtid(
270           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
271           "th_%s stack (%s)", "mon",
272           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
273     } else {
274       __kmp_print_storage_map_gtid(
275           gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
276           "th_%d stack (%s)", gtid,
277           (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
278     }
279   }
280 
281   /* No point in checking ubermaster threads since they use refinement and
282    * cannot overlap */
283   gtid = __kmp_gtid_from_thread(th);
284   if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
285     KA_TRACE(10,
286              ("__kmp_check_stack_overlap: performing extensive checking\n"));
287     if (stack_beg == NULL) {
288       stack_end = (char *)th->th.th_info.ds.ds_stackbase;
289       stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
290     }
291 
292     for (f = 0; f < __kmp_threads_capacity; f++) {
293       kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
294 
295       if (f_th && f_th != th) {
296         char *other_stack_end =
297             (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
298         char *other_stack_beg =
299             other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
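        // Classic interval test: flag an overlap if either end of this
        // thread's stack falls strictly inside the other thread's
        // [other_stack_beg, other_stack_end) range.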
300         if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
301             (stack_end > other_stack_beg && stack_end < other_stack_end)) {
302 
303           /* Print the other stack values before the abort */
304           if (__kmp_storage_map)
305             __kmp_print_storage_map_gtid(
306                 -1, other_stack_beg, other_stack_end,
307                 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
308                 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
309 
310           __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
311                       __kmp_msg_null);
312         }
313       }
314     }
315   }
316   KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
317 }
318 
319 /* ------------------------------------------------------------------------ */
320 
321 void __kmp_infinite_loop(void) {
322   static int done = FALSE;
323 
324   while (!done) {
325     KMP_YIELD(1);
326   }
327 }
328 
329 #define MAX_MESSAGE 512
330 
331 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
332                                   char const *format, ...) {
333   char buffer[MAX_MESSAGE];
334   va_list ap;
335 
336   va_start(ap, format);
337   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
338                p2, (unsigned long)size, format);
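  // Note that the caller's format string is spliced into 'buffer' here; the
  // caller's variadic arguments are applied only when 'buffer' itself is used
  // as the format string by __kmp_vprintf below.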
339   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
340   __kmp_vprintf(kmp_err, buffer, ap);
341 #if KMP_PRINT_DATA_PLACEMENT
342   int node;
343   if (gtid >= 0) {
344     if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
345       if (__kmp_storage_map_verbose) {
346         node = __kmp_get_host_node(p1);
347         if (node < 0) /* doesn't work, so don't try this next time */
348           __kmp_storage_map_verbose = FALSE;
349         else {
350           char *last;
351           int lastNode;
352           int localProc = __kmp_get_cpu_from_gtid(gtid);
353 
354           const int page_size = KMP_GET_PAGE_SIZE();
355 
356           p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
357           p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
358           if (localProc >= 0)
359             __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
360                                  localProc >> 1);
361           else
362             __kmp_printf_no_lock("  GTID %d\n", gtid);
363 #if KMP_USE_PRCTL
364           /* The more elaborate format is disabled for now because of the prctl
365            * hanging bug. */
366           do {
367             last = (char *)p1;
368             lastNode = node;
369             /* This loop collates adjacent pages with the same host node. */
370             do {
371               p1 = (char *)p1 + page_size;
372             } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
373             __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
374                                  lastNode);
375           } while (p1 <= p2);
376 #else
377           __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
378                                (char *)p1 + (page_size - 1),
379                                __kmp_get_host_node(p1));
380           if (p1 < p2) {
381             __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
382                                  (char *)p2 + (page_size - 1),
383                                  __kmp_get_host_node(p2));
384           }
385 #endif
386         }
387       }
388     } else
389       __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
390   }
391 #endif /* KMP_PRINT_DATA_PLACEMENT */
392   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
393 }
394 
395 void __kmp_warn(char const *format, ...) {
396   char buffer[MAX_MESSAGE];
397   va_list ap;
398 
399   if (__kmp_generate_warnings == kmp_warnings_off) {
400     return;
401   }
402 
403   va_start(ap, format);
404 
405   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
406   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
407   __kmp_vprintf(kmp_err, buffer, ap);
408   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
409 
410   va_end(ap);
411 }
412 
413 void __kmp_abort_process() {
414   // Later threads may stall here, but that's ok because abort() will kill them.
415   __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
416 
417   if (__kmp_debug_buf) {
418     __kmp_dump_debug_buffer();
419   }
420 
421   if (KMP_OS_WINDOWS) {
422     // Let other threads know of abnormal termination and prevent deadlock
423     // if abort happened during library initialization or shutdown
424     __kmp_global.g.g_abort = SIGABRT;
425 
426     /* On Windows* OS, abort() by default raises a pop-up error box, which
427        stalls nightly testing. Unfortunately, we cannot reliably suppress these
428        pop-up boxes. _set_abort_behavior() works well, but the function is not
429        available in VS7 (this is not a problem for the DLL, but it is a problem
430        for the static OpenMP RTL). SetErrorMode (and so, the timelimit utility)
431        does not help, at least in some versions of the MS C RTL.
432 
433        The following sequence seems to be the only way to simulate abort() and
434        avoid the pop-up error box. */
435     raise(SIGABRT);
436     _exit(3); // Just in case, if signal ignored, exit anyway.
437   } else {
438     abort();
439   }
440 
441   __kmp_infinite_loop();
442   __kmp_release_bootstrap_lock(&__kmp_exit_lock);
443 
444 } // __kmp_abort_process
445 
446 void __kmp_abort_thread(void) {
447   // TODO: Eliminate g_abort global variable and this function.
448   // In case of abort just call abort(), it will kill all the threads.
449   __kmp_infinite_loop();
450 } // __kmp_abort_thread
451 
452 /* Print out the storage map for the major kmp_info_t thread data structures
453    that are allocated together. */
454 
455 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
456   __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
457                                gtid);
458 
459   __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
460                                sizeof(kmp_desc_t), "th_%d.th_info", gtid);
461 
462   __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
463                                sizeof(kmp_local_t), "th_%d.th_local", gtid);
464 
465   __kmp_print_storage_map_gtid(
466       gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
467       sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
468 
469   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
470                                &thr->th.th_bar[bs_plain_barrier + 1],
471                                sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
472                                gtid);
473 
474   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
475                                &thr->th.th_bar[bs_forkjoin_barrier + 1],
476                                sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
477                                gtid);
478 
479 #if KMP_FAST_REDUCTION_BARRIER
480   __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
481                                &thr->th.th_bar[bs_reduction_barrier + 1],
482                                sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
483                                gtid);
484 #endif // KMP_FAST_REDUCTION_BARRIER
485 }
486 
487 /* Print out the storage map for the major kmp_team_t team data structures
488    that are allocated together. */
489 
490 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
491                                          int team_id, int num_thr) {
492   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
493   __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
494                                header, team_id);
495 
496   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
497                                &team->t.t_bar[bs_last_barrier],
498                                sizeof(kmp_balign_team_t) * bs_last_barrier,
499                                "%s_%d.t_bar", header, team_id);
500 
501   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
502                                &team->t.t_bar[bs_plain_barrier + 1],
503                                sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
504                                header, team_id);
505 
506   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
507                                &team->t.t_bar[bs_forkjoin_barrier + 1],
508                                sizeof(kmp_balign_team_t),
509                                "%s_%d.t_bar[forkjoin]", header, team_id);
510 
511 #if KMP_FAST_REDUCTION_BARRIER
512   __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
513                                &team->t.t_bar[bs_reduction_barrier + 1],
514                                sizeof(kmp_balign_team_t),
515                                "%s_%d.t_bar[reduction]", header, team_id);
516 #endif // KMP_FAST_REDUCTION_BARRIER
517 
518   __kmp_print_storage_map_gtid(
519       -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
520       sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
521 
522   __kmp_print_storage_map_gtid(
523       -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
524       sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
525 
526   __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
527                                &team->t.t_disp_buffer[num_disp_buff],
528                                sizeof(dispatch_shared_info_t) * num_disp_buff,
529                                "%s_%d.t_disp_buffer", header, team_id);
530 
531   __kmp_print_storage_map_gtid(-1, &team->t.t_taskq, &team->t.t_copypriv_data,
532                                sizeof(kmp_taskq_t), "%s_%d.t_taskq", header,
533                                team_id);
534 }
535 
536 static void __kmp_init_allocator() {}
537 static void __kmp_fini_allocator() {}
538 
539 /* ------------------------------------------------------------------------ */
540 
541 #ifdef KMP_DYNAMIC_LIB
542 #if KMP_OS_WINDOWS
543 
544 static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
545   // TODO: Change to __kmp_break_bootstrap_lock().
546   __kmp_init_bootstrap_lock(lck); // make the lock released
547 }
548 
549 static void __kmp_reset_locks_on_process_detach(int gtid_req) {
550   int i;
551   int thread_count;
552 
553   // PROCESS_DETACH is expected to be called by a thread that executes
554   // ProcessExit() or FreeLibrary(). The OS terminates the other threads (except
555   // the one calling ProcessExit or FreeLibrary), so it might seem safe to access
556   // __kmp_threads[] without taking the forkjoin_lock. In fact, however, some
557   // threads can still be alive here, although they are about to be terminated.
558   // The entries in the array with ds_thread==0 are the most suspect, so it may
559   // not actually be safe to access __kmp_threads[].
560 
561   // TODO: does it make sense to check __kmp_roots[] ?
562 
563   // Check that no other live threads remain registered with the OpenMP
564   // library, spinning until they have all exited.
565   while (1) {
566     thread_count = 0;
567     for (i = 0; i < __kmp_threads_capacity; ++i) {
568       if (!__kmp_threads)
569         continue;
570       kmp_info_t *th = __kmp_threads[i];
571       if (th == NULL)
572         continue;
573       int gtid = th->th.th_info.ds.ds_gtid;
574       if (gtid == gtid_req)
575         continue;
576       if (gtid < 0)
577         continue;
578       DWORD exit_val;
579       int alive = __kmp_is_thread_alive(th, &exit_val);
580       if (alive) {
581         ++thread_count;
582       }
583     }
584     if (thread_count == 0)
585       break; // success
586   }
587 
588   // Assume that I'm alone. Now it might be safe to check and reset locks.
589   // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
590   __kmp_reset_lock(&__kmp_forkjoin_lock);
591 #ifdef KMP_DEBUG
592   __kmp_reset_lock(&__kmp_stdio_lock);
593 #endif // KMP_DEBUG
594 }
595 
596 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
597   //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
598 
599   switch (fdwReason) {
600 
601   case DLL_PROCESS_ATTACH:
602     KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
603 
604     return TRUE;
605 
606   case DLL_PROCESS_DETACH:
607     KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
608 
609     if (lpReserved != NULL) {
610       // lpReserved is used for telling the difference:
611       //   lpReserved == NULL when FreeLibrary() was called,
612       //   lpReserved != NULL when the process terminates.
613       // When FreeLibrary() is called, worker threads remain alive. So they will
614       // release the forkjoin lock by themselves. When the process terminates,
615       // worker threads disappear triggering the problem of unreleased forkjoin
616       // lock as described below.
617 
618       // A worker thread can take the forkjoin lock. The problem comes up if
619       // that worker thread becomes dead before it releases the forkjoin lock.
620       // The forkjoin lock remains taken, while the thread executing
621       // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try
622       // to take the forkjoin lock and will always fail, so that the application
623       // will never finish [normally]. This scenario is possible if
624       // __kmpc_end() has not been executed. This is not a corner case; it
625       // arises in common situations:
626       // - the main function was compiled by an alternative compiler;
627       // - the main function was compiled by icl but without /Qopenmp
628       //   (application with plugins);
629       // - application terminates by calling C exit(), Fortran CALL EXIT() or
630       //   Fortran STOP.
631       // - alive foreign thread prevented __kmpc_end from doing cleanup.
632       //
633       // This is a hack to work around the problem.
634       // TODO: !!! figure out something better.
635       __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
636     }
637 
638     __kmp_internal_end_library(__kmp_gtid_get_specific());
639 
640     return TRUE;
641 
642   case DLL_THREAD_ATTACH:
643     KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
644 
645     /* if we wanted to register new sibling threads every time, we would call
646      * __kmp_get_gtid() here */
647     return TRUE;
648 
649   case DLL_THREAD_DETACH:
650     KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
651 
652     __kmp_internal_end_thread(__kmp_gtid_get_specific());
653     return TRUE;
654   }
655 
656   return TRUE;
657 }
658 
659 #endif /* KMP_OS_WINDOWS */
660 #endif /* KMP_DYNAMIC_LIB */
661 
662 /* Change the library type to "status" and return the old type */
663 /* called from within initialization routines where __kmp_initz_lock is held */
664 int __kmp_change_library(int status) {
665   int old_status;
666 
667   old_status = __kmp_yield_init &
668                1; // check whether KMP_LIBRARY=throughput (even init count)
669 
670   if (status) {
671     __kmp_yield_init |= 1; // throughput => turnaround (odd init count)
672   } else {
673     __kmp_yield_init &= ~1; // turnaround => throughput (even init count)
674   }
675 
676   return old_status; // return previous setting of whether
677   // KMP_LIBRARY=throughput
678 }
679 
680 /* __kmp_parallel_deo -- Wait until it's our turn. */
681 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
682   int gtid = *gtid_ref;
683 #ifdef BUILD_PARALLEL_ORDERED
684   kmp_team_t *team = __kmp_team_from_gtid(gtid);
685 #endif /* BUILD_PARALLEL_ORDERED */
686 
687   if (__kmp_env_consistency_check) {
688     if (__kmp_threads[gtid]->th.th_root->r.r_active)
689 #if KMP_USE_DYNAMIC_LOCK
690       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
691 #else
692       __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
693 #endif
694   }
695 #ifdef BUILD_PARALLEL_ORDERED
696   if (!team->t.t_serialized) {
697     KMP_MB();
698     KMP_WAIT_YIELD(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid),
699                    KMP_EQ, NULL);
700     KMP_MB();
701   }
702 #endif /* BUILD_PARALLEL_ORDERED */
703 }
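/* Taken together, __kmp_parallel_deo above and __kmp_parallel_dxo below
   implement ordered sections as a simple token pass: t_ordered.dt.t_value
   holds the tid whose turn it is, deo spins until that value equals the
   calling thread's tid, and dxo hands the token to (tid + 1) % t_nproc. */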
704 
705 /* __kmp_parallel_dxo -- Signal the next task. */
706 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
707   int gtid = *gtid_ref;
708 #ifdef BUILD_PARALLEL_ORDERED
709   int tid = __kmp_tid_from_gtid(gtid);
710   kmp_team_t *team = __kmp_team_from_gtid(gtid);
711 #endif /* BUILD_PARALLEL_ORDERED */
712 
713   if (__kmp_env_consistency_check) {
714     if (__kmp_threads[gtid]->th.th_root->r.r_active)
715       __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
716   }
717 #ifdef BUILD_PARALLEL_ORDERED
718   if (!team->t.t_serialized) {
719     KMP_MB(); /* Flush all pending memory write invalidates.  */
720 
721     /* use the tid of the next thread in this team */
722     /* TODO replace with general release procedure */
723     team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
724 
725     KMP_MB(); /* Flush all pending memory write invalidates.  */
726   }
727 #endif /* BUILD_PARALLEL_ORDERED */
728 }
729 
730 /* ------------------------------------------------------------------------ */
731 /* The BARRIER for a SINGLE process section is always explicit   */
732 
733 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
734   int status;
735   kmp_info_t *th;
736   kmp_team_t *team;
737 
738   if (!TCR_4(__kmp_init_parallel))
739     __kmp_parallel_initialize();
740 
741   th = __kmp_threads[gtid];
742   team = th->th.th_team;
743   status = 0;
744 
745   th->th.th_ident = id_ref;
746 
747   if (team->t.t_serialized) {
748     status = 1;
749   } else {
750     kmp_int32 old_this = th->th.th_local.this_construct;
751 
752     ++th->th.th_local.this_construct;
753     /* try to set team count to thread count--success means thread got the
754        single block */
755     /* TODO: Should this be acquire or release? */
756     if (team->t.t_construct == old_this) {
757       status = KMP_COMPARE_AND_STORE_ACQ32(&team->t.t_construct, old_this,
758                                            th->th.th_local.this_construct);
759     }
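    // Protocol sketch: every thread advances its private this_construct
    // counter at each single region it reaches; the one thread whose CAS
    // advances the shared team counter t_construct wins the region, and all
    // others fall through with status still 0.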
760 #if USE_ITT_BUILD
761     if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
762         KMP_MASTER_GTID(gtid) &&
763 #if OMP_40_ENABLED
764         th->th.th_teams_microtask == NULL &&
765 #endif
766         team->t.t_active_level ==
767             1) { // Only report metadata by master of active team at level 1
768       __kmp_itt_metadata_single(id_ref);
769     }
770 #endif /* USE_ITT_BUILD */
771   }
772 
773   if (__kmp_env_consistency_check) {
774     if (status && push_ws) {
775       __kmp_push_workshare(gtid, ct_psingle, id_ref);
776     } else {
777       __kmp_check_workshare(gtid, ct_psingle, id_ref);
778     }
779   }
780 #if USE_ITT_BUILD
781   if (status) {
782     __kmp_itt_single_start(gtid);
783   }
784 #endif /* USE_ITT_BUILD */
785   return status;
786 }
787 
788 void __kmp_exit_single(int gtid) {
789 #if USE_ITT_BUILD
790   __kmp_itt_single_end(gtid);
791 #endif /* USE_ITT_BUILD */
792   if (__kmp_env_consistency_check)
793     __kmp_pop_workshare(gtid, ct_psingle, NULL);
794 }
795 
796 /* Determine whether we can go parallel or must use a serialized parallel
797  * region, and how many threads we can use.
798  * set_nthreads is the number of threads requested for the team.
799  * Returns 1 if we should serialize or only use one thread,
800  * otherwise the number of threads to use.
801  * The forkjoin lock is held by the caller. */
802 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
803                                  int master_tid, int set_nthreads
804 #if OMP_40_ENABLED
805                                  ,
806                                  int enter_teams
807 #endif /* OMP_40_ENABLED */
808                                  ) {
809   int capacity;
810   int new_nthreads;
811   KMP_DEBUG_ASSERT(__kmp_init_serial);
812   KMP_DEBUG_ASSERT(root && parent_team);
813 
814   // If dyn-var is set, dynamically adjust the number of desired threads,
815   // according to the method specified by dynamic_mode.
816   new_nthreads = set_nthreads;
817   if (!get__dynamic_2(parent_team, master_tid)) {
818     ;
819   }
820 #ifdef USE_LOAD_BALANCE
821   else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
822     new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
823     if (new_nthreads == 1) {
824       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
825                     "reservation to 1 thread\n",
826                     master_tid));
827       return 1;
828     }
829     if (new_nthreads < set_nthreads) {
830       KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
831                     "reservation to %d threads\n",
832                     master_tid, new_nthreads));
833     }
834   }
835 #endif /* USE_LOAD_BALANCE */
836   else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
837     new_nthreads = __kmp_avail_proc - __kmp_nth +
838                    (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
839     if (new_nthreads <= 1) {
840       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
841                     "reservation to 1 thread\n",
842                     master_tid));
843       return 1;
844     }
845     if (new_nthreads < set_nthreads) {
846       KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
847                     "reservation to %d threads\n",
848                     master_tid, new_nthreads));
849     } else {
850       new_nthreads = set_nthreads;
851     }
852   } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
853     if (set_nthreads > 2) {
854       new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
855       new_nthreads = (new_nthreads % set_nthreads) + 1;
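      // Reducing the raw random value modulo set_nthreads and adding 1 yields
      // a team size chosen from the range [1, set_nthreads].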
856       if (new_nthreads == 1) {
857         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
858                       "reservation to 1 thread\n",
859                       master_tid));
860         return 1;
861       }
862       if (new_nthreads < set_nthreads) {
863         KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
864                       "reservation to %d threads\n",
865                       master_tid, new_nthreads));
866       }
867     }
868   } else {
869     KMP_ASSERT(0);
870   }
871 
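  // In each limit check below, the prospective thread count is the current
  // total (__kmp_nth) plus the new request, minus the threads that will be
  // reused rather than created: just the master if this root is already
  // active, otherwise the root's entire existing hot team.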
872   // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
873   if (__kmp_nth + new_nthreads -
874           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
875       __kmp_max_nth) {
876     int tl_nthreads = __kmp_max_nth - __kmp_nth +
877                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
878     if (tl_nthreads <= 0) {
879       tl_nthreads = 1;
880     }
881 
882     // If dyn-var is false, emit a 1-time warning.
883     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
884       __kmp_reserve_warn = 1;
885       __kmp_msg(kmp_ms_warning,
886                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
887                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
888     }
889     if (tl_nthreads == 1) {
890       KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
891                     "reduced reservation to 1 thread\n",
892                     master_tid));
893       return 1;
894     }
895     KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
896                   "reservation to %d threads\n",
897                   master_tid, tl_nthreads));
898     new_nthreads = tl_nthreads;
899   }
900 
901   // Respect OMP_THREAD_LIMIT
902   if (root->r.r_cg_nthreads + new_nthreads -
903           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
904       __kmp_cg_max_nth) {
905     int tl_nthreads = __kmp_cg_max_nth - root->r.r_cg_nthreads +
906                       (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
907     if (tl_nthreads <= 0) {
908       tl_nthreads = 1;
909     }
910 
911     // If dyn-var is false, emit a 1-time warning.
912     if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
913       __kmp_reserve_warn = 1;
914       __kmp_msg(kmp_ms_warning,
915                 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
916                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
917     }
918     if (tl_nthreads == 1) {
919       KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
920                     "reduced reservation to 1 thread\n",
921                     master_tid));
922       return 1;
923     }
924     KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
925                   "reservation to %d threads\n",
926                   master_tid, tl_nthreads));
927     new_nthreads = tl_nthreads;
928   }
929 
930   // Check if the threads array is large enough, or needs expanding.
931   // See comment in __kmp_register_root() about the adjustment if
932   // __kmp_threads[0] == NULL.
933   capacity = __kmp_threads_capacity;
934   if (TCR_PTR(__kmp_threads[0]) == NULL) {
935     --capacity;
936   }
937   if (__kmp_nth + new_nthreads -
938           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
939       capacity) {
940     // Expand the threads array.
941     int slotsRequired = __kmp_nth + new_nthreads -
942                         (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
943                         capacity;
944     int slotsAdded = __kmp_expand_threads(slotsRequired);
945     if (slotsAdded < slotsRequired) {
946       // The threads array was not expanded enough.
947       new_nthreads -= (slotsRequired - slotsAdded);
948       KMP_ASSERT(new_nthreads >= 1);
949 
950       // If dyn-var is false, emit a 1-time warning.
951       if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
952         __kmp_reserve_warn = 1;
953         if (__kmp_tp_cached) {
954           __kmp_msg(kmp_ms_warning,
955                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
956                     KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
957                     KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
958         } else {
959           __kmp_msg(kmp_ms_warning,
960                     KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
961                     KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
962         }
963       }
964     }
965   }
966 
967 #ifdef KMP_DEBUG
968   if (new_nthreads == 1) {
969     KC_TRACE(10,
970              ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
971               "dead roots and rechecking; requested %d threads\n",
972               __kmp_get_gtid(), set_nthreads));
973   } else {
974     KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
975                   " %d threads\n",
976                   __kmp_get_gtid(), new_nthreads, set_nthreads));
977   }
978 #endif // KMP_DEBUG
979   return new_nthreads;
980 }
981 
982 /* Allocate threads from the thread pool and assign them to the new team. We are
983    assured that there are enough threads available, because we checked this
984    earlier while holding the forkjoin lock. */
985 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
986                                     kmp_info_t *master_th, int master_gtid) {
987   int i;
988   int use_hot_team;
989 
990   KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
991   KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
992   KMP_MB();
993 
994   /* first, let's set up the master thread */
995   master_th->th.th_info.ds.ds_tid = 0;
996   master_th->th.th_team = team;
997   master_th->th.th_team_nproc = team->t.t_nproc;
998   master_th->th.th_team_master = master_th;
999   master_th->th.th_team_serialized = FALSE;
1000   master_th->th.th_dispatch = &team->t.t_dispatch[0];
1001 
1002 /* make sure we are not the optimized hot team */
1003 #if KMP_NESTED_HOT_TEAMS
1004   use_hot_team = 0;
1005   kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
1006   if (hot_teams) { // hot teams array is not allocated if
1007     // KMP_HOT_TEAMS_MAX_LEVEL=0
1008     int level = team->t.t_active_level - 1; // index in array of hot teams
1009     if (master_th->th.th_teams_microtask) { // are we inside the teams?
1010       if (master_th->th.th_teams_size.nteams > 1) {
1011         ++level; // level was not increased in teams construct for
1012         // team_of_masters
1013       }
1014       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1015           master_th->th.th_teams_level == team->t.t_level) {
1016         ++level; // level was not increased in teams construct for
1017         // team_of_workers before the parallel
1018       } // team->t.t_level will be increased inside parallel
1019     }
1020     if (level < __kmp_hot_teams_max_level) {
1021       if (hot_teams[level].hot_team) {
1022         // hot team has already been allocated for given level
1023         KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
1024         use_hot_team = 1; // the team is ready to use
1025       } else {
1026         use_hot_team = 0; // AC: threads are not allocated yet
1027         hot_teams[level].hot_team = team; // remember new hot team
1028         hot_teams[level].hot_team_nth = team->t.t_nproc;
1029       }
1030     } else {
1031       use_hot_team = 0;
1032     }
1033   }
1034 #else
1035   use_hot_team = team == root->r.r_hot_team;
1036 #endif
1037   if (!use_hot_team) {
1038 
1039     /* install the master thread */
1040     team->t.t_threads[0] = master_th;
1041     __kmp_initialize_info(master_th, team, 0, master_gtid);
1042 
1043     /* now, install the worker threads */
1044     for (i = 1; i < team->t.t_nproc; i++) {
1045 
1046       /* fork or reallocate a new thread and install it in team */
1047       kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1048       team->t.t_threads[i] = thr;
1049       KMP_DEBUG_ASSERT(thr);
1050       KMP_DEBUG_ASSERT(thr->th.th_team == team);
1051       /* align team and thread arrived states */
1052       KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1053                     "T#%d(%d:%d) join =%llu, plain=%llu\n",
1054                     __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1055                     __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1056                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1057                     team->t.t_bar[bs_plain_barrier].b_arrived));
1058 #if OMP_40_ENABLED
1059       thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1060       thr->th.th_teams_level = master_th->th.th_teams_level;
1061       thr->th.th_teams_size = master_th->th.th_teams_size;
1062 #endif
1063       { // Initialize threads' barrier data.
1064         int b;
1065         kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1066         for (b = 0; b < bs_last_barrier; ++b) {
1067           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1068           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1069 #if USE_DEBUGGER
1070           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1071 #endif
1072         }
1073       }
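      // (Copying the team's current b_arrived counts keeps a reused worker's
      // per-barrier state in step with the team, so it neither waits on nor
      // releases a barrier phase that has already completed.)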
1074     }
1075 
1076 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1077     __kmp_partition_places(team);
1078 #endif
1079   }
1080 
1081   KMP_MB();
1082 }
1083 
1084 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1085 // Propagate any changes to the floating point control registers out to the team
1086 // We try to avoid unnecessary writes to the relevant cache line in the team
1087 // structure, so we don't make changes unless they are needed.
1088 inline static void propagateFPControl(kmp_team_t *team) {
1089   if (__kmp_inherit_fp_control) {
1090     kmp_int16 x87_fpu_control_word;
1091     kmp_uint32 mxcsr;
1092 
1093     // Get master values of FPU control flags (both X87 and vector)
1094     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1095     __kmp_store_mxcsr(&mxcsr);
1096     mxcsr &= KMP_X86_MXCSR_MASK;
1097 
1098     // There is no point looking at t_fp_control_saved here.
1099     // If it is TRUE, we still have to update the values if they are different
1100     // from those we now have. If it is FALSE we didn't save anything yet, but
1101     // our objective is the same. We have to ensure that the values in the team
1102     // are the same as those we have.
1103     // So, this code achieves what we need whether or not t_fp_control_saved is
1104     // true. By checking whether the value needs updating we avoid unnecessary
1105     // writes that would put the cache-line into a written state, causing all
1106     // threads in the team to have to read it again.
1107     KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1108     KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1109     // Although we don't use this value, other code in the runtime wants to know
1110     // whether it should restore them. So we must ensure it is correct.
1111     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1112   } else {
1113     // Similarly here. Don't write to this cache-line in the team structure
1114     // unless we have to.
1115     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1116   }
1117 }
1118 
1119 // Do the opposite, setting the hardware registers to the updated values from
1120 // the team.
1121 inline static void updateHWFPControl(kmp_team_t *team) {
1122   if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1123     // Only reset the fp control regs if they have been changed in the team by
1124     // the parallel region that we are exiting.
1125     kmp_int16 x87_fpu_control_word;
1126     kmp_uint32 mxcsr;
1127     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1128     __kmp_store_mxcsr(&mxcsr);
1129     mxcsr &= KMP_X86_MXCSR_MASK;
1130 
1131     if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1132       __kmp_clear_x87_fpu_status_word();
1133       __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1134     }
1135 
1136     if (team->t.t_mxcsr != mxcsr) {
1137       __kmp_load_mxcsr(&team->t.t_mxcsr);
1138     }
1139   }
1140 }
1141 #else
1142 #define propagateFPControl(x) ((void)0)
1143 #define updateHWFPControl(x) ((void)0)
1144 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1145 
1146 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1147                                      int realloc); // forward declaration
1148 
1149 /* Run a parallel region that has been serialized, so it runs only in a team
1150    consisting of the single master thread. */
1151 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1152   kmp_info_t *this_thr;
1153   kmp_team_t *serial_team;
1154 
1155   KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1156 
1157   /* Skip all this code for autopar serialized loops since it results in
1158      unacceptable overhead */
1159   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1160     return;
1161 
1162   if (!TCR_4(__kmp_init_parallel))
1163     __kmp_parallel_initialize();
1164 
1165   this_thr = __kmp_threads[global_tid];
1166   serial_team = this_thr->th.th_serial_team;
1167 
1168   /* utilize the serialized team held by this thread */
1169   KMP_DEBUG_ASSERT(serial_team);
1170   KMP_MB();
1171 
1172   if (__kmp_tasking_mode != tskm_immediate_exec) {
1173     KMP_DEBUG_ASSERT(
1174         this_thr->th.th_task_team ==
1175         this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1176     KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1177                      NULL);
1178     KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1179                   "team %p, new task_team = NULL\n",
1180                   global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1181     this_thr->th.th_task_team = NULL;
1182   }
1183 
1184 #if OMP_40_ENABLED
1185   kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1186   if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1187     proc_bind = proc_bind_false;
1188   } else if (proc_bind == proc_bind_default) {
1189     // No proc_bind clause was specified, so use the current value
1190     // of proc-bind-var for this parallel region.
1191     proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1192   }
1193   // Reset for next parallel region
1194   this_thr->th.th_set_proc_bind = proc_bind_default;
1195 #endif /* OMP_40_ENABLED */
1196 
1197 #if OMPT_SUPPORT
1198   ompt_data_t ompt_parallel_data;
1199   ompt_parallel_data.ptr = NULL;
1200   ompt_data_t *implicit_task_data;
1201   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1202   if (ompt_enabled.enabled &&
1203       this_thr->th.ompt_thread_info.state != omp_state_overhead) {
1204 
1205     ompt_task_info_t *parent_task_info;
1206     parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1207 
1208     parent_task_info->frame.enter_frame = OMPT_GET_FRAME_ADDRESS(1);
1209     if (ompt_enabled.ompt_callback_parallel_begin) {
1210       int team_size = 1;
1211 
1212       ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1213           &(parent_task_info->task_data), &(parent_task_info->frame),
1214           &ompt_parallel_data, team_size, ompt_invoker_program, codeptr);
1215     }
1216   }
1217 #endif // OMPT_SUPPORT
1218 
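  /* Roughly three cases follow: (1) the cached th_serial_team is free, so it
     is reused as-is; (2) the cached team is already serializing an outer
     region, so a fresh serial team is allocated; (3) this thread is already
     running on its serial team, so only t_serialized is bumped to add another
     nesting level. */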
1219   if (this_thr->th.th_team != serial_team) {
1220     // Nested level will be an index in the nested nthreads array
1221     int level = this_thr->th.th_team->t.t_level;
1222 
1223     if (serial_team->t.t_serialized) {
1224       /* this serial team is already in use
1225          TODO increase performance by making these locks more specific */
1226       kmp_team_t *new_team;
1227 
1228       __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1229 
1230       new_team = __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1231 #if OMPT_SUPPORT
1232                                      ompt_parallel_data,
1233 #endif
1234 #if OMP_40_ENABLED
1235                                      proc_bind,
1236 #endif
1237                                      &this_thr->th.th_current_task->td_icvs,
1238                                      0 USE_NESTED_HOT_ARG(NULL));
1239       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1240       KMP_ASSERT(new_team);
1241 
1242       /* setup new serialized team and install it */
1243       new_team->t.t_threads[0] = this_thr;
1244       new_team->t.t_parent = this_thr->th.th_team;
1245       serial_team = new_team;
1246       this_thr->th.th_serial_team = serial_team;
1247 
1248       KF_TRACE(
1249           10,
1250           ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1251            global_tid, serial_team));
1252 
1253       /* TODO: the above breaks the requirement that serialized teams must
1254          still be usable even when we run out of resources, since we may need
1255          to allocate a new one here */
1256     } else {
1257       KF_TRACE(
1258           10,
1259           ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1260            global_tid, serial_team));
1261     }
1262 
1263     /* we have to initialize this serial team */
1264     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1265     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1266     KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1267     serial_team->t.t_ident = loc;
1268     serial_team->t.t_serialized = 1;
1269     serial_team->t.t_nproc = 1;
1270     serial_team->t.t_parent = this_thr->th.th_team;
1271     serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1272     this_thr->th.th_team = serial_team;
1273     serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1274 
1275     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1276                   this_thr->th.th_current_task));
1277     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1278     this_thr->th.th_current_task->td_flags.executing = 0;
1279 
1280     __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1281 
1282     /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1283        implicit task for each serialized task represented by
1284        team->t.t_serialized? */
1285     copy_icvs(&this_thr->th.th_current_task->td_icvs,
1286               &this_thr->th.th_current_task->td_parent->td_icvs);
1287 
1288     // Thread value exists in the nested nthreads array for the next nested
1289     // level
1290     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1291       this_thr->th.th_current_task->td_icvs.nproc =
1292           __kmp_nested_nth.nth[level + 1];
1293     }
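    // Illustrative example (not taken from this file): with
    // OMP_NUM_THREADS="4,3" the nested-nth array holds {4, 3}, so a region
    // entered at level 0 installs 3 as the nproc ICV that a parallel region
    // nested inside this serialized one would request.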
1294 
1295 #if OMP_40_ENABLED
1296     if (__kmp_nested_proc_bind.used &&
1297         (level + 1 < __kmp_nested_proc_bind.used)) {
1298       this_thr->th.th_current_task->td_icvs.proc_bind =
1299           __kmp_nested_proc_bind.bind_types[level + 1];
1300     }
1301 #endif /* OMP_40_ENABLED */
1302 
1303 #if USE_DEBUGGER
1304     serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1305 #endif
1306     this_thr->th.th_info.ds.ds_tid = 0;
1307 
1308     /* set thread cache values */
1309     this_thr->th.th_team_nproc = 1;
1310     this_thr->th.th_team_master = this_thr;
1311     this_thr->th.th_team_serialized = 1;
1312 
1313     serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1314     serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1315 
1316     propagateFPControl(serial_team);
1317 
1318     /* check if we need to allocate dispatch buffers stack */
1319     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1320     if (!serial_team->t.t_dispatch->th_disp_buffer) {
1321       serial_team->t.t_dispatch->th_disp_buffer =
1322           (dispatch_private_info_t *)__kmp_allocate(
1323               sizeof(dispatch_private_info_t));
1324     }
1325     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1326 
1327     KMP_MB();
1328 
1329   } else {
1330     /* this serialized team is already being used,
1331      * that's fine, just add another nested level */
1332     KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1333     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1334     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1335     ++serial_team->t.t_serialized;
1336     this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1337 
1338     // Nested level will be an index in the nested nthreads array
1339     int level = this_thr->th.th_team->t.t_level;
1340     // Thread value exists in the nested nthreads array for the next nested
1341     // level
1342     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1343       this_thr->th.th_current_task->td_icvs.nproc =
1344           __kmp_nested_nth.nth[level + 1];
1345     }
1346     serial_team->t.t_level++;
1347     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1348                   "of serial team %p to %d\n",
1349                   global_tid, serial_team, serial_team->t.t_level));
1350 
1351     /* allocate/push dispatch buffers stack */
1352     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1353     {
1354       dispatch_private_info_t *disp_buffer =
1355           (dispatch_private_info_t *)__kmp_allocate(
1356               sizeof(dispatch_private_info_t));
1357       disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1358       serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1359     }
1360     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1361 
1362     KMP_MB();
1363   }
1364 #if OMP_40_ENABLED
1365   KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1366 #endif
1367 
1368   if (__kmp_env_consistency_check)
1369     __kmp_push_parallel(global_tid, NULL);
1370 #if OMPT_SUPPORT
1371   serial_team->t.ompt_team_info.master_return_address = codeptr;
1372   if (ompt_enabled.enabled &&
1373       this_thr->th.ompt_thread_info.state != omp_state_overhead) {
1374     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = OMPT_GET_FRAME_ADDRESS(1);
1375 
1376     ompt_lw_taskteam_t lw_taskteam;
1377     __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1378                             &ompt_parallel_data, codeptr);
1379 
1380     __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1381     // don't use lw_taskteam after linking. Content was swapped.
1382 
1383     /* OMPT implicit task begin */
1384     implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
1385     if (ompt_enabled.ompt_callback_implicit_task) {
1386       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1387           ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1388           OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid));
1389     }
1390 
1391     /* OMPT state */
1392     this_thr->th.ompt_thread_info.state = omp_state_work_parallel;
1393     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = OMPT_GET_FRAME_ADDRESS(1);
1394   }
1395 #endif
1396 }
1397 
1398 /* most of the work for a fork */
1399 /* return true if we really went parallel, false if serialized */
1400 int __kmp_fork_call(ident_t *loc, int gtid,
1401                     enum fork_context_e call_context, // Intel, GNU, ...
1402                     kmp_int32 argc, microtask_t microtask, launch_t invoker,
1403 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
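/* (Presumed rationale: on these Linux targets va_list is an array/aggregate
   type, so the varargs state is passed by pointer to remain usable across the
   call boundary.) */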
1404 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1405                     va_list *ap
1406 #else
1407                     va_list ap
1408 #endif
1409                     ) {
1410   void **argv;
1411   int i;
1412   int master_tid;
1413   int master_this_cons;
1414   kmp_team_t *team;
1415   kmp_team_t *parent_team;
1416   kmp_info_t *master_th;
1417   kmp_root_t *root;
1418   int nthreads;
1419   int master_active;
1420   int master_set_numthreads;
1421   int level;
1422 #if OMP_40_ENABLED
1423   int active_level;
1424   int teams_level;
1425 #endif
1426 #if KMP_NESTED_HOT_TEAMS
1427   kmp_hot_team_ptr_t **p_hot_teams;
1428 #endif
1429   { // KMP_TIME_BLOCK
1430     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1431     KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1432 
1433     KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1434     if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
      /* Some systems prefer the stack for the root thread(s) to start with
         some gap from the parent stack to prevent false sharing. */
1437       void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1438       /* These 2 lines below are so this does not get optimized out */
1439       if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1440         __kmp_stkpadding += (short)((kmp_int64)dummy);
1441     }
1442 
1443     /* initialize if needed */
1444     KMP_DEBUG_ASSERT(
1445         __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1446     if (!TCR_4(__kmp_init_parallel))
1447       __kmp_parallel_initialize();
1448 
1449     /* setup current data */
1450     master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1451     // shutdown
1452     parent_team = master_th->th.th_team;
1453     master_tid = master_th->th.th_info.ds.ds_tid;
1454     master_this_cons = master_th->th.th_local.this_construct;
1455     root = master_th->th.th_root;
1456     master_active = root->r.r_active;
1457     master_set_numthreads = master_th->th.th_set_nproc;
1458 
1459 #if OMPT_SUPPORT
1460     ompt_data_t ompt_parallel_data;
1461     ompt_parallel_data.ptr = NULL;
1462     ompt_data_t *parent_task_data;
1463     ompt_frame_t *ompt_frame;
1464     ompt_data_t *implicit_task_data;
1465     void *return_address = NULL;
1466 
1467     if (ompt_enabled.enabled) {
1468       __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1469                                     NULL, NULL);
1470       return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1471     }
1472 #endif
1473 
1474     // Nested level will be an index in the nested nthreads array
1475     level = parent_team->t.t_level;
1476     // used to launch non-serial teams even if nested is not allowed
1477     active_level = parent_team->t.t_active_level;
1478 #if OMP_40_ENABLED
1479     // needed to check nesting inside the teams
1480     teams_level = master_th->th.th_teams_level;
1481 #endif
1482 #if KMP_NESTED_HOT_TEAMS
1483     p_hot_teams = &master_th->th.th_hot_teams;
1484     if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1485       *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1486           sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1487       (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1488       // it is either actual or not needed (when active_level > 0)
1489       (*p_hot_teams)[0].hot_team_nth = 1;
1490     }
1491 #endif
1492 
1493 #if OMPT_SUPPORT
1494     if (ompt_enabled.enabled) {
1495       if (ompt_enabled.ompt_callback_parallel_begin) {
1496         int team_size = master_set_numthreads
1497                             ? master_set_numthreads
1498                             : get__nproc_2(parent_team, master_tid);
1499         ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1500             parent_task_data, ompt_frame, &ompt_parallel_data, team_size,
1501             OMPT_INVOKER(call_context), return_address);
1502       }
1503       master_th->th.ompt_thread_info.state = omp_state_overhead;
1504     }
1505 #endif
1506 
1507     master_th->th.th_ident = loc;
1508 
1509 #if OMP_40_ENABLED
1510     if (master_th->th.th_teams_microtask && ap &&
1511         microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
      // AC: This is the start of a parallel region nested inside a teams
      // construct. The team is actual (hot), and all workers are waiting at
      // the fork barrier. No lock is needed to initialize the team a bit,
      // then release the workers.
1515       parent_team->t.t_ident = loc;
1516       __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1517       parent_team->t.t_argc = argc;
1518       argv = (void **)parent_team->t.t_argv;
1519       for (i = argc - 1; i >= 0; --i)
1520 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1521 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1522         *argv++ = va_arg(*ap, void *);
1523 #else
1524         *argv++ = va_arg(ap, void *);
1525 #endif
      // Increment our nested depth level, but do not increase the serialization
1527       if (parent_team == master_th->th.th_serial_team) {
1528         // AC: we are in serialized parallel
1529         __kmpc_serialized_parallel(loc, gtid);
1530         KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
        // AC: needed so that enquiry functions work correctly;
        // will be restored at join time
1533         parent_team->t.t_serialized--;
1534 #if OMPT_SUPPORT
1535         void *dummy;
1536         void **exit_runtime_p;
1537 
1538         ompt_lw_taskteam_t lw_taskteam;
1539 
1540         if (ompt_enabled.enabled) {
1541           __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1542                                   &ompt_parallel_data, return_address);
1543           exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_frame);
1544 
1545           __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
          // don't use lw_taskteam after linking. content was swapped
1547 
1548           /* OMPT implicit task begin */
1549           implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1550           if (ompt_enabled.ompt_callback_implicit_task) {
1551             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1552                 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1553                 implicit_task_data, 1, __kmp_tid_from_gtid(gtid));
1554           }
1555 
1556           /* OMPT state */
1557           master_th->th.ompt_thread_info.state = omp_state_work_parallel;
1558         } else {
1559           exit_runtime_p = &dummy;
1560         }
1561 #endif
1562 
1563         {
1564           KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1565           KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1566           __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1567 #if OMPT_SUPPORT
1568                                  ,
1569                                  exit_runtime_p
1570 #endif
1571                                  );
1572         }
1573 
1574 #if OMPT_SUPPORT
1575         *exit_runtime_p = NULL;
1576         if (ompt_enabled.enabled) {
1577           OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = NULL;
1578           if (ompt_enabled.ompt_callback_implicit_task) {
1579             ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1580                 ompt_scope_end, NULL, implicit_task_data, 1,
1581                 __kmp_tid_from_gtid(gtid));
1582           }
1583           __ompt_lw_taskteam_unlink(master_th);
1584 
1585           if (ompt_enabled.ompt_callback_parallel_end) {
1586             ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1587                 OMPT_CUR_TEAM_DATA(master_th), OMPT_CUR_TASK_DATA(master_th),
1588                 OMPT_INVOKER(call_context), return_address);
1589           }
1590           master_th->th.ompt_thread_info.state = omp_state_overhead;
1591         }
1592 #endif
1593         return TRUE;
1594       }
1595 
1596       parent_team->t.t_pkfn = microtask;
1597       parent_team->t.t_invoke = invoker;
1598       KMP_TEST_THEN_INC32((kmp_int32 *)&root->r.r_in_parallel);
1599       parent_team->t.t_active_level++;
1600       parent_team->t.t_level++;
1601 
1602       /* Change number of threads in the team if requested */
1603       if (master_set_numthreads) { // The parallel has num_threads clause
1604         if (master_set_numthreads < master_th->th.th_teams_size.nth) {
          // AC: can only reduce the number of threads dynamically; cannot
          // increase it
1606           kmp_info_t **other_threads = parent_team->t.t_threads;
1607           parent_team->t.t_nproc = master_set_numthreads;
1608           for (i = 0; i < master_set_numthreads; ++i) {
1609             other_threads[i]->th.th_team_nproc = master_set_numthreads;
1610           }
          // Keep the extra threads hot in the team for possible subsequent
          // parallel regions
1612         }
1613         master_th->th.th_set_nproc = 0;
1614       }
1615 
1616 #if USE_DEBUGGER
1617       if (__kmp_debugging) { // Let debugger override number of threads.
1618         int nth = __kmp_omp_num_threads(loc);
1619         if (nth > 0) { // 0 means debugger doesn't want to change num threads
1620           master_set_numthreads = nth;
1621         }
1622       }
1623 #endif
1624 
1625       KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1626                     "master_th=%p, gtid=%d\n",
1627                     root, parent_team, master_th, gtid));
1628       __kmp_internal_fork(loc, gtid, parent_team);
1629       KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1630                     "master_th=%p, gtid=%d\n",
1631                     root, parent_team, master_th, gtid));
1632 
1633       /* Invoke microtask for MASTER thread */
1634       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1635                     parent_team->t.t_id, parent_team->t.t_pkfn));
1636 
1637       {
1638         KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1639         KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1640         if (!parent_team->t.t_invoke(gtid)) {
1641           KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
1642         }
1643       }
1644       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1645                     parent_team->t.t_id, parent_team->t.t_pkfn));
1646       KMP_MB(); /* Flush all pending memory write invalidates.  */
1647 
1648       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1649 
1650       return TRUE;
1651     } // Parallel closely nested in teams construct
1652 #endif /* OMP_40_ENABLED */
1653 
1654 #if KMP_DEBUG
1655     if (__kmp_tasking_mode != tskm_immediate_exec) {
1656       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1657                        parent_team->t.t_task_team[master_th->th.th_task_state]);
1658     }
1659 #endif
1660 
1661     if (parent_team->t.t_active_level >=
1662         master_th->th.th_current_task->td_icvs.max_active_levels) {
1663       nthreads = 1;
1664     } else {
1665 #if OMP_40_ENABLED
1666       int enter_teams = ((ap == NULL && active_level == 0) ||
1667                          (ap && teams_level > 0 && teams_level == level));
1668 #endif
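      // enter_teams: this fork either starts the teams construct itself from
      // an inactive (serial) region (ap == NULL && active_level == 0), or is a
      // parallel region nested directly inside a teams construct
      // (level == teams_level).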
1669       nthreads =
1670           master_set_numthreads
1671               ? master_set_numthreads
1672               : get__nproc_2(
1673                     parent_team,
1674                     master_tid); // TODO: get nproc directly from current task
1675 
      // Check whether we need to take the forkjoin lock (not needed for a
      // serialized parallel region outside of a teams construct). This code
      // was moved here from __kmp_reserve_threads() to speed up nested
      // serialized parallel regions.
1679       if (nthreads > 1) {
1680         if ((!get__nested(master_th) && (root->r.r_in_parallel
1681 #if OMP_40_ENABLED
1682                                          && !enter_teams
1683 #endif /* OMP_40_ENABLED */
1684                                          )) ||
1685             (__kmp_library == library_serial)) {
1686           KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1687                         " threads\n",
1688                         gtid, nthreads));
1689           nthreads = 1;
1690         }
1691       }
1692       if (nthreads > 1) {
1693         /* determine how many new threads we can use */
1694         __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1695         nthreads = __kmp_reserve_threads(
1696             root, parent_team, master_tid, nthreads
1697 #if OMP_40_ENABLED
            /* AC: If we execute teams from a parallel region (on the host),
               then the teams should be created, but each can have only 1
               thread if nesting is disabled. If teams is called from a serial
               region, then the teams and their threads should be created
               regardless of the nesting setting. */
1703             ,
1704             enter_teams
1705 #endif /* OMP_40_ENABLED */
1706             );
1707         if (nthreads == 1) {
          // Release the lock for single-threaded execution here; for
          // multi-threaded execution it will be released later, after the
          // team of threads has been created and initialized
1711           __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1712         }
1713       }
1714     }
1715     KMP_DEBUG_ASSERT(nthreads > 0);
1716 
1717     // If we temporarily changed the set number of threads then restore it now
1718     master_th->th.th_set_nproc = 0;
1719 
1720     /* create a serialized parallel region? */
1721     if (nthreads == 1) {
1722 /* josh todo: hypothetical question: what do we do for OS X*? */
1723 #if KMP_OS_LINUX &&                                                            \
1724     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1725       void *args[argc];
1726 #else
1727       void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1728 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1729           KMP_ARCH_AARCH64) */
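      // args provides stack storage for the microtask arguments used by the
      // serialized invocation below (a VLA where supported, otherwise alloca).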
1730 
1731       KA_TRACE(20,
1732                ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1733 
1734       __kmpc_serialized_parallel(loc, gtid);
1735 
1736       if (call_context == fork_context_intel) {
1737         /* TODO this sucks, use the compiler itself to pass args! :) */
1738         master_th->th.th_serial_team->t.t_ident = loc;
1739 #if OMP_40_ENABLED
1740         if (!ap) {
1741           // revert change made in __kmpc_serialized_parallel()
1742           master_th->th.th_serial_team->t.t_level--;
1743 // Get args from parent team for teams construct
1744 
1745 #if OMPT_SUPPORT
1746           void *dummy;
1747           void **exit_runtime_p;
1748           ompt_task_info_t *task_info;
1749 
1750           ompt_lw_taskteam_t lw_taskteam;
1751 
1752           if (ompt_enabled.enabled) {
1753             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1754                                     &ompt_parallel_data, return_address);
1755 
1756             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
            // don't use lw_taskteam after linking. content was swapped
1758 
1759             task_info = OMPT_CUR_TASK_INFO(master_th);
1760             exit_runtime_p = &(task_info->frame.exit_frame);
1761             if (ompt_enabled.ompt_callback_implicit_task) {
1762               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1763                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1764                   &(task_info->task_data), 1, __kmp_tid_from_gtid(gtid));
1765             }
1766 
1767             /* OMPT state */
1768             master_th->th.ompt_thread_info.state = omp_state_work_parallel;
1769           } else {
1770             exit_runtime_p = &dummy;
1771           }
1772 #endif
1773 
1774           {
1775             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1776             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1777             __kmp_invoke_microtask(microtask, gtid, 0, argc,
1778                                    parent_team->t.t_argv
1779 #if OMPT_SUPPORT
1780                                    ,
1781                                    exit_runtime_p
1782 #endif
1783                                    );
1784           }
1785 
1786 #if OMPT_SUPPORT
1787           if (ompt_enabled.enabled) {
1788             exit_runtime_p = NULL;
1789             if (ompt_enabled.ompt_callback_implicit_task) {
1790               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1791                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1792                   __kmp_tid_from_gtid(gtid));
1793             }
1794 
1795             __ompt_lw_taskteam_unlink(master_th);
1796             if (ompt_enabled.ompt_callback_parallel_end) {
1797               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1798                   OMPT_CUR_TEAM_DATA(master_th), parent_task_data,
1799                   OMPT_INVOKER(call_context), return_address);
1800             }
1801             master_th->th.ompt_thread_info.state = omp_state_overhead;
1802           }
1803 #endif
1804         } else if (microtask == (microtask_t)__kmp_teams_master) {
1805           KMP_DEBUG_ASSERT(master_th->th.th_team ==
1806                            master_th->th.th_serial_team);
1807           team = master_th->th.th_team;
1808           // team->t.t_pkfn = microtask;
1809           team->t.t_invoke = invoker;
1810           __kmp_alloc_argv_entries(argc, team, TRUE);
1811           team->t.t_argc = argc;
1812           argv = (void **)team->t.t_argv;
1813           if (ap) {
1814             for (i = argc - 1; i >= 0; --i)
1815 // TODO: revert workaround for Intel(R) 64 tracker #96
1816 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1817               *argv++ = va_arg(*ap, void *);
1818 #else
1819               *argv++ = va_arg(ap, void *);
1820 #endif
1821           } else {
1822             for (i = 0; i < argc; ++i)
1823               // Get args from parent team for teams construct
1824               argv[i] = parent_team->t.t_argv[i];
1825           }
1826           // AC: revert change made in __kmpc_serialized_parallel()
1827           //     because initial code in teams should have level=0
1828           team->t.t_level--;
1829           // AC: call special invoker for outer "parallel" of teams construct
1830           {
1831             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1832             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1833             invoker(gtid);
1834           }
1835         } else {
1836 #endif /* OMP_40_ENABLED */
1837           argv = args;
1838           for (i = argc - 1; i >= 0; --i)
1839 // TODO: revert workaround for Intel(R) 64 tracker #96
1840 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1841             *argv++ = va_arg(*ap, void *);
1842 #else
            *argv++ = va_arg(ap, void *);
1844 #endif
1845           KMP_MB();
1846 
1847 #if OMPT_SUPPORT
1848           void *dummy;
1849           void **exit_runtime_p;
1850           ompt_task_info_t *task_info;
1851 
1852           ompt_lw_taskteam_t lw_taskteam;
1853 
1854           if (ompt_enabled.enabled) {
1855             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1856                                     &ompt_parallel_data, return_address);
1857             __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
            // don't use lw_taskteam after linking. content was swapped
1859             task_info = OMPT_CUR_TASK_INFO(master_th);
1860             exit_runtime_p = &(task_info->frame.exit_frame);
1861 
1862             /* OMPT implicit task begin */
1863             implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1864             if (ompt_enabled.ompt_callback_implicit_task) {
1865               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1866                   ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1867                   implicit_task_data, 1, __kmp_tid_from_gtid(gtid));
1868             }
1869 
1870             /* OMPT state */
1871             master_th->th.ompt_thread_info.state = omp_state_work_parallel;
1872           } else {
1873             exit_runtime_p = &dummy;
1874           }
1875 #endif
1876 
1877           {
1878             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1879             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1880             __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1881 #if OMPT_SUPPORT
1882                                    ,
1883                                    exit_runtime_p
1884 #endif
1885                                    );
1886           }
1887 
1888 #if OMPT_SUPPORT
1889           if (ompt_enabled.enabled) {
1890             *exit_runtime_p = NULL;
1891             if (ompt_enabled.ompt_callback_implicit_task) {
1892               ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1893                   ompt_scope_end, NULL, &(task_info->task_data), 1,
1894                   __kmp_tid_from_gtid(gtid));
1895             }
1896 
1897             ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1898             __ompt_lw_taskteam_unlink(master_th);
1899             if (ompt_enabled.ompt_callback_parallel_end) {
1900               ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1901                   &ompt_parallel_data, parent_task_data,
1902                   OMPT_INVOKER(call_context), return_address);
1903             }
1904             master_th->th.ompt_thread_info.state = omp_state_overhead;
1905           }
1906 #endif
1907 #if OMP_40_ENABLED
1908         }
1909 #endif /* OMP_40_ENABLED */
1910       } else if (call_context == fork_context_gnu) {
1911 #if OMPT_SUPPORT
1912         ompt_lw_taskteam_t lwt;
1913         __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1914                                 return_address);
1915 
1916         lwt.ompt_task_info.frame.exit_frame = NULL;
1917         __ompt_lw_taskteam_link(&lwt, master_th, 1);
// don't use lw_taskteam after linking. content was swapped
1919 #endif
1920 
1921         // we were called from GNU native code
1922         KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1923         return FALSE;
1924       } else {
1925         KMP_ASSERT2(call_context < fork_context_last,
1926                     "__kmp_fork_call: unknown fork_context parameter");
1927       }
1928 
1929       KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1930       KMP_MB();
1931       return FALSE;
1932     }
1933 
    // GEH: only modify the executing flag in the case when not serialized;
    //      the serialized case is handled in __kmpc_serialized_parallel
1936     KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1937                   "curtask=%p, curtask_max_aclevel=%d\n",
1938                   parent_team->t.t_active_level, master_th,
1939                   master_th->th.th_current_task,
1940                   master_th->th.th_current_task->td_icvs.max_active_levels));
1941     // TODO: GEH - cannot do this assertion because root thread not set up as
1942     // executing
1943     // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1944     master_th->th.th_current_task->td_flags.executing = 0;
1945 
1946 #if OMP_40_ENABLED
1947     if (!master_th->th.th_teams_microtask || level > teams_level)
1948 #endif /* OMP_40_ENABLED */
1949     {
1950       /* Increment our nested depth level */
1951       KMP_TEST_THEN_INC32((kmp_int32 *)&root->r.r_in_parallel);
1952     }
1953 
1954     // See if we need to make a copy of the ICVs.
1955     int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1956     if ((level + 1 < __kmp_nested_nth.used) &&
1957         (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1958       nthreads_icv = __kmp_nested_nth.nth[level + 1];
1959     } else {
1960       nthreads_icv = 0; // don't update
1961     }
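    // At this point, nthreads_icv > 0 means the next nesting level has an
    // explicit thread count that differs from the current nproc ICV, so a
    // private ICV copy is created below rather than reusing the task's ICVs.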
1962 
1963 #if OMP_40_ENABLED
1964     // Figure out the proc_bind_policy for the new team.
1965     kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1966     kmp_proc_bind_t proc_bind_icv =
1967         proc_bind_default; // proc_bind_default means don't update
1968     if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1969       proc_bind = proc_bind_false;
1970     } else {
1971       if (proc_bind == proc_bind_default) {
1972         // No proc_bind clause specified; use current proc-bind-var for this
1973         // parallel region
1974         proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1975       }
      /* else: The proc_bind policy was specified explicitly on the parallel
         construct. It overrides proc-bind-var for this parallel region, but
         does not change proc-bind-var itself. */
1979       // Figure the value of proc-bind-var for the child threads.
1980       if ((level + 1 < __kmp_nested_proc_bind.used) &&
1981           (__kmp_nested_proc_bind.bind_types[level + 1] !=
1982            master_th->th.th_current_task->td_icvs.proc_bind)) {
1983         proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1984       }
1985     }
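    // Summary: proc_bind is the binding applied to this region's team, while
    // proc_bind_icv (when not proc_bind_default) becomes the proc-bind-var
    // inherited by the child threads via the ICV copy below.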
1986 
1987     // Reset for next parallel region
1988     master_th->th.th_set_proc_bind = proc_bind_default;
1989 #endif /* OMP_40_ENABLED */
1990 
1991     if ((nthreads_icv > 0)
1992 #if OMP_40_ENABLED
1993         || (proc_bind_icv != proc_bind_default)
1994 #endif /* OMP_40_ENABLED */
1995             ) {
1996       kmp_internal_control_t new_icvs;
1997       copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
1998       new_icvs.next = NULL;
1999       if (nthreads_icv > 0) {
2000         new_icvs.nproc = nthreads_icv;
2001       }
2002 
2003 #if OMP_40_ENABLED
2004       if (proc_bind_icv != proc_bind_default) {
2005         new_icvs.proc_bind = proc_bind_icv;
2006       }
2007 #endif /* OMP_40_ENABLED */
2008 
2009       /* allocate a new parallel team */
2010       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2011       team = __kmp_allocate_team(root, nthreads, nthreads,
2012 #if OMPT_SUPPORT
2013                                  ompt_parallel_data,
2014 #endif
2015 #if OMP_40_ENABLED
2016                                  proc_bind,
2017 #endif
2018                                  &new_icvs, argc USE_NESTED_HOT_ARG(master_th));
2019     } else {
2020       /* allocate a new parallel team */
2021       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2022       team = __kmp_allocate_team(root, nthreads, nthreads,
2023 #if OMPT_SUPPORT
2024                                  ompt_parallel_data,
2025 #endif
2026 #if OMP_40_ENABLED
2027                                  proc_bind,
2028 #endif
2029                                  &master_th->th.th_current_task->td_icvs,
2030                                  argc USE_NESTED_HOT_ARG(master_th));
2031     }
2032     KF_TRACE(
2033         10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2034 
2035     /* setup the new team */
2036     KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2037     KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2038     KMP_CHECK_UPDATE(team->t.t_ident, loc);
2039     KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2040     KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2041 #if OMPT_SUPPORT
2042     KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2043                           return_address);
2044 #endif
2045     KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2046 // TODO: parent_team->t.t_level == INT_MAX ???
2047 #if OMP_40_ENABLED
2048     if (!master_th->th.th_teams_microtask || level > teams_level) {
2049 #endif /* OMP_40_ENABLED */
2050       int new_level = parent_team->t.t_level + 1;
2051       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2052       new_level = parent_team->t.t_active_level + 1;
2053       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2054 #if OMP_40_ENABLED
2055     } else {
2056       // AC: Do not increase parallel level at start of the teams construct
2057       int new_level = parent_team->t.t_level;
2058       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2059       new_level = parent_team->t.t_active_level;
2060       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2061     }
2062 #endif /* OMP_40_ENABLED */
2063     kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2064     // set master's schedule as new run-time schedule
2065     KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2066 
2067 #if OMP_40_ENABLED
2068     KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2069 #endif
2070 
2071     // Update the floating point rounding in the team if required.
2072     propagateFPControl(team);
2073 
2074     if (__kmp_tasking_mode != tskm_immediate_exec) {
      // Set the master's task team to the team's task team. Unless this is a
      // hot team, it should be NULL.
2077       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2078                        parent_team->t.t_task_team[master_th->th.th_task_state]);
2079       KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
2080                     "%p, new task_team %p / team %p\n",
2081                     __kmp_gtid_from_thread(master_th),
2082                     master_th->th.th_task_team, parent_team,
2083                     team->t.t_task_team[master_th->th.th_task_state], team));
2084 
2085       if (active_level || master_th->th.th_task_team) {
2086         // Take a memo of master's task_state
2087         KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2088         if (master_th->th.th_task_state_top >=
2089             master_th->th.th_task_state_stack_sz) { // increase size
2090           kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2091           kmp_uint8 *old_stack, *new_stack;
2092           kmp_uint32 i;
2093           new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2094           for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2095             new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2096           }
2097           for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2098                ++i) { // zero-init rest of stack
2099             new_stack[i] = 0;
2100           }
2101           old_stack = master_th->th.th_task_state_memo_stack;
2102           master_th->th.th_task_state_memo_stack = new_stack;
2103           master_th->th.th_task_state_stack_sz = new_size;
2104           __kmp_free(old_stack);
2105         }
2106         // Store master's task_state on stack
2107         master_th->th
2108             .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2109             master_th->th.th_task_state;
2110         master_th->th.th_task_state_top++;
2111 #if KMP_NESTED_HOT_TEAMS
2112         if (team == master_th->th.th_hot_teams[active_level].hot_team) {
2113           // Restore master's nested state if nested hot team
2114           master_th->th.th_task_state =
2115               master_th->th
2116                   .th_task_state_memo_stack[master_th->th.th_task_state_top];
2117         } else {
2118 #endif
2119           master_th->th.th_task_state = 0;
2120 #if KMP_NESTED_HOT_TEAMS
2121         }
2122 #endif
2123       }
2124 #if !KMP_NESTED_HOT_TEAMS
2125       KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2126                        (team == root->r.r_hot_team));
2127 #endif
2128     }
2129 
2130     KA_TRACE(
2131         20,
2132         ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2133          gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2134          team->t.t_nproc));
2135     KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2136                      (team->t.t_master_tid == 0 &&
2137                       (team->t.t_parent == root->r.r_root_team ||
2138                        team->t.t_parent->t.t_serialized)));
2139     KMP_MB();
2140 
2141     /* now, setup the arguments */
2142     argv = (void **)team->t.t_argv;
2143 #if OMP_40_ENABLED
2144     if (ap) {
2145 #endif /* OMP_40_ENABLED */
2146       for (i = argc - 1; i >= 0; --i) {
2147 // TODO: revert workaround for Intel(R) 64 tracker #96
2148 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
2149         void *new_argv = va_arg(*ap, void *);
2150 #else
        void *new_argv = va_arg(ap, void *);
2152 #endif
2153         KMP_CHECK_UPDATE(*argv, new_argv);
2154         argv++;
2155       }
2156 #if OMP_40_ENABLED
2157     } else {
2158       for (i = 0; i < argc; ++i) {
2159         // Get args from parent team for teams construct
2160         KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2161       }
2162     }
2163 #endif /* OMP_40_ENABLED */
2164 
2165     /* now actually fork the threads */
2166     KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2167     if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2168       root->r.r_active = TRUE;
2169 
2170     __kmp_fork_team_threads(root, team, master_th, gtid);
2171     __kmp_setup_icv_copy(team, nthreads,
2172                          &master_th->th.th_current_task->td_icvs, loc);
2173 
2174 #if OMPT_SUPPORT
2175     master_th->th.ompt_thread_info.state = omp_state_work_parallel;
2176 #endif
2177 
2178     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2179 
2180 #if USE_ITT_BUILD
2181     if (team->t.t_active_level == 1 // only report frames at level 1
2182 #if OMP_40_ENABLED
2183         && !master_th->th.th_teams_microtask // not in teams construct
2184 #endif /* OMP_40_ENABLED */
2185         ) {
2186 #if USE_ITT_NOTIFY
2187       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2188           (__kmp_forkjoin_frames_mode == 3 ||
2189            __kmp_forkjoin_frames_mode == 1)) {
2190         kmp_uint64 tmp_time = 0;
2191         if (__itt_get_timestamp_ptr)
2192           tmp_time = __itt_get_timestamp();
2193         // Internal fork - report frame begin
2194         master_th->th.th_frame_time = tmp_time;
2195         if (__kmp_forkjoin_frames_mode == 3)
2196           team->t.t_region_time = tmp_time;
2197       } else
2198 // only one notification scheme (either "submit" or "forking/joined", not both)
2199 #endif /* USE_ITT_NOTIFY */
2200           if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2201               __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2202         // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2203         __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2204       }
2205     }
2206 #endif /* USE_ITT_BUILD */
2207 
2208     /* now go on and do the work */
2209     KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2210     KMP_MB();
2211     KF_TRACE(10,
2212              ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2213               root, team, master_th, gtid));
2214 
2215 #if USE_ITT_BUILD
2216     if (__itt_stack_caller_create_ptr) {
2217       team->t.t_stack_id =
2218           __kmp_itt_stack_caller_create(); // create new stack stitching id
2219       // before entering fork barrier
2220     }
2221 #endif /* USE_ITT_BUILD */
2222 
2223 #if OMP_40_ENABLED
    // AC: skip __kmp_internal_fork for the teams construct; let only the
    // master threads execute
2226     if (ap)
2227 #endif /* OMP_40_ENABLED */
2228     {
2229       __kmp_internal_fork(loc, gtid, team);
2230       KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2231                     "master_th=%p, gtid=%d\n",
2232                     root, team, master_th, gtid));
2233     }
2234 
2235     if (call_context == fork_context_gnu) {
2236       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2237       return TRUE;
2238     }
2239 
2240     /* Invoke microtask for MASTER thread */
2241     KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2242                   team->t.t_id, team->t.t_pkfn));
2243   } // END of timer KMP_fork_call block
2244 
2245   {
2246     KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
2247     KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
2248     if (!team->t.t_invoke(gtid)) {
2249       KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
2250     }
2251   }
2252   KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2253                 team->t.t_id, team->t.t_pkfn));
2254   KMP_MB(); /* Flush all pending memory write invalidates.  */
2255 
2256   KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2257 
2258 #if OMPT_SUPPORT
2259   if (ompt_enabled.enabled) {
2260     master_th->th.ompt_thread_info.state = omp_state_overhead;
2261   }
2262 #endif
2263 
2264   return TRUE;
2265 }
2266 
2267 #if OMPT_SUPPORT
2268 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2269                                             kmp_team_t *team) {
2270   // restore state outside the region
2271   thread->th.ompt_thread_info.state =
2272       ((team->t.t_serialized) ? omp_state_work_serial
2273                               : omp_state_work_parallel);
2274 }
2275 
2276 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2277                                    kmp_team_t *team, ompt_data_t *parallel_data,
2278                                    fork_context_e fork_context, void *codeptr) {
2279   ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2280   if (ompt_enabled.ompt_callback_parallel_end) {
2281     ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2282         parallel_data, &(task_info->task_data), OMPT_INVOKER(fork_context),
2283         codeptr);
2284   }
2285 
2286   task_info->frame.enter_frame = NULL;
2287   __kmp_join_restore_state(thread, team);
2288 }
2289 #endif
2290 
2291 void __kmp_join_call(ident_t *loc, int gtid
2292 #if OMPT_SUPPORT
2293                      ,
2294                      enum fork_context_e fork_context
2295 #endif
2296 #if OMP_40_ENABLED
2297                      ,
2298                      int exit_teams
2299 #endif /* OMP_40_ENABLED */
2300                      ) {
2301   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2302   kmp_team_t *team;
2303   kmp_team_t *parent_team;
2304   kmp_info_t *master_th;
2305   kmp_root_t *root;
2306   int master_active;
2307   int i;
2308 
2309   KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2310 
2311   /* setup current data */
2312   master_th = __kmp_threads[gtid];
2313   root = master_th->th.th_root;
2314   team = master_th->th.th_team;
2315   parent_team = team->t.t_parent;
2316 
2317   master_th->th.th_ident = loc;
2318 
2319 #if OMPT_SUPPORT
2320   if (ompt_enabled.enabled) {
2321     master_th->th.ompt_thread_info.state = omp_state_overhead;
2322   }
2323 #endif
2324 
2325 #if KMP_DEBUG
2326   if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2327     KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2328                   "th_task_team = %p\n",
2329                   __kmp_gtid_from_thread(master_th), team,
2330                   team->t.t_task_team[master_th->th.th_task_state],
2331                   master_th->th.th_task_team));
2332     KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2333                      team->t.t_task_team[master_th->th.th_task_state]);
2334   }
2335 #endif
2336 
2337   if (team->t.t_serialized) {
2338 #if OMP_40_ENABLED
2339     if (master_th->th.th_teams_microtask) {
2340       // We are in teams construct
2341       int level = team->t.t_level;
2342       int tlevel = master_th->th.th_teams_level;
2343       if (level == tlevel) {
        // AC: we haven't incremented it earlier at the start of the teams
        //     construct, so do it here, at the end of the teams construct
2346         team->t.t_level++;
2347       } else if (level == tlevel + 1) {
        // AC: we are exiting a parallel region inside teams; increment the
        // serialization count so it can be restored in the next call to
        // __kmpc_end_serialized_parallel
2351         team->t.t_serialized++;
2352       }
2353     }
2354 #endif /* OMP_40_ENABLED */
2355     __kmpc_end_serialized_parallel(loc, gtid);
2356 
2357 #if OMPT_SUPPORT
2358     if (ompt_enabled.enabled) {
2359       __kmp_join_restore_state(master_th, parent_team);
2360     }
2361 #endif
2362 
2363     return;
2364   }
2365 
2366   master_active = team->t.t_master_active;
2367 
2368 #if OMP_40_ENABLED
2369   if (!exit_teams)
2370 #endif /* OMP_40_ENABLED */
2371   {
    // AC: No barrier for the internal teams at exit from the teams construct,
    //     but there is a barrier for the external team (the league).
2374     __kmp_internal_join(loc, gtid, team);
2375   }
2376 #if OMP_40_ENABLED
2377   else {
2378     master_th->th.th_task_state =
2379         0; // AC: no tasking in teams (out of any parallel)
2380   }
2381 #endif /* OMP_40_ENABLED */
2382 
2383   KMP_MB();
2384 
2385 #if OMPT_SUPPORT
2386   ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2387   void *codeptr = team->t.ompt_team_info.master_return_address;
2388 #endif
2389 
2390 #if USE_ITT_BUILD
2391   if (__itt_stack_caller_create_ptr) {
2392     __kmp_itt_stack_caller_destroy(
2393         (__itt_caller)team->t
2394             .t_stack_id); // destroy the stack stitching id after join barrier
2395   }
2396 
2397   // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2398   if (team->t.t_active_level == 1
2399 #if OMP_40_ENABLED
2400       && !master_th->th.th_teams_microtask /* not in teams construct */
2401 #endif /* OMP_40_ENABLED */
2402       ) {
2403     master_th->th.th_ident = loc;
2404     // only one notification scheme (either "submit" or "forking/joined", not
2405     // both)
2406     if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2407         __kmp_forkjoin_frames_mode == 3)
2408       __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2409                              master_th->th.th_frame_time, 0, loc,
2410                              master_th->th.th_team_nproc, 1);
2411     else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2412              !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2413       __kmp_itt_region_joined(gtid);
2414   } // active_level == 1
2415 #endif /* USE_ITT_BUILD */
2416 
2417 #if OMP_40_ENABLED
2418   if (master_th->th.th_teams_microtask && !exit_teams &&
2419       team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2420       team->t.t_level == master_th->th.th_teams_level + 1) {
    // AC: We need to leave the team structure intact at the end of a parallel
    // region inside the teams construct, so that the same (hot) team is reused
    // at the next parallel region; only adjust the nesting levels
2424 
2425     /* Decrement our nested depth level */
2426     team->t.t_level--;
2427     team->t.t_active_level--;
2428     KMP_TEST_THEN_DEC32((kmp_int32 *)&root->r.r_in_parallel);
2429 
2430     /* Restore number of threads in the team if needed */
2431     if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2432       int old_num = master_th->th.th_team_nproc;
2433       int new_num = master_th->th.th_teams_size.nth;
2434       kmp_info_t **other_threads = team->t.t_threads;
2435       team->t.t_nproc = new_num;
2436       for (i = 0; i < old_num; ++i) {
2437         other_threads[i]->th.th_team_nproc = new_num;
2438       }
      // Adjust the states of the unused threads of the team
2440       for (i = old_num; i < new_num; ++i) {
2441         // Re-initialize thread's barrier data.
2442         int b;
2443         kmp_balign_t *balign = other_threads[i]->th.th_bar;
2444         for (b = 0; b < bs_last_barrier; ++b) {
2445           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2446           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2447 #if USE_DEBUGGER
2448           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2449 #endif
2450         }
2451         if (__kmp_tasking_mode != tskm_immediate_exec) {
2452           // Synchronize thread's task state
2453           other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2454         }
2455       }
2456     }
2457 
2458 #if OMPT_SUPPORT
2459     if (ompt_enabled.enabled) {
2460       __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
2461                       codeptr);
2462     }
2463 #endif
2464 
2465     return;
2466   }
2467 #endif /* OMP_40_ENABLED */
2468 
2469   /* do cleanup and restore the parent team */
2470   master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2471   master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2472 
2473   master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2474 
2475   /* jc: The following lock has instructions with REL and ACQ semantics,
2476      separating the parallel user code called in this parallel region
2477      from the serial user code called after this function returns. */
2478   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2479 
2480 #if OMP_40_ENABLED
2481   if (!master_th->th.th_teams_microtask ||
2482       team->t.t_level > master_th->th.th_teams_level)
2483 #endif /* OMP_40_ENABLED */
2484   {
2485     /* Decrement our nested depth level */
2486     KMP_TEST_THEN_DEC32((kmp_int32 *)&root->r.r_in_parallel);
2487   }
2488   KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2489 
2490 #if OMPT_SUPPORT
2491   if (ompt_enabled.enabled) {
2492     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2493     if (ompt_enabled.ompt_callback_implicit_task) {
2494       int ompt_team_size = team->t.t_nproc;
2495       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2496           ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2497           __kmp_tid_from_gtid(gtid));
2498     }
2499 
2500     task_info->frame.exit_frame = NULL;
2501     task_info->task_data = ompt_data_none;
2502   }
2503 #endif
2504 
2505   KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2506                 master_th, team));
2507   __kmp_pop_current_task_from_thread(master_th);
2508 
2509 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
2510   // Restore master thread's partition.
2511   master_th->th.th_first_place = team->t.t_first_place;
2512   master_th->th.th_last_place = team->t.t_last_place;
2513 #endif /* OMP_40_ENABLED */
2514 
2515   updateHWFPControl(team);
2516 
2517   if (root->r.r_active != master_active)
2518     root->r.r_active = master_active;
2519 
2520   __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2521                             master_th)); // this will free worker threads
2522 
  /* This race was fun to find. Make sure the following is in the critical
     region; otherwise, assertions may fail occasionally since the old team may
     be reallocated and the hierarchy would appear inconsistent. It is actually
     safe to run and won't cause any bugs, but it will cause those assertion
     failures. It's only one deref & assign, so we might as well put it in the
     critical region. */
2528   master_th->th.th_team = parent_team;
2529   master_th->th.th_team_nproc = parent_team->t.t_nproc;
2530   master_th->th.th_team_master = parent_team->t.t_threads[0];
2531   master_th->th.th_team_serialized = parent_team->t.t_serialized;
2532 
2533   /* restore serialized team, if need be */
2534   if (parent_team->t.t_serialized &&
2535       parent_team != master_th->th.th_serial_team &&
2536       parent_team != root->r.r_root_team) {
2537     __kmp_free_team(root,
2538                     master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2539     master_th->th.th_serial_team = parent_team;
2540   }
2541 
2542   if (__kmp_tasking_mode != tskm_immediate_exec) {
2543     if (master_th->th.th_task_state_top >
2544         0) { // Restore task state from memo stack
2545       KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2546       // Remember master's state if we re-use this nested hot team
2547       master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2548           master_th->th.th_task_state;
2549       --master_th->th.th_task_state_top; // pop
2550       // Now restore state at this level
2551       master_th->th.th_task_state =
2552           master_th->th
2553               .th_task_state_memo_stack[master_th->th.th_task_state_top];
2554     }
2555     // Copy the task team from the parent team to the master thread
2556     master_th->th.th_task_team =
2557         parent_team->t.t_task_team[master_th->th.th_task_state];
2558     KA_TRACE(20,
2559              ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2560               __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2561               parent_team));
2562   }
2563 
2564   // TODO: GEH - cannot do this assertion because root thread not set up as
2565   // executing
2566   // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2567   master_th->th.th_current_task->td_flags.executing = 1;
2568 
2569   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2570 
2571 #if OMPT_SUPPORT
2572   if (ompt_enabled.enabled) {
2573     __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
2574                     codeptr);
2575   }
2576 #endif
2577 
2578   KMP_MB();
2579   KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2580 }
2581 
2582 /* Check whether we should push an internal control record onto the
2583    serial team stack.  If so, do it.  */
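/* For example, the first ICV-modifying call (such as omp_set_num_threads())
   made at a deeper serialized nesting level pushes one record here, so that
   the enclosing level's ICVs can be restored when the serialized region ends
   (presumably in __kmpc_end_serialized_parallel). */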
2584 void __kmp_save_internal_controls(kmp_info_t *thread) {
2585 
2586   if (thread->th.th_team != thread->th.th_serial_team) {
2587     return;
2588   }
2589   if (thread->th.th_team->t.t_serialized > 1) {
2590     int push = 0;
2591 
2592     if (thread->th.th_team->t.t_control_stack_top == NULL) {
2593       push = 1;
2594     } else {
2595       if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2596           thread->th.th_team->t.t_serialized) {
2597         push = 1;
2598       }
2599     }
2600     if (push) { /* push a record on the serial team's stack */
2601       kmp_internal_control_t *control =
2602           (kmp_internal_control_t *)__kmp_allocate(
2603               sizeof(kmp_internal_control_t));
2604 
2605       copy_icvs(control, &thread->th.th_current_task->td_icvs);
2606 
2607       control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2608 
2609       control->next = thread->th.th_team->t.t_control_stack_top;
2610       thread->th.th_team->t.t_control_stack_top = control;
2611     }
2612   }
2613 }
2614 
2615 /* Changes set_nproc */
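/* Presumably reached from the omp_set_num_threads() entry point: the value is
   clamped to [1, __kmp_max_nth], saved on the serial team's control stack if
   needed, and stored as the calling thread's nproc ICV; the root's hot team
   may also be shrunk immediately (see below). */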
2616 void __kmp_set_num_threads(int new_nth, int gtid) {
2617   kmp_info_t *thread;
2618   kmp_root_t *root;
2619 
2620   KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2621   KMP_DEBUG_ASSERT(__kmp_init_serial);
2622 
2623   if (new_nth < 1)
2624     new_nth = 1;
2625   else if (new_nth > __kmp_max_nth)
2626     new_nth = __kmp_max_nth;
2627 
2628   KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2629   thread = __kmp_threads[gtid];
2630 
2631   __kmp_save_internal_controls(thread);
2632 
2633   set__nproc(thread, new_nth);
2634 
2635   // If this omp_set_num_threads() call will cause the hot team size to be
2636   // reduced (in the absence of a num_threads clause), then reduce it now,
2637   // rather than waiting for the next parallel region.
2638   root = thread->th.th_root;
2639   if (__kmp_init_parallel && (!root->r.r_active) &&
2640       (root->r.r_hot_team->t.t_nproc > new_nth)
2641 #if KMP_NESTED_HOT_TEAMS
2642       && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2643 #endif
2644       ) {
2645     kmp_team_t *hot_team = root->r.r_hot_team;
2646     int f;
2647 
2648     __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2649 
2650     // Release the extra threads we don't need any more.
2651     for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2652       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2653       if (__kmp_tasking_mode != tskm_immediate_exec) {
        // When decreasing the team size, threads that are no longer in the
        // team should unreference the task team.
2656         hot_team->t.t_threads[f]->th.th_task_team = NULL;
2657       }
2658       __kmp_free_thread(hot_team->t.t_threads[f]);
2659       hot_team->t.t_threads[f] = NULL;
2660     }
2661     hot_team->t.t_nproc = new_nth;
2662 #if KMP_NESTED_HOT_TEAMS
2663     if (thread->th.th_hot_teams) {
2664       KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2665       thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2666     }
2667 #endif
2668 
2669     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2670 
2671     // Update the t_nproc field in the threads that are still active.
2672     for (f = 0; f < new_nth; f++) {
2673       KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2674       hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2675     }
    // Special flag to indicate that the team size was changed via an
    // omp_set_num_threads() call
2677     hot_team->t.t_size_changed = -1;
2678   }
2679 }
2680 
2681 /* Changes max_active_levels */
2682 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2683   kmp_info_t *thread;
2684 
2685   KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2686                 "%d = (%d)\n",
2687                 gtid, max_active_levels));
2688   KMP_DEBUG_ASSERT(__kmp_init_serial);
2689 
2690   // validate max_active_levels
2691   if (max_active_levels < 0) {
2692     KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2693     // We ignore this call if the user has specified a negative value.
2694     // The current setting won't be changed. The last valid setting will be
2695     // used. A warning will be issued (if warnings are allowed as controlled by
2696     // the KMP_WARNINGS env var).
2697     KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2698                   "max_active_levels for thread %d = (%d)\n",
2699                   gtid, max_active_levels));
2700     return;
2701   }
2702   if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2703     // it's OK, the max_active_levels is within the valid range: [ 0;
2704     // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2705     // We allow a zero value. (implementation defined behavior)
2706   } else {
2707     KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2708                 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2709     max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
    // Current upper limit is MAX_INT. (implementation defined behavior)
    // If the input exceeds the upper limit, we correct the input to be the
    // upper limit. (implementation defined behavior)
    // In practice, the flow should never get here as long as the limit is
    // MAX_INT.
2714   }
2715   KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2716                 "max_active_levels for thread %d = (%d)\n",
2717                 gtid, max_active_levels));
2718 
2719   thread = __kmp_threads[gtid];
2720 
2721   __kmp_save_internal_controls(thread);
2722 
2723   set__max_active_levels(thread, max_active_levels);
2724 }
2725 
2726 /* Gets max_active_levels */
2727 int __kmp_get_max_active_levels(int gtid) {
2728   kmp_info_t *thread;
2729 
2730   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2731   KMP_DEBUG_ASSERT(__kmp_init_serial);
2732 
2733   thread = __kmp_threads[gtid];
2734   KMP_DEBUG_ASSERT(thread->th.th_current_task);
2735   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2736                 "curtask_maxaclevel=%d\n",
2737                 gtid, thread->th.th_current_task,
2738                 thread->th.th_current_task->td_icvs.max_active_levels));
2739   return thread->th.th_current_task->td_icvs.max_active_levels;
2740 }
2741 
2742 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2743 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2744   kmp_info_t *thread;
2745   //    kmp_team_t *team;
2746 
2747   KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2748                 gtid, (int)kind, chunk));
2749   KMP_DEBUG_ASSERT(__kmp_init_serial);
2750 
2751   // Check if the kind parameter is valid, correct if needed.
2752   // Valid parameters should fit in one of two intervals - standard or extended:
2753   //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2754   // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
2755   if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2756       (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2757     // TODO: Hint needs attention in case we change the default schedule.
2758     __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2759               KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2760               __kmp_msg_null);
2761     kind = kmp_sched_default;
2762     chunk = 0; // ignore chunk value in case of bad kind
2763   }
2764 
2765   thread = __kmp_threads[gtid];
2766 
2767   __kmp_save_internal_controls(thread);
2768 
2769   if (kind < kmp_sched_upper_std) {
2770     if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
      // Differentiate static chunked vs. unchunked: an invalid (below-default)
      // chunk indicates the unchunked schedule, which is the default.
2773       thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2774     } else {
2775       thread->th.th_current_task->td_icvs.sched.r_sched_type =
2776           __kmp_sch_map[kind - kmp_sched_lower - 1];
2777     }
2778   } else {
2779     //    __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2780     //    kmp_sched_lower - 2 ];
2781     thread->th.th_current_task->td_icvs.sched.r_sched_type =
2782         __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2783                       kmp_sched_lower - 2];
2784   }
2785   if (kind == kmp_sched_auto || chunk < 1) {
2786     // ignore parameter chunk for schedule auto
2787     thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2788   } else {
2789     thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2790   }
2791 }
2792 
2793 /* Gets def_sched_var ICV values */
2794 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2795   kmp_info_t *thread;
2796   enum sched_type th_type;
2797 
2798   KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2799   KMP_DEBUG_ASSERT(__kmp_init_serial);
2800 
2801   thread = __kmp_threads[gtid];
2802 
2803   th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2804 
2805   switch (th_type) {
2806   case kmp_sch_static:
2807   case kmp_sch_static_greedy:
2808   case kmp_sch_static_balanced:
2809     *kind = kmp_sched_static;
    *chunk = 0; // chunk was not set; report zero to convey that fact
2811     return;
2812   case kmp_sch_static_chunked:
2813     *kind = kmp_sched_static;
2814     break;
2815   case kmp_sch_dynamic_chunked:
2816     *kind = kmp_sched_dynamic;
2817     break;
2818   case kmp_sch_guided_chunked:
2819   case kmp_sch_guided_iterative_chunked:
2820   case kmp_sch_guided_analytical_chunked:
2821     *kind = kmp_sched_guided;
2822     break;
2823   case kmp_sch_auto:
2824     *kind = kmp_sched_auto;
2825     break;
2826   case kmp_sch_trapezoidal:
2827     *kind = kmp_sched_trapezoidal;
2828     break;
2829 #if KMP_STATIC_STEAL_ENABLED
2830   case kmp_sch_static_steal:
2831     *kind = kmp_sched_static_steal;
2832     break;
2833 #endif
2834   default:
2835     KMP_FATAL(UnknownSchedulingType, th_type);
2836   }
2837 
2838   *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2839 }
2840 
2841 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2842 
2843   int ii, dd;
2844   kmp_team_t *team;
2845   kmp_info_t *thr;
2846 
2847   KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2848   KMP_DEBUG_ASSERT(__kmp_init_serial);
2849 
2850   // validate level
2851   if (level == 0)
2852     return 0;
2853   if (level < 0)
2854     return -1;
2855   thr = __kmp_threads[gtid];
2856   team = thr->th.th_team;
2857   ii = team->t.t_level;
2858   if (level > ii)
2859     return -1;
2860 
2861 #if OMP_40_ENABLED
2862   if (thr->th.th_teams_microtask) {
    // AC: we are in a teams region where multiple nested teams have the same
    // level
2864     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2865     if (level <=
2866         tlevel) { // otherwise usual algorithm works (will not touch the teams)
2867       KMP_DEBUG_ASSERT(ii >= tlevel);
2868       // AC: As we need to pass by the teams league, we need to artificially
2869       // increase ii
      if (ii == tlevel) {
        ii += 2; // three teams have the same level
      } else {
        ii++; // two teams have the same level
      }
2875     }
2876   }
2877 #endif
2878 
2879   if (ii == level)
2880     return __kmp_tid_from_gtid(gtid);
2881 
2882   dd = team->t.t_serialized;
2883   level++;
2884   while (ii > level) {
2885     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2886     }
2887     if ((team->t.t_serialized) && (!dd)) {
2888       team = team->t.t_parent;
2889       continue;
2890     }
2891     if (ii > level) {
2892       team = team->t.t_parent;
2893       dd = team->t.t_serialized;
2894       ii--;
2895     }
2896   }
2897 
2898   return (dd > 1) ? (0) : (team->t.t_master_tid);
2899 }
2900 
2901 int __kmp_get_team_size(int gtid, int level) {
2902 
2903   int ii, dd;
2904   kmp_team_t *team;
2905   kmp_info_t *thr;
2906 
2907   KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2908   KMP_DEBUG_ASSERT(__kmp_init_serial);
2909 
2910   // validate level
2911   if (level == 0)
2912     return 1;
2913   if (level < 0)
2914     return -1;
2915   thr = __kmp_threads[gtid];
2916   team = thr->th.th_team;
2917   ii = team->t.t_level;
2918   if (level > ii)
2919     return -1;
2920 
2921 #if OMP_40_ENABLED
2922   if (thr->th.th_teams_microtask) {
    // AC: we are in a teams region where multiple nested teams have the same
    // level
2924     int tlevel = thr->th.th_teams_level; // the level of the teams construct
2925     if (level <=
2926         tlevel) { // otherwise usual algorithm works (will not touch the teams)
2927       KMP_DEBUG_ASSERT(ii >= tlevel);
2928       // AC: As we need to pass by the teams league, we need to artificially
2929       // increase ii
      if (ii == tlevel) {
        ii += 2; // three teams have the same level
      } else {
        ii++; // two teams have the same level
      }
2935     }
2936   }
2937 #endif
2938 
2939   while (ii > level) {
2940     for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2941     }
2942     if (team->t.t_serialized && (!dd)) {
2943       team = team->t.t_parent;
2944       continue;
2945     }
2946     if (ii > level) {
2947       team = team->t.t_parent;
2948       ii--;
2949     }
2950   }
2951 
2952   return team->t.t_nproc;
2953 }
2954 
2955 kmp_r_sched_t __kmp_get_schedule_global() {
  // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
  // independently, so the up-to-date schedule can be obtained here.
2959 
2960   kmp_r_sched_t r_sched;
2961 
  // Create the schedule from 4 globals: __kmp_sched, __kmp_chunk,
  // __kmp_static, __kmp_guided. __kmp_sched keeps its original value, so that
  // the user can set KMP_SCHEDULE multiple times and thus have different
  // run-time schedules in different roots (even in OMP 2.5)
2966   if (__kmp_sched == kmp_sch_static) {
2967     // replace STATIC with more detailed schedule (balanced or greedy)
2968     r_sched.r_sched_type = __kmp_static;
2969   } else if (__kmp_sched == kmp_sch_guided_chunked) {
2970     // replace GUIDED with more detailed schedule (iterative or analytical)
2971     r_sched.r_sched_type = __kmp_guided;
2972   } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2973     r_sched.r_sched_type = __kmp_sched;
2974   }
2975 
2976   if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
    // __kmp_chunk may be wrong here (if it was never set)
2978     r_sched.chunk = KMP_DEFAULT_CHUNK;
2979   } else {
2980     r_sched.chunk = __kmp_chunk;
2981   }
2982 
2983   return r_sched;
2984 }
2985 
/* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
   at least argc *t_argv entries for the requested team. */
2988 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
2989 
2990   KMP_DEBUG_ASSERT(team);
2991   if (!realloc || argc > team->t.t_max_argc) {
2992 
2993     KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
2994                    "current entries=%d\n",
2995                    team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
2996     /* if previously allocated heap space for args, free them */
2997     if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
2998       __kmp_free((void *)team->t.t_argv);
2999 
3000     if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3001       /* use unused space in the cache line for arguments */
3002       team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3003       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3004                      "argv entries\n",
3005                      team->t.t_id, team->t.t_max_argc));
3006       team->t.t_argv = &team->t.t_inline_argv[0];
3007       if (__kmp_storage_map) {
3008         __kmp_print_storage_map_gtid(
3009             -1, &team->t.t_inline_argv[0],
3010             &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3011             (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3012             team->t.t_id);
3013       }
3014     } else {
3015       /* allocate space for arguments in the heap */
3016       team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3017                                ? KMP_MIN_MALLOC_ARGV_ENTRIES
3018                                : 2 * argc;
3019       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3020                      "argv entries\n",
3021                      team->t.t_id, team->t.t_max_argc));
3022       team->t.t_argv =
3023           (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3024       if (__kmp_storage_map) {
3025         __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3026                                      &team->t.t_argv[team->t.t_max_argc],
3027                                      sizeof(void *) * team->t.t_max_argc,
3028                                      "team_%d.t_argv", team->t.t_id);
3029       }
3030     }
3031   }
3032 }
3033 
3034 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3035   int i;
3036   int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3037   team->t.t_threads =
3038       (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3039   team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3040       sizeof(dispatch_shared_info_t) * num_disp_buff);
3041   team->t.t_dispatch =
3042       (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3043   team->t.t_implicit_task_taskdata =
3044       (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3045   team->t.t_max_nproc = max_nth;
3046 
3047   /* setup dispatch buffers */
3048   for (i = 0; i < num_disp_buff; ++i) {
3049     team->t.t_disp_buffer[i].buffer_index = i;
3050 #if OMP_45_ENABLED
3051     team->t.t_disp_buffer[i].doacross_buf_idx = i;
3052 #endif
3053   }
3054 }
3055 
3056 static void __kmp_free_team_arrays(kmp_team_t *team) {
3057   /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3058   int i;
3059   for (i = 0; i < team->t.t_max_nproc; ++i) {
3060     if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3061       __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3062       team->t.t_dispatch[i].th_disp_buffer = NULL;
3063     }
3064   }
3065   __kmp_free(team->t.t_threads);
3066   __kmp_free(team->t.t_disp_buffer);
3067   __kmp_free(team->t.t_dispatch);
3068   __kmp_free(team->t.t_implicit_task_taskdata);
3069   team->t.t_threads = NULL;
3070   team->t.t_disp_buffer = NULL;
3071   team->t.t_dispatch = NULL;
3072   team->t.t_implicit_task_taskdata = 0;
3073 }
3074 
3075 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3076   kmp_info_t **oldThreads = team->t.t_threads;
3077 
3078   __kmp_free(team->t.t_disp_buffer);
3079   __kmp_free(team->t.t_dispatch);
3080   __kmp_free(team->t.t_implicit_task_taskdata);
3081   __kmp_allocate_team_arrays(team, max_nth);
3082 
3083   KMP_MEMCPY(team->t.t_threads, oldThreads,
3084              team->t.t_nproc * sizeof(kmp_info_t *));
3085 
3086   __kmp_free(oldThreads);
3087 }
3088 
3089 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3090 
3091   kmp_r_sched_t r_sched =
3092       __kmp_get_schedule_global(); // get current state of scheduling globals
3093 
3094 #if OMP_40_ENABLED
3095   KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3096 #endif /* OMP_40_ENABLED */
3097 
3098   kmp_internal_control_t g_icvs = {
3099     0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3100     (kmp_int8)__kmp_dflt_nested, // int nested; //internal control
3101     // for nested parallelism (per thread)
3102     (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3103     // adjustment of threads (per thread)
3104     (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3105     // whether blocktime is explicitly set
3106     __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3107 #if KMP_USE_MONITOR
3108     __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3109 // intervals
3110 #endif
3111     __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3112     // next parallel region (per thread)
3113     // (use a max ub on value if __kmp_parallel_initialize not called yet)
3114     __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3115     // for max_active_levels
3116     r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3117 // {sched,chunk} pair
3118 #if OMP_40_ENABLED
3119     __kmp_nested_proc_bind.bind_types[0],
3120     __kmp_default_device,
3121 #endif /* OMP_40_ENABLED */
3122     NULL // struct kmp_internal_control *next;
3123   };
3124 
3125   return g_icvs;
3126 }
3127 
3128 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3129 
3130   kmp_internal_control_t gx_icvs;
3131   gx_icvs.serial_nesting_level =
3132       0; // probably =team->t.t_serial like in save_inter_controls
3133   copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3134   gx_icvs.next = NULL;
3135 
3136   return gx_icvs;
3137 }
3138 
3139 static void __kmp_initialize_root(kmp_root_t *root) {
3140   int f;
3141   kmp_team_t *root_team;
3142   kmp_team_t *hot_team;
3143   int hot_team_max_nth;
3144   kmp_r_sched_t r_sched =
3145       __kmp_get_schedule_global(); // get current state of scheduling globals
3146   kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3147   KMP_DEBUG_ASSERT(root);
3148   KMP_ASSERT(!root->r.r_begin);
3149 
3150   /* setup the root state structure */
3151   __kmp_init_lock(&root->r.r_begin_lock);
3152   root->r.r_begin = FALSE;
3153   root->r.r_active = FALSE;
3154   root->r.r_in_parallel = 0;
3155   root->r.r_blocktime = __kmp_dflt_blocktime;
3156   root->r.r_nested = __kmp_dflt_nested;
3157   root->r.r_cg_nthreads = 1;
3158 
3159   /* setup the root team for this task */
3160   /* allocate the root team structure */
3161   KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3162 
3163   root_team =
3164       __kmp_allocate_team(root,
3165                           1, // new_nproc
3166                           1, // max_nproc
3167 #if OMPT_SUPPORT
3168                           ompt_data_none, // root parallel id
3169 #endif
3170 #if OMP_40_ENABLED
3171                           __kmp_nested_proc_bind.bind_types[0],
3172 #endif
3173                           &r_icvs,
3174                           0 // argc
3175                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3176                           );
3177 #if USE_DEBUGGER
3178   // Non-NULL value should be assigned to make the debugger display the root
3179   // team.
3180   TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3181 #endif
3182 
3183   KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3184 
3185   root->r.r_root_team = root_team;
3186   root_team->t.t_control_stack_top = NULL;
3187 
3188   /* initialize root team */
3189   root_team->t.t_threads[0] = NULL;
3190   root_team->t.t_nproc = 1;
3191   root_team->t.t_serialized = 1;
3192   // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3193   root_team->t.t_sched.sched = r_sched.sched;
3194   KA_TRACE(
3195       20,
3196       ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3197        root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3198 
3199   /* setup the  hot team for this task */
3200   /* allocate the hot team structure */
3201   KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3202 
3203   hot_team =
3204       __kmp_allocate_team(root,
3205                           1, // new_nproc
3206                           __kmp_dflt_team_nth_ub * 2, // max_nproc
3207 #if OMPT_SUPPORT
3208                           ompt_data_none, // root parallel id
3209 #endif
3210 #if OMP_40_ENABLED
3211                           __kmp_nested_proc_bind.bind_types[0],
3212 #endif
3213                           &r_icvs,
3214                           0 // argc
3215                           USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3216                           );
3217   KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3218 
3219   root->r.r_hot_team = hot_team;
3220   root_team->t.t_control_stack_top = NULL;
3221 
3222   /* first-time initialization */
3223   hot_team->t.t_parent = root_team;
3224 
3225   /* initialize hot team */
3226   hot_team_max_nth = hot_team->t.t_max_nproc;
3227   for (f = 0; f < hot_team_max_nth; ++f) {
3228     hot_team->t.t_threads[f] = NULL;
3229   }
3230   hot_team->t.t_nproc = 1;
3231   // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3232   hot_team->t.t_sched.sched = r_sched.sched;
3233   hot_team->t.t_size_changed = 0;
3234 }
3235 
3236 #ifdef KMP_DEBUG
3237 
3238 typedef struct kmp_team_list_item {
3239   kmp_team_p const *entry;
3240   struct kmp_team_list_item *next;
3241 } kmp_team_list_item_t;
3242 typedef kmp_team_list_item_t *kmp_team_list_t;
3243 
3244 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3245     kmp_team_list_t list, // List of teams.
3246     kmp_team_p const *team // Team to add.
3247     ) {
3248 
3249   // List must terminate with item where both entry and next are NULL.
3250   // Team is added to the list only once.
3251   // List is sorted in ascending order by team id.
3252   // Team id is *not* a key.
3253 
3254   kmp_team_list_t l;
3255 
3256   KMP_DEBUG_ASSERT(list != NULL);
3257   if (team == NULL) {
3258     return;
3259   }
3260 
3261   __kmp_print_structure_team_accum(list, team->t.t_parent);
3262   __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3263 
3264   // Search list for the team.
3265   l = list;
3266   while (l->next != NULL && l->entry != team) {
3267     l = l->next;
3268   }
3269   if (l->next != NULL) {
3270     return; // Team has been added before, exit.
3271   }
3272 
3273   // Team is not found. Search list again for insertion point.
3274   l = list;
3275   while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3276     l = l->next;
3277   }
3278 
3279   // Insert team.
3280   {
3281     kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3282         sizeof(kmp_team_list_item_t));
3283     *item = *l;
3284     l->entry = team;
3285     l->next = item;
3286   }
3287 }
3288 
3289 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team
3290 
3291                                        ) {
3292   __kmp_printf("%s", title);
3293   if (team != NULL) {
3294     __kmp_printf("%2x %p\n", team->t.t_id, team);
3295   } else {
3296     __kmp_printf(" - (nil)\n");
3297   }
3298 }
3299 
3300 static void __kmp_print_structure_thread(char const *title,
3301                                          kmp_info_p const *thread) {
3302   __kmp_printf("%s", title);
3303   if (thread != NULL) {
3304     __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3305   } else {
3306     __kmp_printf(" - (nil)\n");
3307   }
3308 }
3309 
3310 void __kmp_print_structure(void) {
3311 
3312   kmp_team_list_t list;
3313 
3314   // Initialize list of teams.
3315   list =
3316       (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3317   list->entry = NULL;
3318   list->next = NULL;
3319 
3320   __kmp_printf("\n------------------------------\nGlobal Thread "
3321                "Table\n------------------------------\n");
3322   {
3323     int gtid;
3324     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3325       __kmp_printf("%2d", gtid);
3326       if (__kmp_threads != NULL) {
3327         __kmp_printf(" %p", __kmp_threads[gtid]);
3328       }
3329       if (__kmp_root != NULL) {
3330         __kmp_printf(" %p", __kmp_root[gtid]);
3331       }
3332       __kmp_printf("\n");
3333     }
3334   }
3335 
3336   // Print out __kmp_threads array.
3337   __kmp_printf("\n------------------------------\nThreads\n--------------------"
3338                "----------\n");
3339   if (__kmp_threads != NULL) {
3340     int gtid;
3341     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3342       kmp_info_t const *thread = __kmp_threads[gtid];
3343       if (thread != NULL) {
3344         __kmp_printf("GTID %2d %p:\n", gtid, thread);
3345         __kmp_printf("    Our Root:        %p\n", thread->th.th_root);
3346         __kmp_print_structure_team("    Our Team:     ", thread->th.th_team);
3347         __kmp_print_structure_team("    Serial Team:  ",
3348                                    thread->th.th_serial_team);
3349         __kmp_printf("    Threads:      %2d\n", thread->th.th_team_nproc);
3350         __kmp_print_structure_thread("    Master:       ",
3351                                      thread->th.th_team_master);
3352         __kmp_printf("    Serialized?:  %2d\n", thread->th.th_team_serialized);
3353         __kmp_printf("    Set NProc:    %2d\n", thread->th.th_set_nproc);
3354 #if OMP_40_ENABLED
3355         __kmp_printf("    Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3356 #endif
3357         __kmp_print_structure_thread("    Next in pool: ",
3358                                      thread->th.th_next_pool);
3359         __kmp_printf("\n");
3360         __kmp_print_structure_team_accum(list, thread->th.th_team);
3361         __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3362       }
3363     }
3364   } else {
3365     __kmp_printf("Threads array is not allocated.\n");
3366   }
3367 
3368   // Print out __kmp_root array.
3369   __kmp_printf("\n------------------------------\nUbers\n----------------------"
3370                "--------\n");
3371   if (__kmp_root != NULL) {
3372     int gtid;
3373     for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3374       kmp_root_t const *root = __kmp_root[gtid];
3375       if (root != NULL) {
3376         __kmp_printf("GTID %2d %p:\n", gtid, root);
3377         __kmp_print_structure_team("    Root Team:    ", root->r.r_root_team);
3378         __kmp_print_structure_team("    Hot Team:     ", root->r.r_hot_team);
3379         __kmp_print_structure_thread("    Uber Thread:  ",
3380                                      root->r.r_uber_thread);
3381         __kmp_printf("    Active?:      %2d\n", root->r.r_active);
3382         __kmp_printf("    Nested?:      %2d\n", root->r.r_nested);
3383         __kmp_printf("    In Parallel:  %2d\n", root->r.r_in_parallel);
3384         __kmp_printf("\n");
3385         __kmp_print_structure_team_accum(list, root->r.r_root_team);
3386         __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3387       }
3388     }
3389   } else {
3390     __kmp_printf("Ubers array is not allocated.\n");
3391   }
3392 
3393   __kmp_printf("\n------------------------------\nTeams\n----------------------"
3394                "--------\n");
3395   while (list->next != NULL) {
3396     kmp_team_p const *team = list->entry;
3397     int i;
3398     __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3399     __kmp_print_structure_team("    Parent Team:      ", team->t.t_parent);
3400     __kmp_printf("    Master TID:       %2d\n", team->t.t_master_tid);
3401     __kmp_printf("    Max threads:      %2d\n", team->t.t_max_nproc);
3402     __kmp_printf("    Levels of serial: %2d\n", team->t.t_serialized);
3403     __kmp_printf("    Number threads:   %2d\n", team->t.t_nproc);
3404     for (i = 0; i < team->t.t_nproc; ++i) {
3405       __kmp_printf("    Thread %2d:      ", i);
3406       __kmp_print_structure_thread("", team->t.t_threads[i]);
3407     }
3408     __kmp_print_structure_team("    Next in pool:     ", team->t.t_next_pool);
3409     __kmp_printf("\n");
3410     list = list->next;
3411   }
3412 
3413   // Print out __kmp_thread_pool and __kmp_team_pool.
3414   __kmp_printf("\n------------------------------\nPools\n----------------------"
3415                "--------\n");
3416   __kmp_print_structure_thread("Thread pool:          ",
3417                                CCAST(kmp_info_t *, __kmp_thread_pool));
3418   __kmp_print_structure_team("Team pool:            ",
3419                              CCAST(kmp_team_t *, __kmp_team_pool));
3420   __kmp_printf("\n");
3421 
3422   // Free team list.
3423   while (list != NULL) {
3424     kmp_team_list_item_t *item = list;
3425     list = list->next;
3426     KMP_INTERNAL_FREE(item);
3427   }
3428 }
3429 
3430 #endif
3431 
3432 //---------------------------------------------------------------------------
3433 //  Stuff for per-thread fast random number generator
3434 //  Table of primes
3435 static const unsigned __kmp_primes[] = {
3436     0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3437     0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3438     0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3439     0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3440     0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3441     0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3442     0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3443     0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3444     0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3445     0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3446     0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3447 
3448 //---------------------------------------------------------------------------
3449 //  __kmp_get_random: Get a random number using a linear congruential method.
3450 unsigned short __kmp_get_random(kmp_info_t *thread) {
3451   unsigned x = thread->th.th_x;
3452   unsigned short r = x >> 16;
3453 
3454   thread->th.th_x = x * thread->th.th_a + 1;
3455 
3456   KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3457                 thread->th.th_info.ds.ds_tid, r));
3458 
3459   return r;
3460 }
3461 //--------------------------------------------------------
3462 // __kmp_init_random: Initialize a random number generator
3463 void __kmp_init_random(kmp_info_t *thread) {
3464   unsigned seed = thread->th.th_info.ds.ds_tid;
3465 
3466   thread->th.th_a =
3467       __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3468   thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3469   KA_TRACE(30,
3470            ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3471 }
3472 
3473 #if KMP_OS_WINDOWS
/* Reclaim array entries for root threads that are already dead; returns the
 * number reclaimed. */
3476 static int __kmp_reclaim_dead_roots(void) {
3477   int i, r = 0;
3478 
3479   for (i = 0; i < __kmp_threads_capacity; ++i) {
3480     if (KMP_UBER_GTID(i) &&
3481         !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3482         !__kmp_root[i]
3483              ->r.r_active) { // AC: reclaim only roots died in non-active state
3484       r += __kmp_unregister_root_other_thread(i);
3485     }
3486   }
3487   return r;
3488 }
3489 #endif
3490 
3491 /* This function attempts to create free entries in __kmp_threads and
3492    __kmp_root, and returns the number of free entries generated.
3493 
3494    For Windows* OS static library, the first mechanism used is to reclaim array
3495    entries for root threads that are already dead.
3496 
   On all platforms, expansion is attempted on the arrays __kmp_threads and
3498    __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3499    capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3500    threadprivate cache array has been created. Synchronization with
3501    __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3502 
3503    After any dead root reclamation, if the clipping value allows array expansion
3504    to result in the generation of a total of nNeed free slots, the function does
3505    that expansion. If not, nothing is done beyond the possible initial root
3506    thread reclamation.
3507 
3508    If any argument is negative, the behavior is undefined. */
3509 static int __kmp_expand_threads(int nNeed) {
3510   int added = 0;
3511   int minimumRequiredCapacity;
3512   int newCapacity;
3513   kmp_info_t **newThreads;
3514   kmp_root_t **newRoot;
3515 
3516 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3517 // resizing __kmp_threads does not need additional protection if foreign
3518 // threads are present
3519 
3520 #if KMP_OS_WINDOWS && !defined KMP_DYNAMIC_LIB
3521   /* only for Windows static library */
3522   /* reclaim array entries for root threads that are already dead */
3523   added = __kmp_reclaim_dead_roots();
3524 
3525   if (nNeed) {
3526     nNeed -= added;
3527     if (nNeed < 0)
3528       nNeed = 0;
3529   }
3530 #endif
3531   if (nNeed <= 0)
3532     return added;
3533 
3534   // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3535   // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3536   // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3537   // > __kmp_max_nth in one of two ways:
3538   //
3539   // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
  //    may not be reused by another thread, so we may need to increase
3541   //    __kmp_threads_capacity to __kmp_max_nth + 1.
3542   //
3543   // 2) New foreign root(s) are encountered.  We always register new foreign
3544   //    roots. This may cause a smaller # of threads to be allocated at
3545   //    subsequent parallel regions, but the worker threads hang around (and
3546   //    eventually go to sleep) and need slots in the __kmp_threads[] array.
3547   //
3548   // Anyway, that is the reason for moving the check to see if
3549   // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3550   // instead of having it performed here. -BB
3551 
3552   KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3553 
3554   /* compute expansion headroom to check if we can expand */
3555   if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3556     /* possible expansion too small -- give up */
3557     return added;
3558   }
3559   minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3560 
3561   newCapacity = __kmp_threads_capacity;
3562   do {
3563     newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3564                                                           : __kmp_sys_max_nth;
3565   } while (newCapacity < minimumRequiredCapacity);
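  // For illustration (assuming __kmp_sys_max_nth is large enough): with
  // __kmp_threads_capacity == 32 and nNeed == 40, minimumRequiredCapacity is
  // 72 and the loop above doubles 32 -> 64 -> 128, the first value that covers
  // it, clipping at __kmp_sys_max_nth.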
3566   newThreads = (kmp_info_t **)__kmp_allocate(
3567       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3568   newRoot =
3569       (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3570   KMP_MEMCPY(newThreads, __kmp_threads,
3571              __kmp_threads_capacity * sizeof(kmp_info_t *));
3572   KMP_MEMCPY(newRoot, __kmp_root,
3573              __kmp_threads_capacity * sizeof(kmp_root_t *));
3574 
3575   kmp_info_t **temp_threads = __kmp_threads;
3576   *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3577   *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3578   __kmp_free(temp_threads);
3579   added += newCapacity - __kmp_threads_capacity;
3580   *(volatile int *)&__kmp_threads_capacity = newCapacity;
3581 
3582   if (newCapacity > __kmp_tp_capacity) {
3583     __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3584     if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3585       __kmp_threadprivate_resize_cache(newCapacity);
3586     } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3587       *(volatile int *)&__kmp_tp_capacity = newCapacity;
3588     }
3589     __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3590   }
3591 
3592   return added;
3593 }
3594 
/* Register the current thread as a root thread and obtain our gtid. We must
   have the __kmp_initz_lock held at this point. Argument TRUE only if we are
   the thread that calls from __kmp_do_serial_initialize() */
3598 int __kmp_register_root(int initial_thread) {
3599   kmp_info_t *root_thread;
3600   kmp_root_t *root;
3601   int gtid;
3602   int capacity;
3603   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3604   KA_TRACE(20, ("__kmp_register_root: entered\n"));
3605   KMP_MB();
3606 
3607   /* 2007-03-02:
3608      If initial thread did not invoke OpenMP RTL yet, and this thread is not an
3609      initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not
3610      work as expected -- it may return false (that means there is at least one
3611      empty slot in __kmp_threads array), but it is possible the only free slot
3612      is #0, which is reserved for initial thread and so cannot be used for this
3613      one. Following code workarounds this bug.
3614 
3615      However, right solution seems to be not reserving slot #0 for initial
3616      thread because:
3617      (1) there is no magic in slot #0,
3618      (2) we cannot detect initial thread reliably (the first thread which does
3619         serial initialization may be not a real initial thread).
3620   */
3621   capacity = __kmp_threads_capacity;
3622   if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3623     --capacity;
3624   }
3625 
3626   /* see if there are too many threads */
3627   if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3628     if (__kmp_tp_cached) {
3629       __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3630                   KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3631                   KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3632     } else {
3633       __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3634                   __kmp_msg_null);
3635     }
3636   }
3637 
3638   /* find an available thread slot */
  /* Don't reassign the zero slot since we need that to only be used by the
     initial thread */
3641   for (gtid = (initial_thread ? 0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL;
3642        gtid++)
3643     ;
3644   KA_TRACE(1,
3645            ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3646   KMP_ASSERT(gtid < __kmp_threads_capacity);
3647 
3648   /* update global accounting */
3649   __kmp_all_nth++;
3650   TCW_4(__kmp_nth, __kmp_nth + 1);
3651 
3652   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3653   // numbers of procs, and method #2 (keyed API call) for higher numbers.
3654   if (__kmp_adjust_gtid_mode) {
3655     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3656       if (TCR_4(__kmp_gtid_mode) != 2) {
3657         TCW_4(__kmp_gtid_mode, 2);
3658       }
3659     } else {
3660       if (TCR_4(__kmp_gtid_mode) != 1) {
3661         TCW_4(__kmp_gtid_mode, 1);
3662       }
3663     }
3664   }
3665 
3666 #ifdef KMP_ADJUST_BLOCKTIME
3667   /* Adjust blocktime to zero if necessary            */
3668   /* Middle initialization might not have occurred yet */
3669   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3670     if (__kmp_nth > __kmp_avail_proc) {
3671       __kmp_zero_bt = TRUE;
3672     }
3673   }
3674 #endif /* KMP_ADJUST_BLOCKTIME */
3675 
3676   /* setup this new hierarchy */
3677   if (!(root = __kmp_root[gtid])) {
3678     root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3679     KMP_DEBUG_ASSERT(!root->r.r_root_team);
3680   }
3681 
3682 #if KMP_STATS_ENABLED
3683   // Initialize stats as soon as possible (right after gtid assignment).
3684   __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3685   KMP_START_EXPLICIT_TIMER(OMP_worker_thread_life);
3686   KMP_SET_THREAD_STATE(SERIAL_REGION);
3687   KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3688 #endif
3689   __kmp_initialize_root(root);
3690 
3691   /* setup new root thread structure */
3692   if (root->r.r_uber_thread) {
3693     root_thread = root->r.r_uber_thread;
3694   } else {
3695     root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3696     if (__kmp_storage_map) {
3697       __kmp_print_thread_storage_map(root_thread, gtid);
3698     }
3699     root_thread->th.th_info.ds.ds_gtid = gtid;
3700 #if OMPT_SUPPORT
3701     root_thread->th.ompt_thread_info.thread_data.ptr = NULL;
3702 #endif
3703     root_thread->th.th_root = root;
3704     if (__kmp_env_consistency_check) {
3705       root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3706     }
3707 #if USE_FAST_MEMORY
3708     __kmp_initialize_fast_memory(root_thread);
3709 #endif /* USE_FAST_MEMORY */
3710 
3711 #if KMP_USE_BGET
3712     KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3713     __kmp_initialize_bget(root_thread);
3714 #endif
3715     __kmp_init_random(root_thread); // Initialize random number generator
3716   }
3717 
3718   /* setup the serial team held in reserve by the root thread */
3719   if (!root_thread->th.th_serial_team) {
3720     kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3721     KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3722     root_thread->th.th_serial_team =
3723         __kmp_allocate_team(root, 1, 1,
3724 #if OMPT_SUPPORT
3725                             ompt_data_none, // root parallel id
3726 #endif
3727 #if OMP_40_ENABLED
3728                             proc_bind_default,
3729 #endif
3730                             &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3731   }
3732   KMP_ASSERT(root_thread->th.th_serial_team);
3733   KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3734                 root_thread->th.th_serial_team));
3735 
3736   /* drop root_thread into place */
3737   TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3738 
3739   root->r.r_root_team->t.t_threads[0] = root_thread;
3740   root->r.r_hot_team->t.t_threads[0] = root_thread;
3741   root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
  // AC: the team is created in reserve, not for execution (it is unused for
  // now).
3743   root_thread->th.th_serial_team->t.t_serialized = 0;
3744   root->r.r_uber_thread = root_thread;
3745 
3746   /* initialize the thread, get it ready to go */
3747   __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3748   TCW_4(__kmp_init_gtid, TRUE);
3749 
3750   /* prepare the master thread for get_gtid() */
3751   __kmp_gtid_set_specific(gtid);
3752 
3753 #if USE_ITT_BUILD
3754   __kmp_itt_thread_name(gtid);
3755 #endif /* USE_ITT_BUILD */
3756 
3757 #ifdef KMP_TDATA_GTID
3758   __kmp_gtid = gtid;
3759 #endif
3760   __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3761   KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3762 
3763   KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3764                 "plain=%u\n",
3765                 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3766                 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3767                 KMP_INIT_BARRIER_STATE));
3768   { // Initialize barrier data.
3769     int b;
3770     for (b = 0; b < bs_last_barrier; ++b) {
3771       root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3772 #if USE_DEBUGGER
3773       root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3774 #endif
3775     }
3776   }
3777   KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3778                    KMP_INIT_BARRIER_STATE);
3779 
3780 #if KMP_AFFINITY_SUPPORTED
3781 #if OMP_40_ENABLED
3782   root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3783   root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3784   root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3785   root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3786 #endif
3787 
3788   if (TCR_4(__kmp_init_middle)) {
3789     __kmp_affinity_set_init_mask(gtid, TRUE);
3790   }
3791 #endif /* KMP_AFFINITY_SUPPORTED */
3792 
3793   __kmp_root_counter++;
3794 
3795 #if OMPT_SUPPORT
3796   if (!initial_thread && ompt_enabled.enabled) {
3797 
3798     ompt_thread_t *root_thread = ompt_get_thread();
3799 
3800     ompt_set_thread_state(root_thread, omp_state_overhead);
3801 
3802     if (ompt_enabled.ompt_callback_thread_begin) {
3803       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3804           ompt_thread_initial, __ompt_get_thread_data_internal());
3805     }
3806     ompt_data_t *task_data;
3807     __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
3808     if (ompt_enabled.ompt_callback_task_create) {
3809       ompt_callbacks.ompt_callback(ompt_callback_task_create)(
3810           NULL, NULL, task_data, ompt_task_initial, 0, NULL);
3811       // initial task has nothing to return to
3812     }
3813 
3814     ompt_set_thread_state(root_thread, omp_state_work_serial);
3815   }
3816 #endif
3817 
3818   KMP_MB();
3819   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3820 
3821   return gtid;
3822 }
3823 
3824 #if KMP_NESTED_HOT_TEAMS
3825 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3826                                 const int max_level) {
3827   int i, n, nth;
3828   kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3829   if (!hot_teams || !hot_teams[level].hot_team) {
3830     return 0;
3831   }
3832   KMP_DEBUG_ASSERT(level < max_level);
3833   kmp_team_t *team = hot_teams[level].hot_team;
3834   nth = hot_teams[level].hot_team_nth;
3835   n = nth - 1; // master is not freed
3836   if (level < max_level - 1) {
3837     for (i = 0; i < nth; ++i) {
3838       kmp_info_t *th = team->t.t_threads[i];
3839       n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3840       if (i > 0 && th->th.th_hot_teams) {
3841         __kmp_free(th->th.th_hot_teams);
3842         th->th.th_hot_teams = NULL;
3843       }
3844     }
3845   }
3846   __kmp_free_team(root, team, NULL);
3847   return n;
3848 }
3849 #endif
3850 
// Resets a root thread and clears its root and hot teams.
3852 // Returns the number of __kmp_threads entries directly and indirectly freed.
3853 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3854   kmp_team_t *root_team = root->r.r_root_team;
3855   kmp_team_t *hot_team = root->r.r_hot_team;
3856   int n = hot_team->t.t_nproc;
3857   int i;
3858 
3859   KMP_DEBUG_ASSERT(!root->r.r_active);
3860 
3861   root->r.r_root_team = NULL;
3862   root->r.r_hot_team = NULL;
3863   // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3864   // before call to __kmp_free_team().
3865   __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3866 #if KMP_NESTED_HOT_TEAMS
3867   if (__kmp_hot_teams_max_level >
3868       0) { // need to free nested hot teams and their threads if any
3869     for (i = 0; i < hot_team->t.t_nproc; ++i) {
3870       kmp_info_t *th = hot_team->t.t_threads[i];
3871       if (__kmp_hot_teams_max_level > 1) {
3872         n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3873       }
3874       if (th->th.th_hot_teams) {
3875         __kmp_free(th->th.th_hot_teams);
3876         th->th.th_hot_teams = NULL;
3877       }
3878     }
3879   }
3880 #endif
3881   __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3882 
3883   // Before we can reap the thread, we need to make certain that all other
3884   // threads in the teams that had this root as ancestor have stopped trying to
3885   // steal tasks.
3886   if (__kmp_tasking_mode != tskm_immediate_exec) {
3887     __kmp_wait_to_unref_task_teams();
3888   }
3889 
3890 #if KMP_OS_WINDOWS
3891   /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3892   KA_TRACE(
3893       10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3894            "\n",
3895            (LPVOID) & (root->r.r_uber_thread->th),
3896            root->r.r_uber_thread->th.th_info.ds.ds_thread));
3897   __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3898 #endif /* KMP_OS_WINDOWS */
3899 
3900 #if OMPT_SUPPORT
3901   if (ompt_enabled.ompt_callback_thread_end) {
3902     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3903         &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3904   }
3905 #endif
3906 
3907   TCW_4(__kmp_nth,
3908         __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3909   root->r.r_cg_nthreads--;
3910 
3911   __kmp_reap_thread(root->r.r_uber_thread, 1);
3912 
  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
  // it instead of freeing it.
3915   root->r.r_uber_thread = NULL;
3916   /* mark root as no longer in use */
3917   root->r.r_begin = FALSE;
3918 
3919   return n;
3920 }
3921 
3922 void __kmp_unregister_root_current_thread(int gtid) {
3923   KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
3924   /* this lock should be ok, since unregister_root_current_thread is never
3925      called during an abort, only during a normal close. furthermore, if you
3926      have the forkjoin lock, you should never try to get the initz lock */
3927   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3928   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
3929     KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
3930                   "exiting T#%d\n",
3931                   gtid));
3932     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3933     return;
3934   }
3935   kmp_root_t *root = __kmp_root[gtid];
3936 
3937   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3938   KMP_ASSERT(KMP_UBER_GTID(gtid));
3939   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3940   KMP_ASSERT(root->r.r_active == FALSE);
3941 
3942   KMP_MB();
3943 
3944 #if OMP_45_ENABLED
3945   kmp_info_t *thread = __kmp_threads[gtid];
3946   kmp_team_t *team = thread->th.th_team;
3947   kmp_task_team_t *task_team = thread->th.th_task_team;
3948 
3949   // we need to wait for the proxy tasks before finishing the thread
3950   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
3951 #if OMPT_SUPPORT
3952     // the runtime is shutting down so we won't report any events
3953     thread->th.ompt_thread_info.state = omp_state_undefined;
3954 #endif
3955     __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
3956   }
3957 #endif
3958 
3959   __kmp_reset_root(gtid, root);
3960 
3961   /* free up this thread slot */
3962   __kmp_gtid_set_specific(KMP_GTID_DNE);
3963 #ifdef KMP_TDATA_GTID
3964   __kmp_gtid = KMP_GTID_DNE;
3965 #endif
3966 
3967   KMP_MB();
3968   KC_TRACE(10,
3969            ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
3970 
3971   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3972 }
3973 
3974 #if KMP_OS_WINDOWS
3975 /* __kmp_forkjoin_lock must be already held
3976    Unregisters a root thread that is not the current thread.  Returns the number
3977    of __kmp_threads entries freed as a result. */
3978 static int __kmp_unregister_root_other_thread(int gtid) {
3979   kmp_root_t *root = __kmp_root[gtid];
3980   int r;
3981 
3982   KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
3983   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3984   KMP_ASSERT(KMP_UBER_GTID(gtid));
3985   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3986   KMP_ASSERT(root->r.r_active == FALSE);
3987 
3988   r = __kmp_reset_root(gtid, root);
3989   KC_TRACE(10,
3990            ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
3991   return r;
3992 }
3993 #endif
3994 
3995 #if KMP_DEBUG
3996 void __kmp_task_info() {
3997 
3998   kmp_int32 gtid = __kmp_entry_gtid();
3999   kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4000   kmp_info_t *this_thr = __kmp_threads[gtid];
4001   kmp_team_t *steam = this_thr->th.th_serial_team;
4002   kmp_team_t *team = this_thr->th.th_team;
4003 
4004   __kmp_printf("__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p curtask=%p "
4005                "ptask=%p\n",
4006                gtid, tid, this_thr, team, this_thr->th.th_current_task,
4007                team->t.t_implicit_task_taskdata[tid].td_parent);
4008 }
4009 #endif // KMP_DEBUG
4010 
4011 /* TODO optimize with one big memclr, take out what isn't needed, split
4012    responsibility to workers as much as possible, and delay initialization of
4013    features as much as possible  */
4014 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4015                                   int tid, int gtid) {
4016   /* this_thr->th.th_info.ds.ds_gtid is setup in
4017      kmp_allocate_thread/create_worker.
4018      this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4019   kmp_info_t *master = team->t.t_threads[0];
4020   KMP_DEBUG_ASSERT(this_thr != NULL);
4021   KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4022   KMP_DEBUG_ASSERT(team);
4023   KMP_DEBUG_ASSERT(team->t.t_threads);
4024   KMP_DEBUG_ASSERT(team->t.t_dispatch);
4025   KMP_DEBUG_ASSERT(master);
4026   KMP_DEBUG_ASSERT(master->th.th_root);
4027 
4028   KMP_MB();
4029 
4030   TCW_SYNC_PTR(this_thr->th.th_team, team);
4031 
4032   this_thr->th.th_info.ds.ds_tid = tid;
4033   this_thr->th.th_set_nproc = 0;
4034   if (__kmp_tasking_mode != tskm_immediate_exec)
4035     // When tasking is possible, threads are not safe to reap until they are
4036     // done tasking; this will be set when tasking code is exited in wait
4037     this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4038   else // no tasking --> always safe to reap
4039     this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4040 #if OMP_40_ENABLED
4041   this_thr->th.th_set_proc_bind = proc_bind_default;
4042 #if KMP_AFFINITY_SUPPORTED
4043   this_thr->th.th_new_place = this_thr->th.th_current_place;
4044 #endif
4045 #endif
4046   this_thr->th.th_root = master->th.th_root;
4047 
4048   /* setup the thread's cache of the team structure */
4049   this_thr->th.th_team_nproc = team->t.t_nproc;
4050   this_thr->th.th_team_master = master;
4051   this_thr->th.th_team_serialized = team->t.t_serialized;
4052   TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4053 
4054   KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4055 
4056   KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4057                 tid, gtid, this_thr, this_thr->th.th_current_task));
4058 
4059   __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4060                            team, tid, TRUE);
4061 
4062   KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4063                 tid, gtid, this_thr, this_thr->th.th_current_task));
4064   // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4065   // __kmp_initialize_team()?
4066 
4067   /* TODO no worksharing in speculative threads */
4068   this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4069 
4070   this_thr->th.th_local.this_construct = 0;
4071 
4072   if (!this_thr->th.th_pri_common) {
4073     this_thr->th.th_pri_common =
4074         (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4075     if (__kmp_storage_map) {
4076       __kmp_print_storage_map_gtid(
4077           gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4078           sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4079     }
4080     this_thr->th.th_pri_head = NULL;
4081   }
4082 
4083   /* Initialize dynamic dispatch */
4084   {
4085     volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4086     // Use team max_nproc since this will never change for the team.
4087     size_t disp_size =
4088         sizeof(dispatch_private_info_t) *
4089         (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4090     KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4091                   team->t.t_max_nproc));
4092     KMP_ASSERT(dispatch);
4093     KMP_DEBUG_ASSERT(team->t.t_dispatch);
4094     KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4095 
4096     dispatch->th_disp_index = 0;
4097 #if OMP_45_ENABLED
4098     dispatch->th_doacross_buf_idx = 0;
4099 #endif
4100     if (!dispatch->th_disp_buffer) {
4101       dispatch->th_disp_buffer =
4102           (dispatch_private_info_t *)__kmp_allocate(disp_size);
4103 
4104       if (__kmp_storage_map) {
4105         __kmp_print_storage_map_gtid(
4106             gtid, &dispatch->th_disp_buffer[0],
4107             &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4108                                           ? 1
4109                                           : __kmp_dispatch_num_buffers],
4110             disp_size, "th_%d.th_dispatch.th_disp_buffer "
4111                        "(team_%d.t_dispatch[%d].th_disp_buffer)",
4112             gtid, team->t.t_id, gtid);
4113       }
4114     } else {
4115       memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4116     }
4117 
4118     dispatch->th_dispatch_pr_current = 0;
4119     dispatch->th_dispatch_sh_current = 0;
4120 
4121     dispatch->th_deo_fcn = 0; /* ORDERED     */
4122     dispatch->th_dxo_fcn = 0; /* END ORDERED */
4123   }
4124 
4125   this_thr->th.th_next_pool = NULL;
4126 
4127   if (!this_thr->th.th_task_state_memo_stack) {
4128     size_t i;
4129     this_thr->th.th_task_state_memo_stack =
4130         (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4131     this_thr->th.th_task_state_top = 0;
4132     this_thr->th.th_task_state_stack_sz = 4;
4133     for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4134          ++i) // zero init the stack
4135       this_thr->th.th_task_state_memo_stack[i] = 0;
4136   }
4137 
4138   KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4139   KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4140 
4141   KMP_MB();
4142 }
4143 
/* Allocate a new thread for the requesting team. This is only called from
   within a forkjoin critical section. We will first try to get an available
   thread from the thread pool. If none is available, we will fork a new one,
   assuming we are able to create one. This should be assured, as the caller
   should have checked this first. */
4149 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4150                                   int new_tid) {
4151   kmp_team_t *serial_team;
4152   kmp_info_t *new_thr;
4153   int new_gtid;
4154 
4155   KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4156   KMP_DEBUG_ASSERT(root && team);
4157 #if !KMP_NESTED_HOT_TEAMS
4158   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4159 #endif
4160   KMP_MB();
4161 
4162   /* first, try to get one from the thread pool */
4163   if (__kmp_thread_pool) {
4164 
4165     new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4166     __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4167     if (new_thr == __kmp_thread_pool_insert_pt) {
4168       __kmp_thread_pool_insert_pt = NULL;
4169     }
4170     TCW_4(new_thr->th.th_in_pool, FALSE);
4171     // Don't touch th_active_in_pool or th_active.
4172     // The worker thread adjusts those flags as it sleeps/awakens.
4173     __kmp_thread_pool_nth--;
4174 
4175     KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4176                   __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4177     KMP_ASSERT(!new_thr->th.th_team);
4178     KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4179     KMP_DEBUG_ASSERT(__kmp_thread_pool_nth >= 0);
4180 
4181     /* setup the thread structure */
4182     __kmp_initialize_info(new_thr, team, new_tid,
4183                           new_thr->th.th_info.ds.ds_gtid);
4184     KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4185 
4186     TCW_4(__kmp_nth, __kmp_nth + 1);
4187     root->r.r_cg_nthreads++;
4188 
4189     new_thr->th.th_task_state = 0;
4190     new_thr->th.th_task_state_top = 0;
4191     new_thr->th.th_task_state_stack_sz = 4;
4192 
4193 #ifdef KMP_ADJUST_BLOCKTIME
4194     /* Adjust blocktime back to zero if necessary */
4195     /* Middle initialization might not have occurred yet */
4196     if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4197       if (__kmp_nth > __kmp_avail_proc) {
4198         __kmp_zero_bt = TRUE;
4199       }
4200     }
4201 #endif /* KMP_ADJUST_BLOCKTIME */
4202 
4203 #if KMP_DEBUG
    // If the thread entered the pool via __kmp_free_thread, wait_flag should
    // not be KMP_BARRIER_PARENT_FLAG.
4206     int b;
4207     kmp_balign_t *balign = new_thr->th.th_bar;
4208     for (b = 0; b < bs_last_barrier; ++b)
4209       KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4210 #endif
4211 
4212     KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4213                   __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4214 
4215     KMP_MB();
4216     return new_thr;
4217   }
4218 
  /* no, we'll fork a new one */
4220   KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4221   KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4222 
4223 #if KMP_USE_MONITOR
4224   // If this is the first worker thread the RTL is creating, then also
4225   // launch the monitor thread.  We try to do this as early as possible.
4226   if (!TCR_4(__kmp_init_monitor)) {
4227     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4228     if (!TCR_4(__kmp_init_monitor)) {
4229       KF_TRACE(10, ("before __kmp_create_monitor\n"));
4230       TCW_4(__kmp_init_monitor, 1);
4231       __kmp_create_monitor(&__kmp_monitor);
4232       KF_TRACE(10, ("after __kmp_create_monitor\n"));
4233 #if KMP_OS_WINDOWS
      // AC: wait until the monitor has started. This is a fix for CQ232808.
      // The reason is that if the library is loaded/unloaded in a loop with
      // small (parallel) work in between, then there is a high probability
      // that the monitor thread starts after the library has shut down. At
      // shutdown it is too late to cope with the problem, because when the
      // master is in DllMain (process detach) the monitor has no chance to
      // start (it is blocked), and the master has no means to inform the
      // monitor that the library has gone, because all the memory which the
      // monitor can access is going to be released/reset.
4243       while (TCR_4(__kmp_init_monitor) < 2) {
4244         KMP_YIELD(TRUE);
4245       }
4246       KF_TRACE(10, ("after monitor thread has started\n"));
4247 #endif
4248     }
4249     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4250   }
4251 #endif
4252 
4253   KMP_MB();
4254   for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) {
4255     KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4256   }
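  // The linear scan starts at gtid 1; slot 0 is expected to hold the initial
  // (root) thread. The KMP_ASSERT on __kmp_all_nth above guarantees that a
  // free slot exists within __kmp_threads_capacity.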
4257 
4258   /* allocate space for it. */
4259   new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4260 
4261   TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4262 
4263   if (__kmp_storage_map) {
4264     __kmp_print_thread_storage_map(new_thr, new_gtid);
4265   }
4266 
4267   // add the reserve serialized team, initialized from the team's master thread
4268   {
4269     kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4270     KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4271     new_thr->th.th_serial_team = serial_team =
4272         (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4273 #if OMPT_SUPPORT
4274                                           ompt_data_none, // root parallel id
4275 #endif
4276 #if OMP_40_ENABLED
4277                                           proc_bind_default,
4278 #endif
4279                                           &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
4280   }
4281   KMP_ASSERT(serial_team);
  serial_team->t.t_serialized = 0; // AC: the team is created in reserve, not
  // for execution (it is unused for now).
4284   serial_team->t.t_threads[0] = new_thr;
4285   KF_TRACE(10,
4286            ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4287             new_thr));
4288 
4289   /* setup the thread structures */
4290   __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4291 
4292 #if USE_FAST_MEMORY
4293   __kmp_initialize_fast_memory(new_thr);
4294 #endif /* USE_FAST_MEMORY */
4295 
4296 #if KMP_USE_BGET
4297   KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4298   __kmp_initialize_bget(new_thr);
4299 #endif
4300 
4301   __kmp_init_random(new_thr); // Initialize random number generator
4302 
4303   /* Initialize these only once when thread is grabbed for a team allocation */
4304   KA_TRACE(20,
4305            ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4306             __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4307 
4308   int b;
4309   kmp_balign_t *balign = new_thr->th.th_bar;
4310   for (b = 0; b < bs_last_barrier; ++b) {
4311     balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4312     balign[b].bb.team = NULL;
4313     balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4314     balign[b].bb.use_oncore_barrier = 0;
4315   }
4316 
4317   new_thr->th.th_spin_here = FALSE;
4318   new_thr->th.th_next_waiting = 0;
4319 
4320 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4321   new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4322   new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4323   new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4324   new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4325 #endif
4326 
4327   TCW_4(new_thr->th.th_in_pool, FALSE);
4328   new_thr->th.th_active_in_pool = FALSE;
4329   TCW_4(new_thr->th.th_active, TRUE);
4330 
4331   /* adjust the global counters */
4332   __kmp_all_nth++;
4333   __kmp_nth++;
4334 
4335   root->r.r_cg_nthreads++;
4336 
4337   // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4338   // numbers of procs, and method #2 (keyed API call) for higher numbers.
4339   if (__kmp_adjust_gtid_mode) {
4340     if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4341       if (TCR_4(__kmp_gtid_mode) != 2) {
4342         TCW_4(__kmp_gtid_mode, 2);
4343       }
4344     } else {
4345       if (TCR_4(__kmp_gtid_mode) != 1) {
4346         TCW_4(__kmp_gtid_mode, 1);
4347       }
4348     }
4349   }
4350 
4351 #ifdef KMP_ADJUST_BLOCKTIME
4352   /* Adjust blocktime back to zero if necessary       */
4353   /* Middle initialization might not have occurred yet */
4354   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4355     if (__kmp_nth > __kmp_avail_proc) {
4356       __kmp_zero_bt = TRUE;
4357     }
4358   }
4359 #endif /* KMP_ADJUST_BLOCKTIME */
4360 
4361   /* actually fork it and create the new worker thread */
4362   KF_TRACE(
4363       10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4364   __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4365   KF_TRACE(10,
4366            ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4367 
4368   KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4369                 new_gtid));
4370   KMP_MB();
4371   return new_thr;
4372 }
4373 
/* Reinitialize team for reuse.
   The hot team code calls this routine at every fork barrier, so EPCC barrier
   tests are extremely sensitive to changes in it, especially writes to the
   team struct, which cause a cache invalidation in all threads.
   IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4379 static void __kmp_reinitialize_team(kmp_team_t *team,
4380                                     kmp_internal_control_t *new_icvs,
4381                                     ident_t *loc) {
4382   KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4383                 team->t.t_threads[0], team));
4384   KMP_DEBUG_ASSERT(team && new_icvs);
4385   KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4386   KMP_CHECK_UPDATE(team->t.t_ident, loc);
4387 
4388   KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4389   // Copy ICVs to the master thread's implicit taskdata
4390   __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4391   copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4392 
4393   KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4394                 team->t.t_threads[0], team));
4395 }
4396 
/* Initialize the team data structure.
   This assumes that t_threads and t_max_nproc are already set.
   Also, we don't touch the arguments. */
4400 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4401                                   kmp_internal_control_t *new_icvs,
4402                                   ident_t *loc) {
4403   KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4404 
4405   /* verify */
4406   KMP_DEBUG_ASSERT(team);
4407   KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4408   KMP_DEBUG_ASSERT(team->t.t_threads);
4409   KMP_MB();
4410 
4411   team->t.t_master_tid = 0; /* not needed */
4412   /* team->t.t_master_bar;        not needed */
4413   team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4414   team->t.t_nproc = new_nproc;
4415 
4416   /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
4417   team->t.t_next_pool = NULL;
4418   /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4419    * up hot team */
4420 
4421   TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4422   team->t.t_invoke = NULL; /* not needed */
4423 
4424   // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
4425   team->t.t_sched.sched = new_icvs->sched.sched;
4426 
4427 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4428   team->t.t_fp_control_saved = FALSE; /* not needed */
4429   team->t.t_x87_fpu_control_word = 0; /* not needed */
4430   team->t.t_mxcsr = 0; /* not needed */
4431 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4432 
4433   team->t.t_construct = 0;
4434 
4435   team->t.t_ordered.dt.t_value = 0;
4436   team->t.t_master_active = FALSE;
4437 
4438   memset(&team->t.t_taskq, '\0', sizeof(kmp_taskq_t));
4439 
4440 #ifdef KMP_DEBUG
4441   team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4442 #endif
4443   team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4444 
4445   team->t.t_control_stack_top = NULL;
4446 
4447   __kmp_reinitialize_team(team, new_icvs, loc);
4448 
4449   KMP_MB();
4450   KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4451 }
4452 
4453 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
/* Sets the full mask for the calling thread and stores the old mask in
   *old_mask (if non-NULL); no changes to runtime structures. */
4455 static void
4456 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4457   if (KMP_AFFINITY_CAPABLE()) {
4458     int status;
4459     if (old_mask != NULL) {
4460       status = __kmp_get_system_affinity(old_mask, TRUE);
4461       int error = errno;
4462       if (status != 0) {
4463         __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4464                     __kmp_msg_null);
4465       }
4466     }
4467     __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4468   }
4469 }
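
/* A usage sketch (hedged) for the helper above, mirroring the hot-team growth
   path in __kmp_allocate_team() further down in this file:

     kmp_affin_mask_t *old_mask;
     KMP_CPU_ALLOC(old_mask);
     __kmp_set_thread_affinity_mask_full_tmp(old_mask); // widen before forking
     ... create the worker threads (they inherit the full mask) ...
     __kmp_set_system_affinity(old_mask, TRUE); // restore the master's mask
     KMP_CPU_FREE(old_mask);

   All of these calls appear in __kmp_allocate_team() below. */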
4470 #endif
4471 
4472 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4473 
// __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
// It calculates the worker + master threads' partition based upon the parent
// thread's partition, and binds each worker to a place in its partition.
// The master thread's partition should already include its current binding.
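// In summary, as implemented by the switch statement in the function body:
//  - proc_bind_master: every worker is bound to the master's place; the
//    partition [first,last] is inherited unchanged from the master.
//  - proc_bind_close:  workers are packed onto consecutive places starting
//    from the master's place, wrapping around within the partition.
//  - proc_bind_spread: the master's partition is divided into per-thread
//    sub-partitions so that threads are spread as far apart as possible.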
4478 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
  // Copy the master thread's place partition to the team struct
4480   kmp_info_t *master_th = team->t.t_threads[0];
4481   KMP_DEBUG_ASSERT(master_th != NULL);
4482   kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4483   int first_place = master_th->th.th_first_place;
4484   int last_place = master_th->th.th_last_place;
4485   int masters_place = master_th->th.th_current_place;
4486   team->t.t_first_place = first_place;
4487   team->t.t_last_place = last_place;
4488 
4489   KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4490                 "bound to place %d partition = [%d,%d]\n",
4491                 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4492                 team->t.t_id, masters_place, first_place, last_place));
4493 
4494   switch (proc_bind) {
4495 
4496   case proc_bind_default:
    // serial teams might have the proc_bind policy set to proc_bind_default.
    // It doesn't matter, as we don't rebind the master thread for any
    // proc_bind policy.
4499     KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4500     break;
4501 
4502   case proc_bind_master: {
4503     int f;
4504     int n_th = team->t.t_nproc;
4505     for (f = 1; f < n_th; f++) {
4506       kmp_info_t *th = team->t.t_threads[f];
4507       KMP_DEBUG_ASSERT(th != NULL);
4508       th->th.th_first_place = first_place;
4509       th->th.th_last_place = last_place;
4510       th->th.th_new_place = masters_place;
4511 
4512       KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
4513                      "partition = [%d,%d]\n",
4514                      __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4515                      f, masters_place, first_place, last_place));
4516     }
4517   } break;
4518 
4519   case proc_bind_close: {
4520     int f;
4521     int n_th = team->t.t_nproc;
4522     int n_places;
4523     if (first_place <= last_place) {
4524       n_places = last_place - first_place + 1;
4525     } else {
4526       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4527     }
4528     if (n_th <= n_places) {
4529       int place = masters_place;
4530       for (f = 1; f < n_th; f++) {
4531         kmp_info_t *th = team->t.t_threads[f];
4532         KMP_DEBUG_ASSERT(th != NULL);
4533 
4534         if (place == last_place) {
4535           place = first_place;
4536         } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4537           place = 0;
4538         } else {
4539           place++;
4540         }
4541         th->th.th_first_place = first_place;
4542         th->th.th_last_place = last_place;
4543         th->th.th_new_place = place;
4544 
4545         KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4546                        "partition = [%d,%d]\n",
4547                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4548                        team->t.t_id, f, place, first_place, last_place));
4549       }
4550     } else {
4551       int S, rem, gap, s_count;
4552       S = n_th / n_places;
4553       s_count = 0;
4554       rem = n_th - (S * n_places);
4555       gap = rem > 0 ? n_places / rem : n_places;
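      // Distribution sketch (illustrative numbers): each place gets S threads,
      // and an extra thread goes to every `gap`-th place until `rem` extras
      // have been handed out.  E.g. n_th = 10, n_places = 4 gives S = 2,
      // rem = 2, gap = 2, so starting from the master's place the per-place
      // thread counts are 3, 2, 3, 2.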
4556       int place = masters_place;
4557       int gap_ct = gap;
4558       for (f = 0; f < n_th; f++) {
4559         kmp_info_t *th = team->t.t_threads[f];
4560         KMP_DEBUG_ASSERT(th != NULL);
4561 
4562         th->th.th_first_place = first_place;
4563         th->th.th_last_place = last_place;
4564         th->th.th_new_place = place;
4565         s_count++;
4566 
4567         if ((s_count == S) && rem && (gap_ct == gap)) {
4568           // do nothing, add an extra thread to place on next iteration
4569         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4570           // we added an extra thread to this place; move to next place
4571           if (place == last_place) {
4572             place = first_place;
4573           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4574             place = 0;
4575           } else {
4576             place++;
4577           }
4578           s_count = 0;
4579           gap_ct = 1;
4580           rem--;
4581         } else if (s_count == S) { // place full; don't add extra
4582           if (place == last_place) {
4583             place = first_place;
4584           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4585             place = 0;
4586           } else {
4587             place++;
4588           }
4589           gap_ct++;
4590           s_count = 0;
4591         }
4592 
4593         KA_TRACE(100,
4594                  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4595                   "partition = [%d,%d]\n",
4596                   __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4597                   th->th.th_new_place, first_place, last_place));
4598       }
4599       KMP_DEBUG_ASSERT(place == masters_place);
4600     }
4601   } break;
4602 
4603   case proc_bind_spread: {
4604     int f;
4605     int n_th = team->t.t_nproc;
4606     int n_places;
4607     int thidx;
4608     if (first_place <= last_place) {
4609       n_places = last_place - first_place + 1;
4610     } else {
4611       n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4612     }
4613     if (n_th <= n_places) {
4614       int place = -1;
4615 
4616       if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4617         int S = n_places / n_th;
4618         int s_count, rem, gap, gap_ct;
4619 
4620         place = masters_place;
4621         rem = n_places - n_th * S;
4622         gap = rem ? n_th / rem : 1;
4623         gap_ct = gap;
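        // Sub-partition sketch (illustrative numbers): each thread receives a
        // contiguous block of S = n_places / n_th places as its [first,last]
        // range, and every `gap`-th thread gets one extra place until `rem`
        // extras are used up.  E.g. n_places = 10, n_th = 4 gives S = 2,
        // rem = 2, gap = 2, i.e. blocks of 3, 2, 3, 2 places.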
4624         thidx = n_th;
4625         if (update_master_only == 1)
4626           thidx = 1;
4627         for (f = 0; f < thidx; f++) {
4628           kmp_info_t *th = team->t.t_threads[f];
4629           KMP_DEBUG_ASSERT(th != NULL);
4630 
4631           th->th.th_first_place = place;
4632           th->th.th_new_place = place;
4633           s_count = 1;
4634           while (s_count < S) {
4635             if (place == last_place) {
4636               place = first_place;
4637             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4638               place = 0;
4639             } else {
4640               place++;
4641             }
4642             s_count++;
4643           }
4644           if (rem && (gap_ct == gap)) {
4645             if (place == last_place) {
4646               place = first_place;
4647             } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4648               place = 0;
4649             } else {
4650               place++;
4651             }
4652             rem--;
4653             gap_ct = 0;
4654           }
4655           th->th.th_last_place = place;
4656           gap_ct++;
4657 
4658           if (place == last_place) {
4659             place = first_place;
4660           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4661             place = 0;
4662           } else {
4663             place++;
4664           }
4665 
4666           KA_TRACE(100,
4667                    ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4668                     "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4669                     __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4670                     f, th->th.th_new_place, th->th.th_first_place,
4671                     th->th.th_last_place, __kmp_affinity_num_masks));
4672         }
4673       } else {
        /* With a uniform space of available computation places, we can create
           T partitions of round(P/T) size and put each thread into the first
           place of its partition. */
4677         double current = static_cast<double>(masters_place);
4678         double spacing =
4679             (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
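        // E.g. (illustrative) n_places = 8, n_th = 4, masters_place = 0:
        // spacing = 9.0 / 4 = 2.25, producing the sub-partitions [0,1], [2,3],
        // [4,5], [6,7], with each thread bound to the first place of its
        // sub-partition.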
4680         int first, last;
4681         kmp_info_t *th;
4682 
4683         thidx = n_th + 1;
4684         if (update_master_only == 1)
4685           thidx = 1;
4686         for (f = 0; f < thidx; f++) {
4687           first = static_cast<int>(current);
4688           last = static_cast<int>(current + spacing) - 1;
4689           KMP_DEBUG_ASSERT(last >= first);
4690           if (first >= n_places) {
4691             if (masters_place) {
4692               first -= n_places;
4693               last -= n_places;
4694               if (first == (masters_place + 1)) {
4695                 KMP_DEBUG_ASSERT(f == n_th);
4696                 first--;
4697               }
4698               if (last == masters_place) {
4699                 KMP_DEBUG_ASSERT(f == (n_th - 1));
4700                 last--;
4701               }
4702             } else {
4703               KMP_DEBUG_ASSERT(f == n_th);
4704               first = 0;
4705               last = 0;
4706             }
4707           }
4708           if (last >= n_places) {
4709             last = (n_places - 1);
4710           }
4711           place = first;
4712           current += spacing;
4713           if (f < n_th) {
4714             KMP_DEBUG_ASSERT(0 <= first);
4715             KMP_DEBUG_ASSERT(n_places > first);
4716             KMP_DEBUG_ASSERT(0 <= last);
4717             KMP_DEBUG_ASSERT(n_places > last);
4718             KMP_DEBUG_ASSERT(last_place >= first_place);
4719             th = team->t.t_threads[f];
4720             KMP_DEBUG_ASSERT(th);
4721             th->th.th_first_place = first;
4722             th->th.th_new_place = place;
4723             th->th.th_last_place = last;
4724 
4725             KA_TRACE(100,
4726                      ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4727                       "partition = [%d,%d], spacing = %.4f\n",
4728                       __kmp_gtid_from_thread(team->t.t_threads[f]),
4729                       team->t.t_id, f, th->th.th_new_place,
4730                       th->th.th_first_place, th->th.th_last_place, spacing));
4731           }
4732         }
4733       }
4734       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4735     } else {
4736       int S, rem, gap, s_count;
4737       S = n_th / n_places;
4738       s_count = 0;
4739       rem = n_th - (S * n_places);
4740       gap = rem > 0 ? n_places / rem : n_places;
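      // Same S/rem/gap distribution as the over-subscribed proc_bind_close
      // case above, except that each thread's partition collapses to the
      // single place it is bound to (first == last == new place).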
4741       int place = masters_place;
4742       int gap_ct = gap;
4743       thidx = n_th;
4744       if (update_master_only == 1)
4745         thidx = 1;
4746       for (f = 0; f < thidx; f++) {
4747         kmp_info_t *th = team->t.t_threads[f];
4748         KMP_DEBUG_ASSERT(th != NULL);
4749 
4750         th->th.th_first_place = place;
4751         th->th.th_last_place = place;
4752         th->th.th_new_place = place;
4753         s_count++;
4754 
4755         if ((s_count == S) && rem && (gap_ct == gap)) {
4756           // do nothing, add an extra thread to place on next iteration
4757         } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4758           // we added an extra thread to this place; move on to next place
4759           if (place == last_place) {
4760             place = first_place;
4761           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4762             place = 0;
4763           } else {
4764             place++;
4765           }
4766           s_count = 0;
4767           gap_ct = 1;
4768           rem--;
4769         } else if (s_count == S) { // place is full; don't add extra thread
4770           if (place == last_place) {
4771             place = first_place;
4772           } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4773             place = 0;
4774           } else {
4775             place++;
4776           }
4777           gap_ct++;
4778           s_count = 0;
4779         }
4780 
4781         KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4782                        "partition = [%d,%d]\n",
4783                        __kmp_gtid_from_thread(team->t.t_threads[f]),
4784                        team->t.t_id, f, th->th.th_new_place,
4785                        th->th.th_first_place, th->th.th_last_place));
4786       }
4787       KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4788     }
4789   } break;
4790 
4791   default:
4792     break;
4793   }
4794 
4795   KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4796 }
4797 
4798 #endif /* OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED */
4799 
4800 /* allocate a new team data structure to use.  take one off of the free pool if
4801    available */
4802 kmp_team_t *
4803 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4804 #if OMPT_SUPPORT
4805                     ompt_data_t ompt_parallel_data,
4806 #endif
4807 #if OMP_40_ENABLED
4808                     kmp_proc_bind_t new_proc_bind,
4809 #endif
4810                     kmp_internal_control_t *new_icvs,
4811                     int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4812   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4813   int f;
4814   kmp_team_t *team;
4815   int use_hot_team = !root->r.r_active;
4816   int level = 0;
4817 
4818   KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4819   KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4820   KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4821   KMP_MB();
4822 
4823 #if KMP_NESTED_HOT_TEAMS
4824   kmp_hot_team_ptr_t *hot_teams;
4825   if (master) {
4826     team = master->th.th_team;
4827     level = team->t.t_active_level;
4828     if (master->th.th_teams_microtask) { // in teams construct?
4829       if (master->th.th_teams_size.nteams > 1 &&
4830           ( // #teams > 1
4831               team->t.t_pkfn ==
4832                   (microtask_t)__kmp_teams_master || // inner fork of the teams
4833               master->th.th_teams_level <
4834                   team->t.t_level)) { // or nested parallel inside the teams
        ++level; // don't increment if #teams==1 or for the outer fork of the
        // teams; increment otherwise
4837       }
4838     }
4839     hot_teams = master->th.th_hot_teams;
4840     if (level < __kmp_hot_teams_max_level && hot_teams &&
4841         hot_teams[level]
4842             .hot_team) { // hot team has already been allocated for given level
4843       use_hot_team = 1;
4844     } else {
4845       use_hot_team = 0;
4846     }
4847   }
4848 #endif
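  // At this point use_hot_team is non-zero either because this is the
  // outermost parallel region of a not-yet-active root, or (with
  // KMP_NESTED_HOT_TEAMS) because a hot team has already been allocated for
  // the computed nesting level.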
4849   // Optimization to use a "hot" team
4850   if (use_hot_team && new_nproc > 1) {
4851     KMP_DEBUG_ASSERT(new_nproc == max_nproc);
4852 #if KMP_NESTED_HOT_TEAMS
4853     team = hot_teams[level].hot_team;
4854 #else
4855     team = root->r.r_hot_team;
4856 #endif
4857 #if KMP_DEBUG
4858     if (__kmp_tasking_mode != tskm_immediate_exec) {
4859       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
4860                     "task_team[1] = %p before reinit\n",
4861                     team->t.t_task_team[0], team->t.t_task_team[1]));
4862     }
4863 #endif
4864 
4865     // Has the number of threads changed?
4866     /* Let's assume the most common case is that the number of threads is
4867        unchanged, and put that case first. */
4868     if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
4869       KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
4870       // This case can mean that omp_set_num_threads() was called and the hot
4871       // team size was already reduced, so we check the special flag
4872       if (team->t.t_size_changed == -1) {
4873         team->t.t_size_changed = 1;
4874       } else {
4875         KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
4876       }
4877 
4878       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4879       kmp_r_sched_t new_sched = new_icvs->sched;
4880       // set master's schedule as new run-time schedule
4881       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
4882 
4883       __kmp_reinitialize_team(team, new_icvs,
4884                               root->r.r_uber_thread->th.th_ident);
4885 
4886       KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
4887                     team->t.t_threads[0], team));
4888       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
4889 
4890 #if OMP_40_ENABLED
4891 #if KMP_AFFINITY_SUPPORTED
4892       if ((team->t.t_size_changed == 0) &&
4893           (team->t.t_proc_bind == new_proc_bind)) {
4894         if (new_proc_bind == proc_bind_spread) {
4895           __kmp_partition_places(
4896               team, 1); // add flag to update only master for spread
4897         }
4898         KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
4899                        "proc_bind = %d, partition = [%d,%d]\n",
4900                        team->t.t_id, new_proc_bind, team->t.t_first_place,
4901                        team->t.t_last_place));
4902       } else {
4903         KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4904         __kmp_partition_places(team);
4905       }
4906 #else
4907       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4908 #endif /* KMP_AFFINITY_SUPPORTED */
4909 #endif /* OMP_40_ENABLED */
4910     } else if (team->t.t_nproc > new_nproc) {
4911       KA_TRACE(20,
4912                ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
4913                 new_nproc));
4914 
4915       team->t.t_size_changed = 1;
4916 #if KMP_NESTED_HOT_TEAMS
4917       if (__kmp_hot_teams_mode == 0) {
        // AC: the saved number of threads should correspond to the team's
        // value in this mode; it can be bigger in mode 1, when the hot team
        // has threads in reserve.
4920         KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
4921         hot_teams[level].hot_team_nth = new_nproc;
4922 #endif // KMP_NESTED_HOT_TEAMS
4923         /* release the extra threads we don't need any more */
4924         for (f = new_nproc; f < team->t.t_nproc; f++) {
4925           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
4926           if (__kmp_tasking_mode != tskm_immediate_exec) {
4927             // When decreasing team size, threads no longer in the team should
4928             // unref task team.
4929             team->t.t_threads[f]->th.th_task_team = NULL;
4930           }
4931           __kmp_free_thread(team->t.t_threads[f]);
4932           team->t.t_threads[f] = NULL;
4933         }
4934 #if KMP_NESTED_HOT_TEAMS
4935       } // (__kmp_hot_teams_mode == 0)
4936       else {
4937         // When keeping extra threads in team, switch threads to wait on own
4938         // b_go flag
4939         for (f = new_nproc; f < team->t.t_nproc; ++f) {
4940           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
4941           kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
4942           for (int b = 0; b < bs_last_barrier; ++b) {
4943             if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
4944               balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
4945             }
4946             KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
4947           }
4948         }
4949       }
4950 #endif // KMP_NESTED_HOT_TEAMS
4951       team->t.t_nproc = new_nproc;
4952       // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4953       KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
4954       __kmp_reinitialize_team(team, new_icvs,
4955                               root->r.r_uber_thread->th.th_ident);
4956 
4957       /* update the remaining threads */
4958       for (f = 0; f < new_nproc; ++f) {
4959         team->t.t_threads[f]->th.th_team_nproc = new_nproc;
4960       }
4961       // restore the current task state of the master thread: should be the
4962       // implicit task
4963       KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
4964                     team->t.t_threads[0], team));
4965 
4966       __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
4967 
4968 #ifdef KMP_DEBUG
4969       for (f = 0; f < team->t.t_nproc; f++) {
4970         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
4971                          team->t.t_threads[f]->th.th_team_nproc ==
4972                              team->t.t_nproc);
4973       }
4974 #endif
4975 
4976 #if OMP_40_ENABLED
4977       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4978 #if KMP_AFFINITY_SUPPORTED
4979       __kmp_partition_places(team);
4980 #endif
4981 #endif
4982     } else { // team->t.t_nproc < new_nproc
4983 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4984       kmp_affin_mask_t *old_mask;
4985       if (KMP_AFFINITY_CAPABLE()) {
4986         KMP_CPU_ALLOC(old_mask);
4987       }
4988 #endif
4989 
4990       KA_TRACE(20,
4991                ("__kmp_allocate_team: increasing hot team thread count to %d\n",
4992                 new_nproc));
4993 
4994       team->t.t_size_changed = 1;
4995 
4996 #if KMP_NESTED_HOT_TEAMS
4997       int avail_threads = hot_teams[level].hot_team_nth;
4998       if (new_nproc < avail_threads)
4999         avail_threads = new_nproc;
5000       kmp_info_t **other_threads = team->t.t_threads;
5001       for (f = team->t.t_nproc; f < avail_threads; ++f) {
5002         // Adjust barrier data of reserved threads (if any) of the team
5003         // Other data will be set in __kmp_initialize_info() below.
5004         int b;
5005         kmp_balign_t *balign = other_threads[f]->th.th_bar;
5006         for (b = 0; b < bs_last_barrier; ++b) {
5007           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5008           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5009 #if USE_DEBUGGER
5010           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5011 #endif
5012         }
5013       }
5014       if (hot_teams[level].hot_team_nth >= new_nproc) {
        // we have all needed threads in reserve, no need to allocate any.
        // This is only possible in mode 1; mode 0 cannot have reserved threads.
5017         KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5018         team->t.t_nproc = new_nproc; // just get reserved threads involved
5019       } else {
5020         // we may have some threads in reserve, but not enough
5021         team->t.t_nproc =
5022             hot_teams[level]
5023                 .hot_team_nth; // get reserved threads involved if any
5024         hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5025 #endif // KMP_NESTED_HOT_TEAMS
5026         if (team->t.t_max_nproc < new_nproc) {
5027           /* reallocate larger arrays */
5028           __kmp_reallocate_team_arrays(team, new_nproc);
5029           __kmp_reinitialize_team(team, new_icvs, NULL);
5030         }
5031 
5032 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
        /* Temporarily set the full mask for the master thread before creating
           the workers. The reason is that workers inherit the affinity from the
           master, so if a lot of workers are created on a single core quickly,
           they don't get a chance to set their own affinity for a long time. */
5037         __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5038 #endif
5039 
5040         /* allocate new threads for the hot team */
5041         for (f = team->t.t_nproc; f < new_nproc; f++) {
5042           kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5043           KMP_DEBUG_ASSERT(new_worker);
5044           team->t.t_threads[f] = new_worker;
5045 
5046           KA_TRACE(20,
                   ("__kmp_allocate_team: team %d init T#%d(%d:%d) arrived: "
5048                     "join=%llu, plain=%llu\n",
5049                     team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5050                     team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5051                     team->t.t_bar[bs_plain_barrier].b_arrived));
5052 
5053           { // Initialize barrier data for new threads.
5054             int b;
5055             kmp_balign_t *balign = new_worker->th.th_bar;
5056             for (b = 0; b < bs_last_barrier; ++b) {
5057               balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5058               KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5059                                KMP_BARRIER_PARENT_FLAG);
5060 #if USE_DEBUGGER
5061               balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5062 #endif
5063             }
5064           }
5065         }
5066 
5067 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5068         if (KMP_AFFINITY_CAPABLE()) {
5069           /* Restore initial master thread's affinity mask */
5070           __kmp_set_system_affinity(old_mask, TRUE);
5071           KMP_CPU_FREE(old_mask);
5072         }
5073 #endif
5074 #if KMP_NESTED_HOT_TEAMS
5075       } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5076 #endif // KMP_NESTED_HOT_TEAMS
      /* make sure everyone is synchronized */
5078       int old_nproc = team->t.t_nproc; // save old value and use to update only
5079       // new threads below
5080       __kmp_initialize_team(team, new_nproc, new_icvs,
5081                             root->r.r_uber_thread->th.th_ident);
5082 
5083       /* reinitialize the threads */
5084       KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5085       for (f = 0; f < team->t.t_nproc; ++f)
5086         __kmp_initialize_info(team->t.t_threads[f], team, f,
5087                               __kmp_gtid_from_tid(f, team));
5088       if (level) { // set th_task_state for new threads in nested hot team
5089         // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5090         // only need to set the th_task_state for the new threads. th_task_state
5091         // for master thread will not be accurate until after this in
5092         // __kmp_fork_call(), so we look to the master's memo_stack to get the
5093         // correct value.
5094         for (f = old_nproc; f < team->t.t_nproc; ++f)
5095           team->t.t_threads[f]->th.th_task_state =
5096               team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5097       } else { // set th_task_state for new threads in non-nested hot team
5098         int old_state =
5099             team->t.t_threads[0]->th.th_task_state; // copy master's state
5100         for (f = old_nproc; f < team->t.t_nproc; ++f)
5101           team->t.t_threads[f]->th.th_task_state = old_state;
5102       }
5103 
5104 #ifdef KMP_DEBUG
5105       for (f = 0; f < team->t.t_nproc; ++f) {
5106         KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5107                          team->t.t_threads[f]->th.th_team_nproc ==
5108                              team->t.t_nproc);
5109       }
5110 #endif
5111 
5112 #if OMP_40_ENABLED
5113       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5114 #if KMP_AFFINITY_SUPPORTED
5115       __kmp_partition_places(team);
5116 #endif
5117 #endif
5118     } // Check changes in number of threads
5119 
5120 #if OMP_40_ENABLED
5121     kmp_info_t *master = team->t.t_threads[0];
5122     if (master->th.th_teams_microtask) {
5123       for (f = 1; f < new_nproc; ++f) {
5124         // propagate teams construct specific info to workers
5125         kmp_info_t *thr = team->t.t_threads[f];
5126         thr->th.th_teams_microtask = master->th.th_teams_microtask;
5127         thr->th.th_teams_level = master->th.th_teams_level;
5128         thr->th.th_teams_size = master->th.th_teams_size;
5129       }
5130     }
5131 #endif /* OMP_40_ENABLED */
5132 #if KMP_NESTED_HOT_TEAMS
5133     if (level) {
5134       // Sync barrier state for nested hot teams, not needed for outermost hot
5135       // team.
5136       for (f = 1; f < new_nproc; ++f) {
5137         kmp_info_t *thr = team->t.t_threads[f];
5138         int b;
5139         kmp_balign_t *balign = thr->th.th_bar;
5140         for (b = 0; b < bs_last_barrier; ++b) {
5141           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5142           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5143 #if USE_DEBUGGER
5144           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5145 #endif
5146         }
5147       }
5148     }
5149 #endif // KMP_NESTED_HOT_TEAMS
5150 
5151     /* reallocate space for arguments if necessary */
5152     __kmp_alloc_argv_entries(argc, team, TRUE);
5153     KMP_CHECK_UPDATE(team->t.t_argc, argc);
5154     // The hot team re-uses the previous task team,
5155     // if untouched during the previous release->gather phase.
5156 
5157     KF_TRACE(10, (" hot_team = %p\n", team));
5158 
5159 #if KMP_DEBUG
5160     if (__kmp_tasking_mode != tskm_immediate_exec) {
5161       KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5162                     "task_team[1] = %p after reinit\n",
5163                     team->t.t_task_team[0], team->t.t_task_team[1]));
5164     }
5165 #endif
5166 
5167 #if OMPT_SUPPORT
5168     __ompt_team_assign_id(team, ompt_parallel_data);
5169 #endif
5170 
5171     KMP_MB();
5172 
5173     return team;
5174   }
5175 
5176   /* next, let's try to take one from the team pool */
5177   KMP_MB();
5178   for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5179     /* TODO: consider resizing undersized teams instead of reaping them, now
5180        that we have a resizing mechanism */
5181     if (team->t.t_max_nproc >= max_nproc) {
5182       /* take this team from the team pool */
5183       __kmp_team_pool = team->t.t_next_pool;
5184 
5185       /* setup the team for fresh use */
5186       __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5187 
5188       KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5189                     "task_team[1] %p to NULL\n",
5190                     &team->t.t_task_team[0], &team->t.t_task_team[1]));
5191       team->t.t_task_team[0] = NULL;
5192       team->t.t_task_team[1] = NULL;
5193 
5194       /* reallocate space for arguments if necessary */
5195       __kmp_alloc_argv_entries(argc, team, TRUE);
5196       KMP_CHECK_UPDATE(team->t.t_argc, argc);
5197 
5198       KA_TRACE(
5199           20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5200                team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5201       { // Initialize barrier data.
5202         int b;
5203         for (b = 0; b < bs_last_barrier; ++b) {
5204           team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5205 #if USE_DEBUGGER
5206           team->t.t_bar[b].b_master_arrived = 0;
5207           team->t.t_bar[b].b_team_arrived = 0;
5208 #endif
5209         }
5210       }
5211 
5212 #if OMP_40_ENABLED
5213       team->t.t_proc_bind = new_proc_bind;
5214 #endif
5215 
5216       KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5217                     team->t.t_id));
5218 
5219 #if OMPT_SUPPORT
5220       __ompt_team_assign_id(team, ompt_parallel_data);
5221 #endif
5222 
5223       KMP_MB();
5224 
5225       return team;
5226     }
5227 
5228     /* reap team if it is too small, then loop back and check the next one */
    // not sure if this is wise, but it will be redone during the hot-teams
    // rewrite.
5231     /* TODO: Use technique to find the right size hot-team, don't reap them */
5232     team = __kmp_reap_team(team);
5233     __kmp_team_pool = team;
5234   }
5235 
5236   /* nothing available in the pool, no matter, make a new team! */
5237   KMP_MB();
5238   team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5239 
5240   /* and set it up */
5241   team->t.t_max_nproc = max_nproc;
5242   /* NOTE well, for some reason allocating one big buffer and dividing it up
5243      seems to really hurt performance a lot on the P4, so, let's not use this */
5244   __kmp_allocate_team_arrays(team, max_nproc);
5245 
5246   KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5247   __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5248 
5249   KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5250                 "%p to NULL\n",
5251                 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5252   team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5253   // memory, no need to duplicate
5254   team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5255   // memory, no need to duplicate
5256 
5257   if (__kmp_storage_map) {
5258     __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5259   }
5260 
5261   /* allocate space for arguments */
5262   __kmp_alloc_argv_entries(argc, team, FALSE);
5263   team->t.t_argc = argc;
5264 
5265   KA_TRACE(20,
5266            ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5267             team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5268   { // Initialize barrier data.
5269     int b;
5270     for (b = 0; b < bs_last_barrier; ++b) {
5271       team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5272 #if USE_DEBUGGER
5273       team->t.t_bar[b].b_master_arrived = 0;
5274       team->t.t_bar[b].b_team_arrived = 0;
5275 #endif
5276     }
5277   }
5278 
5279 #if OMP_40_ENABLED
5280   team->t.t_proc_bind = new_proc_bind;
5281 #endif
5282 
5283 #if OMPT_SUPPORT
5284   __ompt_team_assign_id(team, ompt_parallel_data);
5285   team->t.ompt_serialized_team_info = NULL;
5286 #endif
5287 
5288   KMP_MB();
5289 
5290   KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5291                 team->t.t_id));
5292 
5293   return team;
5294 }
5295 
5296 /* TODO implement hot-teams at all levels */
5297 /* TODO implement lazy thread release on demand (disband request) */
5298 
5299 /* free the team.  return it to the team pool.  release all the threads
5300  * associated with it */
5301 void __kmp_free_team(kmp_root_t *root,
5302                      kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5303   int f;
5304   KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5305                 team->t.t_id));
5306 
5307   /* verify state */
5308   KMP_DEBUG_ASSERT(root);
5309   KMP_DEBUG_ASSERT(team);
5310   KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5311   KMP_DEBUG_ASSERT(team->t.t_threads);
5312 
5313   int use_hot_team = team == root->r.r_hot_team;
5314 #if KMP_NESTED_HOT_TEAMS
5315   int level;
5316   kmp_hot_team_ptr_t *hot_teams;
5317   if (master) {
5318     level = team->t.t_active_level - 1;
5319     if (master->th.th_teams_microtask) { // in teams construct?
5320       if (master->th.th_teams_size.nteams > 1) {
5321         ++level; // level was not increased in teams construct for
5322         // team_of_masters
5323       }
5324       if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5325           master->th.th_teams_level == team->t.t_level) {
5326         ++level; // level was not increased in teams construct for
5327         // team_of_workers before the parallel
5328       } // team->t.t_level will be increased inside parallel
5329     }
5330     hot_teams = master->th.th_hot_teams;
5331     if (level < __kmp_hot_teams_max_level) {
5332       KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5333       use_hot_team = 1;
5334     }
5335   }
5336 #endif // KMP_NESTED_HOT_TEAMS
5337 
5338   /* team is done working */
5339   TCW_SYNC_PTR(team->t.t_pkfn,
5340                NULL); // Important for Debugging Support Library.
5341   team->t.t_copyin_counter = 0; // init counter for possible reuse
5342   // Do not reset pointer to parent team to NULL for hot teams.
5343 
5344   /* if we are non-hot team, release our threads */
5345   if (!use_hot_team) {
5346     if (__kmp_tasking_mode != tskm_immediate_exec) {
5347       // Wait for threads to reach reapable state
5348       for (f = 1; f < team->t.t_nproc; ++f) {
5349         KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5350         kmp_info_t *th = team->t.t_threads[f];
5351         volatile kmp_uint32 *state = &th->th.th_reap_state;
5352         while (*state != KMP_SAFE_TO_REAP) {
5353 #if KMP_OS_WINDOWS
5354           // On Windows a thread can be killed at any time, check this
5355           DWORD ecode;
5356           if (!__kmp_is_thread_alive(th, &ecode)) {
5357             *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5358             break;
5359           }
5360 #endif
5361           // first check if thread is sleeping
5362           kmp_flag_64 fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5363           if (fl.is_sleeping())
5364             fl.resume(__kmp_gtid_from_thread(th));
5365           KMP_CPU_PAUSE();
5366         }
5367       }
5368 
5369       // Delete task teams
5370       int tt_idx;
5371       for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5372         kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5373         if (task_team != NULL) {
5374           for (f = 0; f < team->t.t_nproc;
5375                ++f) { // Have all threads unref task teams
5376             team->t.t_threads[f]->th.th_task_team = NULL;
5377           }
5378           KA_TRACE(
5379               20,
5380               ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5381                __kmp_get_gtid(), task_team, team->t.t_id));
5382 #if KMP_NESTED_HOT_TEAMS
5383           __kmp_free_task_team(master, task_team);
5384 #endif
5385           team->t.t_task_team[tt_idx] = NULL;
5386         }
5387       }
5388     }
5389 
5390     // Reset pointer to parent team only for non-hot teams.
5391     team->t.t_parent = NULL;
5392     team->t.t_level = 0;
5393     team->t.t_active_level = 0;
5394 
5395     /* free the worker threads */
5396     for (f = 1; f < team->t.t_nproc; ++f) {
5397       KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5398       __kmp_free_thread(team->t.t_threads[f]);
5399       team->t.t_threads[f] = NULL;
5400     }
5401 
5402     /* put the team back in the team pool */
5403     /* TODO limit size of team pool, call reap_team if pool too large */
5404     team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5405     __kmp_team_pool = (volatile kmp_team_t *)team;
5406   }
5407 
5408   KMP_MB();
5409 }
5410 
5411 /* reap the team.  destroy it, reclaim all its resources and free its memory */
5412 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5413   kmp_team_t *next_pool = team->t.t_next_pool;
5414 
5415   KMP_DEBUG_ASSERT(team);
5416   KMP_DEBUG_ASSERT(team->t.t_dispatch);
5417   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5418   KMP_DEBUG_ASSERT(team->t.t_threads);
5419   KMP_DEBUG_ASSERT(team->t.t_argv);
5420 
5421   /* TODO clean the threads that are a part of this? */
5422 
5423   /* free stuff */
5424   __kmp_free_team_arrays(team);
5425   if (team->t.t_argv != &team->t.t_inline_argv[0])
5426     __kmp_free((void *)team->t.t_argv);
5427   __kmp_free(team);
5428 
5429   KMP_MB();
5430   return next_pool;
5431 }
5432 
5433 // Free the thread.  Don't reap it, just place it on the pool of available
5434 // threads.
5435 //
5436 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5437 // binding for the affinity mechanism to be useful.
5438 //
5439 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5440 // However, we want to avoid a potential performance problem by always
5441 // scanning through the list to find the correct point at which to insert
5442 // the thread (potential N**2 behavior).  To do this we keep track of the
5443 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5444 // With single-level parallelism, threads will always be added to the tail
5445 // of the list, kept track of by __kmp_thread_pool_insert_pt.  With nested
5446 // parallelism, all bets are off and we may need to scan through the entire
5447 // free list.
5448 //
// This change also has a potentially large performance benefit for some
// applications.  Previously, as threads were freed from the hot team, they
// would be placed back on the free list in inverse order.  If the hot team
// grew back to its original size, then the freed threads would be placed
// back on the hot team in reverse order.  This could cause bad cache
// locality problems on programs where the size of the hot team regularly
// grew and shrank.
5456 //
// Now, for single-level parallelism, the OMP tid is always == gtid.
5458 void __kmp_free_thread(kmp_info_t *this_th) {
5459   int gtid;
5460   kmp_info_t **scan;
5461   kmp_root_t *root = this_th->th.th_root;
5462 
5463   KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5464                 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5465 
5466   KMP_DEBUG_ASSERT(this_th);
5467 
  // When moving a thread to the pool, switch it to wait on its own b_go flag,
  // and leave its barrier team pointers uninitialized (NULL team).
5470   int b;
5471   kmp_balign_t *balign = this_th->th.th_bar;
5472   for (b = 0; b < bs_last_barrier; ++b) {
5473     if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5474       balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5475     balign[b].bb.team = NULL;
5476     balign[b].bb.leaf_kids = 0;
5477   }
5478   this_th->th.th_task_state = 0;
5479 
5480   /* put thread back on the free pool */
5481   TCW_PTR(this_th->th.th_team, NULL);
5482   TCW_PTR(this_th->th.th_root, NULL);
5483   TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5484 
  /* If the implicit task assigned to this thread can be used by other threads,
   * then multiple threads can share the data and try to free the task at
   * __kmp_reap_thread at exit. This duplicate use of the task data can happen
   * with higher probability when the hot team is disabled, but it can occur
   * even when the hot team is enabled. */
5490   __kmp_free_implicit_task(this_th);
5491   this_th->th.th_current_task = NULL;
5492 
5493   // If the __kmp_thread_pool_insert_pt is already past the new insert
5494   // point, then we need to re-scan the entire list.
5495   gtid = this_th->th.th_info.ds.ds_gtid;
5496   if (__kmp_thread_pool_insert_pt != NULL) {
5497     KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5498     if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5499       __kmp_thread_pool_insert_pt = NULL;
5500     }
5501   }
5502 
5503   // Scan down the list to find the place to insert the thread.
5504   // scan is the address of a link in the list, possibly the address of
5505   // __kmp_thread_pool itself.
5506   //
  // In the absence of nested parallelism, the for loop will have 0
  // iterations.
5508   if (__kmp_thread_pool_insert_pt != NULL) {
5509     scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5510   } else {
5511     scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5512   }
5513   for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5514        scan = &((*scan)->th.th_next_pool))
5515     ;
5516 
5517   // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5518   // to its address.
5519   TCW_PTR(this_th->th.th_next_pool, *scan);
5520   __kmp_thread_pool_insert_pt = *scan = this_th;
5521   KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5522                    (this_th->th.th_info.ds.ds_gtid <
5523                     this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5524   TCW_4(this_th->th.th_in_pool, TRUE);
5525   __kmp_thread_pool_nth++;
5526 
5527   TCW_4(__kmp_nth, __kmp_nth - 1);
5528   root->r.r_cg_nthreads--;
5529 
5530 #ifdef KMP_ADJUST_BLOCKTIME
5531   /* Adjust blocktime back to user setting or default if necessary */
5532   /* Middle initialization might never have occurred                */
5533   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5534     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5535     if (__kmp_nth <= __kmp_avail_proc) {
5536       __kmp_zero_bt = FALSE;
5537     }
5538   }
5539 #endif /* KMP_ADJUST_BLOCKTIME */
5540 
5541   KMP_MB();
5542 }
5543 
5544 /* ------------------------------------------------------------------------ */
5545 
5546 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5547   int gtid = this_thr->th.th_info.ds.ds_gtid;
5548   /*    void                 *stack_data;*/
5549   kmp_team_t *(*volatile pteam);
5550 
5551   KMP_MB();
5552   KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5553 
5554   if (__kmp_env_consistency_check) {
5555     this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5556   }
5557 
5558 #if OMPT_SUPPORT
5559   ompt_data_t *thread_data;
5560   if (ompt_enabled.enabled) {
5561     thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5562     thread_data->ptr = NULL;
5563 
5564     this_thr->th.ompt_thread_info.state = omp_state_overhead;
5565     this_thr->th.ompt_thread_info.wait_id = 0;
5566     this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5567     if (ompt_enabled.ompt_callback_thread_begin) {
5568       ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5569           ompt_thread_worker, thread_data);
5570     }
5571   }
5572 #endif
5573 
5574 #if OMPT_SUPPORT
5575   if (ompt_enabled.enabled) {
5576     this_thr->th.ompt_thread_info.state = omp_state_idle;
5577   }
5578 #endif
5579   /* This is the place where threads wait for work */
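       /* Worker lifecycle, a sketch of the loop below:
          - wait in the fork barrier until a master releases this thread into a
            team (or until g_done is set at shutdown);
          - if a team with a microtask was assigned, invoke it;
          - wait in the join barrier so the master can observe completion. */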
5580   while (!TCR_4(__kmp_global.g.g_done)) {
5581     KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5582     KMP_MB();
5583 
5584     /* wait for work to do */
5585     KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5586 
5587     /* No tid yet since not part of a team */
5588     __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5589 
5590 #if OMPT_SUPPORT
5591     if (ompt_enabled.enabled) {
5592       this_thr->th.ompt_thread_info.state = omp_state_overhead;
5593     }
5594 #endif
5595 
5596     pteam = (kmp_team_t * (*))(&this_thr->th.th_team);
5597 
5598     /* have we been allocated? */
5599     if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5600       /* we were just woken up, so run our new task */
5601       if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5602         int rc;
5603         KA_TRACE(20,
5604                  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5605                   gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5606                   (*pteam)->t.t_pkfn));
5607 
5608         updateHWFPControl(*pteam);
5609 
5610 #if OMPT_SUPPORT
5611         if (ompt_enabled.enabled) {
5612           this_thr->th.ompt_thread_info.state = omp_state_work_parallel;
5613         }
5614 #endif
5615 
5616         {
5617           KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
5618           KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
5619           rc = (*pteam)->t.t_invoke(gtid);
5620         }
5621         KMP_ASSERT(rc);
5622 
5623         KMP_MB();
5624         KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5625                       gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5626                       (*pteam)->t.t_pkfn));
5627       }
5628 #if OMPT_SUPPORT
5629       if (ompt_enabled.enabled) {
5630         /* no frame set while outside task */
5631         __ompt_get_task_info_object(0)->frame.exit_frame = NULL;
5632 
5633         this_thr->th.ompt_thread_info.state = omp_state_overhead;
5634       }
5635 #endif
5636       /* join barrier after parallel region */
5637       __kmp_join_barrier(gtid);
5638     }
5639   }
5640   TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5641 
5642 #if OMPT_SUPPORT
5643   if (ompt_enabled.ompt_callback_thread_end) {
5644     ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5645   }
5646 #endif
5647 
5648   this_thr->th.th_task_team = NULL;
5649   /* run the destructors for the threadprivate data for this thread */
5650   __kmp_common_destroy_gtid(gtid);
5651 
5652   KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5653   KMP_MB();
5654   return this_thr;
5655 }
5656 
5657 /* ------------------------------------------------------------------------ */
5658 
5659 void __kmp_internal_end_dest(void *specific_gtid) {
5660 #if KMP_COMPILER_ICC
5661 #pragma warning(push)
5662 #pragma warning(disable : 810) // conversion from "void *" to "int" may lose
5663 // significant bits
5664 #endif
5665   // Make sure no significant bits are lost
5666   int gtid = (kmp_intptr_t)specific_gtid - 1;
5667 #if KMP_COMPILER_ICC
5668 #pragma warning(pop)
5669 #endif
5670 
5671   KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5672   /* NOTE: the gtid is stored as gtid+1 in the thread-local storage;
5673    * this is because 0 is reserved for the nothing-stored case */
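       /* For example, a stored value of 1 decodes to gtid 0 above, while a
        * stored value of 0 means no gtid was ever stored for this thread. */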
5674 
5675   /* josh: One reason for setting the gtid specific data even when it is being
5676      destroyed by pthread is to allow gtid lookup through thread specific data
5677      (__kmp_gtid_get_specific).  Some of the code, especially stat code,
5678      that gets executed in the call to __kmp_internal_end_thread, actually
5679      gets the gtid through the thread specific data.  Setting it here seems
5680      rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread
5681      to run smoothly.
5682      todo: get rid of this after we remove the dependence on
5683      __kmp_gtid_get_specific  */
5684   if (gtid >= 0 && KMP_UBER_GTID(gtid))
5685     __kmp_gtid_set_specific(gtid);
5686 #ifdef KMP_TDATA_GTID
5687   __kmp_gtid = gtid;
5688 #endif
5689   __kmp_internal_end_thread(gtid);
5690 }
5691 
5692 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5693 
5694 // 2009-09-08 (lev): It looks like the destructor does not work. In simple test
5695 // cases destructors work perfectly, but in real libomp.so I have no evidence it
5696 // is ever called. However, the -fini linker option in makefile.mk works fine.
5697 
5698 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5699   __kmp_internal_end_atexit();
5700 }
5701 
5702 void __kmp_internal_end_fini(void) { __kmp_internal_end_atexit(); }
5703 
5704 #endif
5705 
5706 /* [Windows] josh: when the atexit handler is called, there may still be more
5707    than one thread alive */
5708 void __kmp_internal_end_atexit(void) {
5709   KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5710   /* [Windows]
5711      josh: ideally, we want to completely shut down the library in this atexit
5712      handler, but stat code that depends on thread specific data for gtid fails
5713      because that data becomes unavailable at some point during the shutdown, so
5714      we call __kmp_internal_end_thread instead. We should eventually remove the
5715      dependency on __kmp_get_specific_gtid in the stat code and use
5716      __kmp_internal_end_library to cleanly shut down the library.
5717 
5718      // TODO: Can some of this comment about GVS be removed?
5719      I suspect that the offending stat code is executed when the calling thread
5720      tries to clean up a dead root thread's data structures, resulting in GVS
5721      code trying to close the GVS structures for that thread, but since the stat
5722      code uses __kmp_get_specific_gtid to get the gtid with the assumption that
5723      the calling thread is cleaning up itself instead of another thread, it gets
5724      confused. This happens because allowing a thread to unregister and clean up
5725      another thread is a recent modification for addressing an issue.
5726      Based on the current design (20050722), a thread may end up
5727      trying to unregister another thread only if thread death does not trigger
5728      the calling of __kmp_internal_end_thread.  For Linux* OS, there is the
5729      thread specific data destructor function to detect thread death. For
5730      Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
5731      is nothing.  Thus, the workaround is applicable only for Windows static
5732      stat library. */
5733   __kmp_internal_end_library(-1);
5734 #if KMP_OS_WINDOWS
5735   __kmp_close_console();
5736 #endif
5737 }
5738 
5739 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5740   // It is assumed __kmp_forkjoin_lock is acquired.
5741 
5742   int gtid;
5743 
5744   KMP_DEBUG_ASSERT(thread != NULL);
5745 
5746   gtid = thread->th.th_info.ds.ds_gtid;
5747 
5748   if (!is_root) {
5749 
5750     if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5751       /* Assume the threads are at the fork barrier here */
5752       KA_TRACE(
5753           20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5754                gtid));
5755       /* Need release fence here to prevent seg faults for tree forkjoin barrier
5756        * (GEH) */
5757       ANNOTATE_HAPPENS_BEFORE(thread);
5758       kmp_flag_64 flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
5759       __kmp_release_64(&flag);
5760     }
5761 
5762     // Terminate OS thread.
5763     __kmp_reap_worker(thread);
5764 
5765     // The thread was killed asynchronously.  If it was actively
5766     // spinning in the thread pool, decrement the global count.
5767     //
5768     // There is a small timing hole here - if the worker thread was just waking
5769     // up after sleeping in the pool, had reset its th_active_in_pool flag but
5770     // not decremented the global counter __kmp_thread_pool_active_nth yet, then
5771     // the global counter might not get updated.
5772     //
5773     // Currently, this can only happen as the library is unloaded,
5774     // so there are no harmful side effects.
5775     if (thread->th.th_active_in_pool) {
5776       thread->th.th_active_in_pool = FALSE;
5777       KMP_TEST_THEN_DEC32(&__kmp_thread_pool_active_nth);
5778       KMP_DEBUG_ASSERT(TCR_4(__kmp_thread_pool_active_nth) >= 0);
5779     }
5780 
5781     // Decrement # of [worker] threads in the pool.
5782     KMP_DEBUG_ASSERT(__kmp_thread_pool_nth > 0);
5783     --__kmp_thread_pool_nth;
5784   }
5785 
5786   __kmp_free_implicit_task(thread);
5787 
5788 // Free the fast memory for tasking
5789 #if USE_FAST_MEMORY
5790   __kmp_free_fast_memory(thread);
5791 #endif /* USE_FAST_MEMORY */
5792 
5793   __kmp_suspend_uninitialize_thread(thread);
5794 
5795   KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
5796   TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5797 
5798   --__kmp_all_nth;
5799 // __kmp_nth was decremented when thread is added to the pool.
5800 
5801 #ifdef KMP_ADJUST_BLOCKTIME
5802   /* Adjust blocktime back to user setting or default if necessary */
5803   /* Middle initialization might never have occurred                */
5804   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5805     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5806     if (__kmp_nth <= __kmp_avail_proc) {
5807       __kmp_zero_bt = FALSE;
5808     }
5809   }
5810 #endif /* KMP_ADJUST_BLOCKTIME */
5811 
5812   /* free the memory being used */
5813   if (__kmp_env_consistency_check) {
5814     if (thread->th.th_cons) {
5815       __kmp_free_cons_stack(thread->th.th_cons);
5816       thread->th.th_cons = NULL;
5817     }
5818   }
5819 
5820   if (thread->th.th_pri_common != NULL) {
5821     __kmp_free(thread->th.th_pri_common);
5822     thread->th.th_pri_common = NULL;
5823   }
5824 
5825   if (thread->th.th_task_state_memo_stack != NULL) {
5826     __kmp_free(thread->th.th_task_state_memo_stack);
5827     thread->th.th_task_state_memo_stack = NULL;
5828   }
5829 
5830 #if KMP_USE_BGET
5831   if (thread->th.th_local.bget_data != NULL) {
5832     __kmp_finalize_bget(thread);
5833   }
5834 #endif
5835 
5836 #if KMP_AFFINITY_SUPPORTED
5837   if (thread->th.th_affin_mask != NULL) {
5838     KMP_CPU_FREE(thread->th.th_affin_mask);
5839     thread->th.th_affin_mask = NULL;
5840   }
5841 #endif /* KMP_AFFINITY_SUPPORTED */
5842 
5843   __kmp_reap_team(thread->th.th_serial_team);
5844   thread->th.th_serial_team = NULL;
5845   __kmp_free(thread);
5846 
5847   KMP_MB();
5848 
5849 } // __kmp_reap_thread
5850 
5851 static void __kmp_internal_end(void) {
5852   int i;
5853 
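       /* Shutdown order, summarizing the code below: unregister the library env
          var; on Windows, reclaim dead roots; set g_done; then, if no root is
          still active, reap the worker pool, the team pool and the task teams,
          and finally call __kmp_cleanup(). */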
5854   /* First, unregister the library */
5855   __kmp_unregister_library();
5856 
5857 #if KMP_OS_WINDOWS
5858   /* In Win static library, we can't tell when a root actually dies, so we
5859      reclaim the data structures for any root threads that have died but not
5860      unregistered themselves, in order to shut down cleanly.
5861      In Win dynamic library we also can't tell when a thread dies.  */
5862   __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
5863 // dead roots
5864 #endif
5865 
5866   for (i = 0; i < __kmp_threads_capacity; i++)
5867     if (__kmp_root[i])
5868       if (__kmp_root[i]->r.r_active)
5869         break;
5870   KMP_MB(); /* Flush all pending memory write invalidates.  */
5871   TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
5872 
5873   if (i < __kmp_threads_capacity) {
5874 #if KMP_USE_MONITOR
5875     // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
5876     KMP_MB(); /* Flush all pending memory write invalidates.  */
5877 
5878     // Need to check that monitor was initialized before reaping it. If we are
5879     // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
5880     // __kmp_monitor will appear to contain valid data, but it is only valid in
5881     // the parent process, not the child.
5882     // New behavior (201008): instead of keying off of the flag
5883     // __kmp_init_parallel, the monitor thread creation is keyed off
5884     // of the new flag __kmp_init_monitor.
5885     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
5886     if (TCR_4(__kmp_init_monitor)) {
5887       __kmp_reap_monitor(&__kmp_monitor);
5888       TCW_4(__kmp_init_monitor, 0);
5889     }
5890     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
5891     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
5892 #endif // KMP_USE_MONITOR
5893   } else {
5894 /* TODO move this to cleanup code */
5895 #ifdef KMP_DEBUG
5896     /* make sure that everything has properly ended */
5897     for (i = 0; i < __kmp_threads_capacity; i++) {
5898       if (__kmp_root[i]) {
5899         // KMP_ASSERT(!KMP_UBER_GTID(i)); // AC: there can be uber threads
5900         // alive here
5901         KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
5902       }
5903     }
5904 #endif
5905 
5906     KMP_MB();
5907 
5908     // Reap the worker threads.
5909     // This is valid for now, but be careful if threads are reaped sooner.
5910     while (__kmp_thread_pool != NULL) { // Loop through all threads in the pool.
5911       // Get the next thread from the pool.
5912       kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
5913       __kmp_thread_pool = thread->th.th_next_pool;
5914       // Reap it.
5915       KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
5916       thread->th.th_next_pool = NULL;
5917       thread->th.th_in_pool = FALSE;
5918       __kmp_reap_thread(thread, 0);
5919     }
5920     __kmp_thread_pool_insert_pt = NULL;
5921 
5922     // Reap teams.
5923     while (__kmp_team_pool != NULL) { // Loop through all teams in the pool.
5924       // Get the next team from the pool.
5925       kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
5926       __kmp_team_pool = team->t.t_next_pool;
5927       // Reap it.
5928       team->t.t_next_pool = NULL;
5929       __kmp_reap_team(team);
5930     }
5931 
5932     __kmp_reap_task_teams();
5933 
5934     for (i = 0; i < __kmp_threads_capacity; ++i) {
5935       // TBD: Add some checking...
5936       // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
5937     }
5938 
5939     /* Make sure all threadprivate destructors get run by joining with all
5940        worker threads before resetting this flag */
5941     TCW_SYNC_4(__kmp_init_common, FALSE);
5942 
5943     KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
5944     KMP_MB();
5945 
5946 #if KMP_USE_MONITOR
5947     // See note above: One of the possible fixes for CQ138434 / CQ140126
5948     //
5949     // FIXME: push both code fragments down and CSE them?
5950     // push them into __kmp_cleanup() ?
5951     __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
5952     if (TCR_4(__kmp_init_monitor)) {
5953       __kmp_reap_monitor(&__kmp_monitor);
5954       TCW_4(__kmp_init_monitor, 0);
5955     }
5956     __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
5957     KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
5958 #endif
5959   } /* else !__kmp_global.t_active */
5960   TCW_4(__kmp_init_gtid, FALSE);
5961   KMP_MB(); /* Flush all pending memory write invalidates.  */
5962 
5963   __kmp_cleanup();
5964 #if OMPT_SUPPORT
5965   ompt_fini();
5966 #endif
5967 }
5968 
5969 void __kmp_internal_end_library(int gtid_req) {
5970   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
5971   /* this shouldn't be a race condition because __kmp_internal_end() is the
5972      only place to clear __kmp_serial_init */
5973   /* we'll check this later too, after we get the lock */
5974   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
5975   // redundant, because the next check will work in any case.
5976   if (__kmp_global.g.g_abort) {
5977     KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
5978     /* TODO abort? */
5979     return;
5980   }
5981   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
5982     KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
5983     return;
5984   }
5985 
5986   KMP_MB(); /* Flush all pending memory write invalidates.  */
5987 
5988   /* find out who we are and what we should do */
5989   {
5990     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
5991     KA_TRACE(
5992         10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req));
5993     if (gtid == KMP_GTID_SHUTDOWN) {
5994       KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
5995                     "already shutdown\n"));
5996       return;
5997     } else if (gtid == KMP_GTID_MONITOR) {
5998       KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
5999                     "registered, or system shutdown\n"));
6000       return;
6001     } else if (gtid == KMP_GTID_DNE) {
6002       KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6003                     "shutdown\n"));
6004       /* we don't know who we are, but we may still shut down the library */
6005     } else if (KMP_UBER_GTID(gtid)) {
6006       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6007       if (__kmp_root[gtid]->r.r_active) {
6008         __kmp_global.g.g_abort = -1;
6009         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6010         KA_TRACE(10,
6011                  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6012                   gtid));
6013         return;
6014       } else {
6015         KA_TRACE(
6016             10,
6017             ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6018         __kmp_unregister_root_current_thread(gtid);
6019       }
6020     } else {
6021 /* worker threads may call this function through the atexit handler, if they
6022  * call exit() */
6023 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6024    TODO: do a thorough shutdown instead */
6025 #ifdef DUMP_DEBUG_ON_EXIT
6026       if (__kmp_debug_buf)
6027         __kmp_dump_debug_buffer();
6028 #endif
6029       return;
6030     }
6031   }
6032   /* synchronize the termination process */
6033   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6034 
6035   /* have we already finished */
6036   if (__kmp_global.g.g_abort) {
6037     KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6038     /* TODO abort? */
6039     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6040     return;
6041   }
6042   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6043     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6044     return;
6045   }
6046 
6047   /* We need this lock to enforce mutual exclusion between this reading of
6048      __kmp_threads_capacity and the writing by __kmp_register_root.
6049      Alternatively, we can use a counter of roots that is atomically updated by
6050      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6051      __kmp_internal_end_*.  */
6052   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6053 
6054   /* now we can safely conduct the actual termination */
6055   __kmp_internal_end();
6056 
6057   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6058   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6059 
6060   KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6061 
6062 #ifdef DUMP_DEBUG_ON_EXIT
6063   if (__kmp_debug_buf)
6064     __kmp_dump_debug_buffer();
6065 #endif
6066 
6067 #if KMP_OS_WINDOWS
6068   __kmp_close_console();
6069 #endif
6070 
6071   __kmp_fini_allocator();
6072 
6073 } // __kmp_internal_end_library
6074 
6075 void __kmp_internal_end_thread(int gtid_req) {
6076   int i;
6077 
6078   /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6079   /* this shouldn't be a race condition because __kmp_internal_end() is the
6080    * only place to clear __kmp_serial_init */
6081   /* we'll check this later too, after we get the lock */
6082   // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6083   // redundant, because the next check will work in any case.
6084   if (__kmp_global.g.g_abort) {
6085     KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6086     /* TODO abort? */
6087     return;
6088   }
6089   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6090     KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6091     return;
6092   }
6093 
6094   KMP_MB(); /* Flush all pending memory write invalidates.  */
6095 
6096   /* find out who we are and what we should do */
6097   {
6098     int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6099     KA_TRACE(10,
6100              ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req));
6101     if (gtid == KMP_GTID_SHUTDOWN) {
6102       KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6103                     "already shutdown\n"));
6104       return;
6105     } else if (gtid == KMP_GTID_MONITOR) {
6106       KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6107                     "registered, or system shutdown\n"));
6108       return;
6109     } else if (gtid == KMP_GTID_DNE) {
6110       KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6111                     "shutdown\n"));
6112       return;
6113       /* we don't know who we are */
6114     } else if (KMP_UBER_GTID(gtid)) {
6115       /* unregister ourselves as an uber thread.  gtid is no longer valid */
6116       if (__kmp_root[gtid]->r.r_active) {
6117         __kmp_global.g.g_abort = -1;
6118         TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6119         KA_TRACE(10,
6120                  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6121                   gtid));
6122         return;
6123       } else {
6124         KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6125                       gtid));
6126         __kmp_unregister_root_current_thread(gtid);
6127       }
6128     } else {
6129       /* just a worker thread, let's leave */
6130       KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6131 
6132       if (gtid >= 0) {
6133         __kmp_threads[gtid]->th.th_task_team = NULL;
6134       }
6135 
6136       KA_TRACE(10,
6137                ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6138                 gtid));
6139       return;
6140     }
6141   }
6142 #if defined KMP_DYNAMIC_LIB
6143   // AC: let's not shut down the Linux* OS dynamic library at the exit of an
6144   // uber thread, because it is better to shut down later, in the library
6145   // destructor. The reason for this change is a performance problem seen when a
6146   // non-OpenMP thread in a loop forks and joins many OpenMP threads. We can save
6147   // a lot of time by keeping worker threads alive until program shutdown.
6148   // OM: Removed Linux* OS restriction to fix the crash on OS X* (DPD200239966)
6149   // and Windows(DPD200287443) that occurs when using critical sections from
6150   // foreign threads.
6151   KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6152   return;
6153 #endif
6154   /* synchronize the termination process */
6155   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6156 
6157   /* have we already finished */
6158   if (__kmp_global.g.g_abort) {
6159     KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6160     /* TODO abort? */
6161     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6162     return;
6163   }
6164   if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6165     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6166     return;
6167   }
6168 
6169   /* We need this lock to enforce mutual exclusion between this reading of
6170      __kmp_threads_capacity and the writing by __kmp_register_root.
6171      Alternatively, we can use a counter of roots that is atomically updated by
6172      __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6173      __kmp_internal_end_*.  */
6174 
6175   /* should we finish the run-time?  are all siblings done? */
6176   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6177 
6178   for (i = 0; i < __kmp_threads_capacity; ++i) {
6179     if (KMP_UBER_GTID(i)) {
6180       KA_TRACE(
6181           10,
6182           ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6183       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6184       __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6185       return;
6186     }
6187   }
6188 
6189   /* now we can safely conduct the actual termination */
6190 
6191   __kmp_internal_end();
6192 
6193   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6194   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6195 
6196   KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6197 
6198 #ifdef DUMP_DEBUG_ON_EXIT
6199   if (__kmp_debug_buf)
6200     __kmp_dump_debug_buffer();
6201 #endif
6202 } // __kmp_internal_end_thread
6203 
6204 // -----------------------------------------------------------------------------
6205 // Library registration stuff.
6206 
6207 static long __kmp_registration_flag = 0;
6208 // Random value used to indicate library initialization.
6209 static char *__kmp_registration_str = NULL;
6210 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6211 
6212 static inline char *__kmp_reg_status_name() {
6213   /* On RHEL 3u5, if linked statically, getpid() returns different values in
6214      each thread. If registration and unregistration happen in different threads
6215      (omp_misc_other_root_exit.cpp test case), the registered_lib_env
6216      env var cannot be found, because its name will contain a different pid. */
6217   return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6218 } // __kmp_reg_status_name
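     // For example, a process whose (hypothetical) pid is 12345 would use the
     // environment variable name "__KMP_REGISTERED_LIB_12345".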
6219 
6220 void __kmp_register_library_startup(void) {
6221 
6222   char *name = __kmp_reg_status_name(); // Name of the environment variable.
6223   int done = 0;
6224   union {
6225     double dtime;
6226     long ltime;
6227   } time;
6228 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6229   __kmp_initialize_system_tick();
6230 #endif
6231   __kmp_read_system_time(&time.dtime);
6232   __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6233   __kmp_registration_str =
6234       __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6235                        __kmp_registration_flag, KMP_LIBRARY_FILE);
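       // The resulting name=value pair encodes "<address of
       // __kmp_registration_flag>-<flag value in hex>-<library file name>"; with
       // hypothetical values it might look like
       //   __KMP_REGISTERED_LIB_12345=0x7f0a1c2b3c40-cafe0042-libomp.so
       // The parsing code below splits the value on '-' to recover these parts.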
6236 
6237   KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6238                 __kmp_registration_str));
6239 
6240   while (!done) {
6241 
6242     char *value = NULL; // Actual value of the environment variable.
6243 
6244     // Set the environment variable, but do not overwrite it if it already exists.
6245     __kmp_env_set(name, __kmp_registration_str, 0);
6246     // Check that the variable was actually written.
6247     value = __kmp_env_get(name);
6248     if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6249 
6250       done = 1; // Ok, environment variable set successfully, exit the loop.
6251 
6252     } else {
6253 
6254       // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6255       // Check whether it is alive or dead.
6256       int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6257       char *tail = value;
6258       char *flag_addr_str = NULL;
6259       char *flag_val_str = NULL;
6260       char const *file_name = NULL;
6261       __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6262       __kmp_str_split(tail, '-', &flag_val_str, &tail);
6263       file_name = tail;
6264       if (tail != NULL) {
6265         long *flag_addr = 0;
6266         long flag_val = 0;
6267         KMP_SSCANF(flag_addr_str, "%p", &flag_addr);
6268         KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6269         if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6270           // First, check whether environment-encoded address is mapped into
6271           // addr space.
6272           // If so, dereference it to see if it still has the right value.
6273           if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6274             neighbor = 1;
6275           } else {
6276             // If not, then we know the other copy of the library is no longer
6277             // running.
6278             neighbor = 2;
6279           }
6280         }
6281       }
6282       switch (neighbor) {
6283       case 0: // Cannot parse environment variable -- neighbor status unknown.
6284         // Assume it is the incompatible format of a future version of the
6285         // library, and assume the other library is alive.
6286         // WARN( ... ); // TODO: Issue a warning.
6287         file_name = "unknown library";
6288       // Attention! Falling through to the next case is intentional.
6289       case 1: { // Neighbor is alive.
6290         // Check whether duplicates are allowed.
6291         char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6292         if (!__kmp_str_match_true(duplicate_ok)) {
6293           // That's not allowed. Issue fatal error.
6294           __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6295                       KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6296         }
6297         KMP_INTERNAL_FREE(duplicate_ok);
6298         __kmp_duplicate_library_ok = 1;
6299         done = 1; // Exit the loop.
6300       } break;
6301       case 2: { // Neighbor is dead.
6302         // Clear the variable and try to register library again.
6303         __kmp_env_unset(name);
6304       } break;
6305       default: { KMP_DEBUG_ASSERT(0); } break;
6306       }
6307     }
6308     KMP_INTERNAL_FREE((void *)value);
6309   }
6310   KMP_INTERNAL_FREE((void *)name);
6311 
6312 } // func __kmp_register_library_startup
6313 
6314 void __kmp_unregister_library(void) {
6315 
6316   char *name = __kmp_reg_status_name();
6317   char *value = __kmp_env_get(name);
6318 
6319   KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6320   KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6321   if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6322     // Ok, this is our variable. Delete it.
6323     __kmp_env_unset(name);
6324   }
6325 
6326   KMP_INTERNAL_FREE(__kmp_registration_str);
6327   KMP_INTERNAL_FREE(value);
6328   KMP_INTERNAL_FREE(name);
6329 
6330   __kmp_registration_flag = 0;
6331   __kmp_registration_str = NULL;
6332 
6333 } // __kmp_unregister_library
6334 
6335 // End of Library registration stuff.
6336 // -----------------------------------------------------------------------------
6337 
6338 #if KMP_MIC_SUPPORTED
6339 
6340 static void __kmp_check_mic_type() {
6341   kmp_cpuid_t cpuid_state = {0};
6342   kmp_cpuid_t *cs_p = &cpuid_state;
6343   __kmp_x86_cpuid(1, 0, cs_p);
6344   // We don't support mic1 at the moment
6345   if ((cs_p->eax & 0xff0) == 0xB10) {
6346     __kmp_mic_type = mic2;
6347   } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6348     __kmp_mic_type = mic3;
6349   } else {
6350     __kmp_mic_type = non_mic;
6351   }
6352 }
6353 
6354 #endif /* KMP_MIC_SUPPORTED */
6355 
6356 static void __kmp_do_serial_initialize(void) {
6357   int i, gtid;
6358   int size;
6359 
6360   KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6361 
6362   KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6363   KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6364   KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6365   KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6366   KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6367 
6368 #if OMPT_SUPPORT
6369   ompt_pre_init();
6370 #endif
6371 
6372   __kmp_validate_locks();
6373 
6374   /* Initialize internal memory allocator */
6375   __kmp_init_allocator();
6376 
6377   /* Register the library startup via an environment variable and check to see
6378      whether another copy of the library is already registered. */
6379 
6380   __kmp_register_library_startup();
6381 
6382   /* TODO reinitialization of library */
6383   if (TCR_4(__kmp_global.g.g_done)) {
6384     KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6385   }
6386 
6387   __kmp_global.g.g_abort = 0;
6388   TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6389 
6390 /* initialize the locks */
6391 #if KMP_USE_ADAPTIVE_LOCKS
6392 #if KMP_DEBUG_ADAPTIVE_LOCKS
6393   __kmp_init_speculative_stats();
6394 #endif
6395 #endif
6396 #if KMP_STATS_ENABLED
6397   __kmp_stats_init();
6398 #endif
6399   __kmp_init_lock(&__kmp_global_lock);
6400   __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6401   __kmp_init_lock(&__kmp_debug_lock);
6402   __kmp_init_atomic_lock(&__kmp_atomic_lock);
6403   __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6404   __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6405   __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6406   __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6407   __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6408   __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6409   __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6410   __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6411   __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6412   __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6413   __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6414   __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6415   __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6416   __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6417 #if KMP_USE_MONITOR
6418   __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6419 #endif
6420   __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6421 
6422   /* conduct initialization and initial setup of configuration */
6423 
6424   __kmp_runtime_initialize();
6425 
6426 #if KMP_MIC_SUPPORTED
6427   __kmp_check_mic_type();
6428 #endif
6429 
6430 // Some global variable initialization moved here from kmp_env_initialize()
6431 #ifdef KMP_DEBUG
6432   kmp_diag = 0;
6433 #endif
6434   __kmp_abort_delay = 0;
6435 
6436   // From __kmp_init_dflt_team_nth()
6437   /* assume the entire machine will be used */
6438   __kmp_dflt_team_nth_ub = __kmp_xproc;
6439   if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6440     __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6441   }
6442   if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6443     __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6444   }
6445   __kmp_max_nth = __kmp_sys_max_nth;
6446   __kmp_cg_max_nth = __kmp_sys_max_nth;
6447   __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6448   if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6449     __kmp_teams_max_nth = __kmp_sys_max_nth;
6450   }
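       // For example (hypothetical machine): with __kmp_xproc == 64 and a large
       // __kmp_sys_max_nth, __kmp_dflt_team_nth_ub and __kmp_teams_max_nth both
       // end up as 64, while __kmp_max_nth and __kmp_cg_max_nth keep the system
       // maximum.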
6451 
6452   // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6453   // part
6454   __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6455 #if KMP_USE_MONITOR
6456   __kmp_monitor_wakeups =
6457       KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6458   __kmp_bt_intervals =
6459       KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6460 #endif
6461   // From "KMP_LIBRARY" part of __kmp_env_initialize()
6462   __kmp_library = library_throughput;
6463   // From KMP_SCHEDULE initialization
6464   __kmp_static = kmp_sch_static_balanced;
6465 // AC: do not use analytical here, because it is non-monotonic
6466 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6467 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6468 // need to repeat assignment
6469 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6470 // bit control and barrier method control parts
6471 #if KMP_FAST_REDUCTION_BARRIER
6472 #define kmp_reduction_barrier_gather_bb ((int)1)
6473 #define kmp_reduction_barrier_release_bb ((int)1)
6474 #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6475 #define kmp_reduction_barrier_release_pat bp_hyper_bar
6476 #endif // KMP_FAST_REDUCTION_BARRIER
6477   for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6478     __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6479     __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6480     __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6481     __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6482 #if KMP_FAST_REDUCTION_BARRIER
6483     if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only
6484       // (lin_64): hyper,1
6485       __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6486       __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6487       __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6488       __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6489     }
6490 #endif // KMP_FAST_REDUCTION_BARRIER
6491   }
6492 #if KMP_FAST_REDUCTION_BARRIER
6493 #undef kmp_reduction_barrier_release_pat
6494 #undef kmp_reduction_barrier_gather_pat
6495 #undef kmp_reduction_barrier_release_bb
6496 #undef kmp_reduction_barrier_gather_bb
6497 #endif // KMP_FAST_REDUCTION_BARRIER
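       // Note (an interpretation, not stated here): the tree-style barrier code
       // treats branch bits as log2 of the fan-out, so the reduction-barrier
       // override above ("hyper,1") selects a hyper barrier with fan-out 2,
       // while the defaults typically give a wider fan-out.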
6498 #if KMP_MIC_SUPPORTED
6499   if (__kmp_mic_type == mic2) { // KNC
6500     // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6501     __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6502     __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6503         1; // forkjoin release
6504     __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6505     __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6506   }
6507 #if KMP_FAST_REDUCTION_BARRIER
6508   if (__kmp_mic_type == mic2) { // KNC
6509     __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6510     __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6511   }
6512 #endif // KMP_FAST_REDUCTION_BARRIER
6513 #endif // KMP_MIC_SUPPORTED
6514 
6515 // From KMP_CHECKS initialization
6516 #ifdef KMP_DEBUG
6517   __kmp_env_checks = TRUE; /* development versions have the extra checks */
6518 #else
6519   __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6520 #endif
6521 
6522   // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6523   __kmp_foreign_tp = TRUE;
6524 
6525   __kmp_global.g.g_dynamic = FALSE;
6526   __kmp_global.g.g_dynamic_mode = dynamic_default;
6527 
6528   __kmp_env_initialize(NULL);
6529 
6530 // Print all messages in message catalog for testing purposes.
6531 #ifdef KMP_DEBUG
6532   char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6533   if (__kmp_str_match_true(val)) {
6534     kmp_str_buf_t buffer;
6535     __kmp_str_buf_init(&buffer);
6536     __kmp_i18n_dump_catalog(&buffer);
6537     __kmp_printf("%s", buffer.str);
6538     __kmp_str_buf_free(&buffer);
6539   }
6540   __kmp_env_free(&val);
6541 #endif
6542 
6543   __kmp_threads_capacity =
6544       __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6545   // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6546   __kmp_tp_capacity = __kmp_default_tp_capacity(
6547       __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6548 
6549   // If the library is shut down properly, both pools must be NULL. Just in
6550   // case, set them to NULL -- some memory may leak, but subsequent code will
6551   // work even if pools are not freed.
6552   KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6553   KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6554   KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6555   __kmp_thread_pool = NULL;
6556   __kmp_thread_pool_insert_pt = NULL;
6557   __kmp_team_pool = NULL;
6558 
6559   /* Allocate all of the variable sized records */
6560   /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6561    * expandable */
6562   /* Since allocation is cache-aligned, just add extra padding at the end */
6563   size =
6564       (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6565       CACHE_LINE;
6566   __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6567   __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6568                                sizeof(kmp_info_t *) * __kmp_threads_capacity);
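       // Layout of the single cache-aligned allocation (a sketch):
       //   [ kmp_info_t *threads[capacity] | kmp_root_t *roots[capacity] | pad ]
       //     ^ __kmp_threads                 ^ __kmp_root
       // Both arrays live in one block, so they can be grown together when
       // __kmp_expand_threads() reallocates them.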
6569 
6570   /* init thread counts */
6571   KMP_DEBUG_ASSERT(__kmp_all_nth ==
6572                    0); // Asserts fail if the library is reinitializing and
6573   KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
6574   __kmp_all_nth = 0;
6575   __kmp_nth = 0;
6576 
6577   /* setup the uber master thread and hierarchy */
6578   gtid = __kmp_register_root(TRUE);
6579   KA_TRACE(10, ("__kmp_do_serial_initialize  T#%d\n", gtid));
6580   KMP_ASSERT(KMP_UBER_GTID(gtid));
6581   KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6582 
6583   KMP_MB(); /* Flush all pending memory write invalidates.  */
6584 
6585   __kmp_common_initialize();
6586 
6587 #if KMP_OS_UNIX
6588   /* invoke the child fork handler */
6589   __kmp_register_atfork();
6590 #endif
6591 
6592 #if !defined KMP_DYNAMIC_LIB
6593   {
6594     /* Invoke the exit handler when the program finishes, only for static
6595        library. For dynamic library, we already have _fini and DllMain. */
6596     int rc = atexit(__kmp_internal_end_atexit);
6597     if (rc != 0) {
6598       __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6599                   __kmp_msg_null);
6600     }
6601   }
6602 #endif
6603 
6604 #if KMP_HANDLE_SIGNALS
6605 #if KMP_OS_UNIX
6606   /* NOTE: make sure that this is called before the user installs their own
6607      signal handlers so that the user handlers are called first. this way they
6608      can return false, not call our handler, avoid terminating the library, and
6609      continue execution where they left off. */
6610   __kmp_install_signals(FALSE);
6611 #endif /* KMP_OS_UNIX */
6612 #if KMP_OS_WINDOWS
6613   __kmp_install_signals(TRUE);
6614 #endif /* KMP_OS_WINDOWS */
6615 #endif
6616 
6617   /* we have finished the serial initialization */
6618   __kmp_init_counter++;
6619 
6620   __kmp_init_serial = TRUE;
6621 
6622   if (__kmp_settings) {
6623     __kmp_env_print();
6624   }
6625 
6626 #if OMP_40_ENABLED
6627   if (__kmp_display_env || __kmp_display_env_verbose) {
6628     __kmp_env_print_2();
6629   }
6630 #endif // OMP_40_ENABLED
6631 
6632 #if OMPT_SUPPORT
6633   ompt_post_init();
6634 #endif
6635 
6636   KMP_MB();
6637 
6638   KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
6639 }
6640 
6641 void __kmp_serial_initialize(void) {
6642   if (__kmp_init_serial) {
6643     return;
6644   }
6645   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6646   if (__kmp_init_serial) {
6647     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6648     return;
6649   }
6650   __kmp_do_serial_initialize();
6651   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6652 }
6653 
6654 static void __kmp_do_middle_initialize(void) {
6655   int i, j;
6656   int prev_dflt_team_nth;
6657 
6658   if (!__kmp_init_serial) {
6659     __kmp_do_serial_initialize();
6660   }
6661 
6662   KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
6663 
6664   // Save the previous value for the __kmp_dflt_team_nth so that
6665   // we can avoid some reinitialization if it hasn't changed.
6666   prev_dflt_team_nth = __kmp_dflt_team_nth;
6667 
6668 #if KMP_AFFINITY_SUPPORTED
6669   // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6670   // number of cores on the machine.
6671   __kmp_affinity_initialize();
6672 
6673   // Run through the __kmp_threads array and set the affinity mask
6674   // for each root thread that is currently registered with the RTL.
6675   for (i = 0; i < __kmp_threads_capacity; i++) {
6676     if (TCR_PTR(__kmp_threads[i]) != NULL) {
6677       __kmp_affinity_set_init_mask(i, TRUE);
6678     }
6679   }
6680 #endif /* KMP_AFFINITY_SUPPORTED */
6681 
6682   KMP_ASSERT(__kmp_xproc > 0);
6683   if (__kmp_avail_proc == 0) {
6684     __kmp_avail_proc = __kmp_xproc;
6685   }
6686 
6687   // If there were empty places in the num_threads list (OMP_NUM_THREADS=,,2,3),
6688   // correct them now.
6689   j = 0;
6690   while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
6691     __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
6692         __kmp_avail_proc;
6693     j++;
6694   }
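       // For example (hypothetical values): with OMP_NUM_THREADS=",,2,3" and
       // __kmp_avail_proc == 8, the leading empty entries are filled in so the
       // nesting list becomes {8, 8, 2, 3}, and __kmp_dflt_team_nth /
       // __kmp_dflt_team_nth_ub are set to 8 as well.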
6695 
6696   if (__kmp_dflt_team_nth == 0) {
6697 #ifdef KMP_DFLT_NTH_CORES
6698     // Default #threads = #cores
6699     __kmp_dflt_team_nth = __kmp_ncores;
6700     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6701                   "__kmp_ncores (%d)\n",
6702                   __kmp_dflt_team_nth));
6703 #else
6704     // Default #threads = #available OS procs
6705     __kmp_dflt_team_nth = __kmp_avail_proc;
6706     KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6707                   "__kmp_avail_proc(%d)\n",
6708                   __kmp_dflt_team_nth));
6709 #endif /* KMP_DFLT_NTH_CORES */
6710   }
6711 
6712   if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
6713     __kmp_dflt_team_nth = KMP_MIN_NTH;
6714   }
6715   if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
6716     __kmp_dflt_team_nth = __kmp_sys_max_nth;
6717   }
6718 
6719   // There's no harm in continuing if the following check fails,
6720   // but it indicates an error in the previous logic.
6721   KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
6722 
6723   if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
6724     // Run through the __kmp_threads array and set the num threads icv for each
6725     // root thread that is currently registered with the RTL (which has not
6726     // already explicitly set its nthreads-var with a call to
6727     // omp_set_num_threads()).
6728     for (i = 0; i < __kmp_threads_capacity; i++) {
6729       kmp_info_t *thread = __kmp_threads[i];
6730       if (thread == NULL)
6731         continue;
6732       if (thread->th.th_current_task->td_icvs.nproc != 0)
6733         continue;
6734 
6735       set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
6736     }
6737   }
6738   KA_TRACE(
6739       20,
6740       ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
6741        __kmp_dflt_team_nth));
6742 
6743 #ifdef KMP_ADJUST_BLOCKTIME
6744   /* Adjust blocktime to zero if necessary  now that __kmp_avail_proc is set */
6745   if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6746     KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6747     if (__kmp_nth > __kmp_avail_proc) {
6748       __kmp_zero_bt = TRUE;
6749     }
6750   }
6751 #endif /* KMP_ADJUST_BLOCKTIME */
6752 
6753   /* we have finished middle initialization */
6754   TCW_SYNC_4(__kmp_init_middle, TRUE);
6755 
6756   KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
6757 }
6758 
6759 void __kmp_middle_initialize(void) {
6760   if (__kmp_init_middle) {
6761     return;
6762   }
6763   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6764   if (__kmp_init_middle) {
6765     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6766     return;
6767   }
6768   __kmp_do_middle_initialize();
6769   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6770 }
6771 
6772 void __kmp_parallel_initialize(void) {
6773   int gtid = __kmp_entry_gtid(); // this might be a new root
6774 
6775   /* synchronize parallel initialization (for sibling) */
6776   if (TCR_4(__kmp_init_parallel))
6777     return;
6778   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6779   if (TCR_4(__kmp_init_parallel)) {
6780     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6781     return;
6782   }
6783 
6784   /* TODO reinitialization after we have already shut down */
6785   if (TCR_4(__kmp_global.g.g_done)) {
6786     KA_TRACE(
6787         10,
6788         ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
6789     __kmp_infinite_loop();
6790   }
6791 
6792   /* jc: The lock __kmp_initz_lock is already held, so calling
6793      __kmp_serial_initialize would cause a deadlock.  So we call
6794      __kmp_do_serial_initialize directly. */
6795   if (!__kmp_init_middle) {
6796     __kmp_do_middle_initialize();
6797   }
6798 
6799   /* begin initialization */
6800   KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
6801   KMP_ASSERT(KMP_UBER_GTID(gtid));
6802 
6803 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6804   // Save the FP control regs.
6805   // Worker threads will set theirs to these values at thread startup.
6806   __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
6807   __kmp_store_mxcsr(&__kmp_init_mxcsr);
6808   __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
6809 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
6810 
6811 #if KMP_OS_UNIX
6812 #if KMP_HANDLE_SIGNALS
6813   /*  must be after __kmp_serial_initialize  */
6814   __kmp_install_signals(TRUE);
6815 #endif
6816 #endif
6817 
6818   __kmp_suspend_initialize();
6819 
6820 #if defined(USE_LOAD_BALANCE)
6821   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6822     __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
6823   }
6824 #else
6825   if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6826     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
6827   }
6828 #endif
6829 
6830   if (__kmp_version) {
6831     __kmp_print_version_2();
6832   }
6833 
6834   /* we have finished parallel initialization */
6835   TCW_SYNC_4(__kmp_init_parallel, TRUE);
6836 
6837   KMP_MB();
6838   KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
6839 
6840   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6841 }
6842 
6843 /* ------------------------------------------------------------------------ */
6844 
6845 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
6846                                    kmp_team_t *team) {
6847   kmp_disp_t *dispatch;
6848 
6849   KMP_MB();
6850 
6851   /* none of the threads have encountered any constructs, yet. */
6852   this_thr->th.th_local.this_construct = 0;
6853 #if KMP_CACHE_MANAGE
6854   KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
6855 #endif /* KMP_CACHE_MANAGE */
6856   dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
6857   KMP_DEBUG_ASSERT(dispatch);
6858   KMP_DEBUG_ASSERT(team->t.t_dispatch);
6859   // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
6860   // this_thr->th.th_info.ds.ds_tid ] );
6861 
6862   dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
6863 #if OMP_45_ENABLED
6864   dispatch->th_doacross_buf_idx =
6865       0; /* reset the doacross dispatch buffer counter */
6866 #endif
6867   if (__kmp_env_consistency_check)
6868     __kmp_push_parallel(gtid, team->t.t_ident);
6869 
6870   KMP_MB(); /* Flush all pending memory write invalidates.  */
6871 }
6872 
6873 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
6874                                   kmp_team_t *team) {
6875   if (__kmp_env_consistency_check)
6876     __kmp_pop_parallel(gtid, team->t.t_ident);
6877 
6878   __kmp_finish_implicit_task(this_thr);
6879 }
6880 
6881 int __kmp_invoke_task_func(int gtid) {
6882   int rc;
6883   int tid = __kmp_tid_from_gtid(gtid);
6884   kmp_info_t *this_thr = __kmp_threads[gtid];
6885   kmp_team_t *team = this_thr->th.th_team;
6886 
6887   __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
6888 #if USE_ITT_BUILD
6889   if (__itt_stack_caller_create_ptr) {
6890     __kmp_itt_stack_callee_enter(
6891         (__itt_caller)
6892             team->t.t_stack_id); // inform ittnotify about entering user's code
6893   }
6894 #endif /* USE_ITT_BUILD */
6895 #if INCLUDE_SSC_MARKS
6896   SSC_MARK_INVOKING();
6897 #endif
6898 
6899 #if OMPT_SUPPORT
6900   void *dummy;
6901   void **exit_runtime_p;
6902   ompt_data_t *my_task_data;
6903   ompt_data_t *my_parallel_data;
6904   int ompt_team_size;
6905 
6906   if (ompt_enabled.enabled) {
6907     exit_runtime_p = &(
6908         team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_frame);
6909   } else {
6910     exit_runtime_p = &dummy;
6911   }
6912 
6913   my_task_data =
6914       &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
6915   my_parallel_data = &(team->t.ompt_team_info.parallel_data);
6916   if (ompt_enabled.ompt_callback_implicit_task) {
6917     ompt_team_size = team->t.t_nproc;
6918     ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
6919         ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
6920         __kmp_tid_from_gtid(gtid));
6921   }
6922 #endif
6923 
6924   {
6925     KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
6926     KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
6927     rc =
6928         __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
6929                                tid, (int)team->t.t_argc, (void **)team->t.t_argv
6930 #if OMPT_SUPPORT
6931                                ,
6932                                exit_runtime_p
6933 #endif
6934                                );
6935 #if OMPT_SUPPORT
6936     *exit_runtime_p = NULL;
6937 #endif
6938   }
6939 
6940 #if USE_ITT_BUILD
6941   if (__itt_stack_caller_create_ptr) {
6942     __kmp_itt_stack_callee_leave(
6943         (__itt_caller)
6944             team->t.t_stack_id); // inform ittnotify about leaving user's code
6945   }
6946 #endif /* USE_ITT_BUILD */
6947   __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
6948 
6949   return rc;
6950 }
6951 
6952 #if OMP_40_ENABLED
6953 void __kmp_teams_master(int gtid) {
6954   // This routine is called by all master threads in teams construct
6955   kmp_info_t *thr = __kmp_threads[gtid];
6956   kmp_team_t *team = thr->th.th_team;
6957   ident_t *loc = team->t.t_ident;
6958   thr->th.th_set_nproc = thr->th.th_teams_size.nth;
6959   KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
6960   KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
6961   KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
6962                 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
6963 // Launch the league of teams now, but do not let workers execute
6964 // (they wait on the fork barrier until the next parallel region)
6965 #if INCLUDE_SSC_MARKS
6966   SSC_MARK_FORKING();
6967 #endif
6968   __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
6969                   (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
6970                   VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
6971 #if INCLUDE_SSC_MARKS
6972   SSC_MARK_JOINING();
6973 #endif
6974 
6975   // AC: the last parameter "1" eliminates the join barrier, which won't work
6976   // because worker threads are in a fork barrier waiting for more parallel regions.
6977   __kmp_join_call(loc, gtid
6978 #if OMPT_SUPPORT
6979                   ,
6980                   fork_context_intel
6981 #endif
6982                   ,
6983                   1);
6984 }
6985 
6986 int __kmp_invoke_teams_master(int gtid) {
6987   kmp_info_t *this_thr = __kmp_threads[gtid];
6988   kmp_team_t *team = this_thr->th.th_team;
6989 #if KMP_DEBUG
6990   if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
6991     KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
6992                      (void *)__kmp_teams_master);
6993 #endif
6994   __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
6995   __kmp_teams_master(gtid);
6996   __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
6997   return 1;
6998 }
6999 #endif /* OMP_40_ENABLED */
7000 
7001 /* This sets the requested number of threads for the next parallel region
7002    encountered by this team. Since this should be enclosed in the forkjoin
7003    critical section, it should avoid race conditions with asymmetrical nested
7004    parallelism. */
7005 
7006 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7007   kmp_info_t *thr = __kmp_threads[gtid];
7008 
7009   if (num_threads > 0)
7010     thr->th.th_set_nproc = num_threads;
7011 }
7012 
7013 #if OMP_40_ENABLED
7014 
/* This sets the requested number of teams for the teams region and/or
   the number of threads for the next parallel region encountered. */
7017 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7018                           int num_threads) {
7019   kmp_info_t *thr = __kmp_threads[gtid];
7020   KMP_DEBUG_ASSERT(num_teams >= 0);
7021   KMP_DEBUG_ASSERT(num_threads >= 0);
7022 
7023   if (num_teams == 0)
7024     num_teams = 1; // default number of teams is 1.
  if (num_teams > __kmp_teams_max_nth) { // were too many teams requested?
7026     if (!__kmp_reserve_warn) {
7027       __kmp_reserve_warn = 1;
7028       __kmp_msg(kmp_ms_warning,
7029                 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7030                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7031     }
7032     num_teams = __kmp_teams_max_nth;
7033   }
7034   // Set number of teams (number of threads in the outer "parallel" of the
7035   // teams)
7036   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7037 
7038   // Remember the number of threads for inner parallel regions
7039   if (num_threads == 0) {
7040     if (!TCR_4(__kmp_init_middle))
7041       __kmp_middle_initialize(); // get __kmp_avail_proc calculated
7042     num_threads = __kmp_avail_proc / num_teams;
7043     if (num_teams * num_threads > __kmp_teams_max_nth) {
      // adjust num_threads w/o warning as it is not a user setting
7045       num_threads = __kmp_teams_max_nth / num_teams;
7046     }
7047   } else {
7048     if (num_teams * num_threads > __kmp_teams_max_nth) {
7049       int new_threads = __kmp_teams_max_nth / num_teams;
7050       if (!__kmp_reserve_warn) { // user asked for too many threads
7051         __kmp_reserve_warn = 1; // that conflicts with KMP_TEAMS_THREAD_LIMIT
7052         __kmp_msg(kmp_ms_warning,
7053                   KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7054                   KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7055       }
7056       num_threads = new_threads;
7057     }
7058   }
7059   thr->th.th_teams_size.nth = num_threads;
7060 }
7061 
7062 // Set the proc_bind var to use in the following parallel region.
7063 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7064   kmp_info_t *thr = __kmp_threads[gtid];
7065   thr->th.th_set_proc_bind = proc_bind;
7066 }
7067 
7068 #endif /* OMP_40_ENABLED */
7069 
7070 /* Launch the worker threads into the microtask. */
7071 
7072 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7073   kmp_info_t *this_thr = __kmp_threads[gtid];
7074 
7075 #ifdef KMP_DEBUG
7076   int f;
7077 #endif /* KMP_DEBUG */
7078 
7079   KMP_DEBUG_ASSERT(team);
7080   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7081   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7082   KMP_MB(); /* Flush all pending memory write invalidates.  */
7083 
7084   team->t.t_construct = 0; /* no single directives seen yet */
7085   team->t.t_ordered.dt.t_value =
7086       0; /* thread 0 enters the ordered section first */
7087 
7088   /* Reset the identifiers on the dispatch buffer */
7089   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7090   if (team->t.t_max_nproc > 1) {
7091     int i;
7092     for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7093       team->t.t_disp_buffer[i].buffer_index = i;
7094 #if OMP_45_ENABLED
7095       team->t.t_disp_buffer[i].doacross_buf_idx = i;
7096 #endif
7097     }
7098   } else {
7099     team->t.t_disp_buffer[0].buffer_index = 0;
7100 #if OMP_45_ENABLED
7101     team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7102 #endif
7103   }
7104 
7105   KMP_MB(); /* Flush all pending memory write invalidates.  */
7106   KMP_ASSERT(this_thr->th.th_team == team);
7107 
7108 #ifdef KMP_DEBUG
7109   for (f = 0; f < team->t.t_nproc; f++) {
7110     KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7111                      team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7112   }
7113 #endif /* KMP_DEBUG */
7114 
7115   /* release the worker threads so they may begin working */
7116   __kmp_fork_barrier(gtid, 0);
7117 }
7118 
7119 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7120   kmp_info_t *this_thr = __kmp_threads[gtid];
7121 
7122   KMP_DEBUG_ASSERT(team);
7123   KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7124   KMP_ASSERT(KMP_MASTER_GTID(gtid));
7125   KMP_MB(); /* Flush all pending memory write invalidates.  */
7126 
7127 /* Join barrier after fork */
7128 
7129 #ifdef KMP_DEBUG
7130   if (__kmp_threads[gtid] &&
7131       __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7132     __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7133                  __kmp_threads[gtid]);
7134     __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7135                  "team->t.t_nproc=%d\n",
7136                  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7137                  team->t.t_nproc);
7138     __kmp_print_structure();
7139   }
7140   KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7141                    __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7142 #endif /* KMP_DEBUG */
7143 
7144   __kmp_join_barrier(gtid); /* wait for everyone */
7145 #if OMPT_SUPPORT
7146   if (ompt_enabled.enabled &&
7147       this_thr->th.ompt_thread_info.state == omp_state_wait_barrier_implicit) {
7148     int ds_tid = this_thr->th.th_info.ds.ds_tid;
7149     ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7150     this_thr->th.ompt_thread_info.state = omp_state_overhead;
7151 #if OMPT_OPTIONAL
7152     void *codeptr = NULL;
7153     if (KMP_MASTER_TID(ds_tid) &&
7154         (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7155          ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7156       codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7157 
7158     if (ompt_enabled.ompt_callback_sync_region_wait) {
7159       ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7160           ompt_sync_region_barrier, ompt_scope_end, NULL, task_data, codeptr);
7161     }
7162     if (ompt_enabled.ompt_callback_sync_region) {
7163       ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7164           ompt_sync_region_barrier, ompt_scope_end, NULL, task_data, codeptr);
7165     }
7166 #endif
7167     if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7168       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7169           ompt_scope_end, NULL, task_data, 0, ds_tid);
7170     }
7171   }
7172 #endif
7173 
7174   KMP_MB(); /* Flush all pending memory write invalidates.  */
7175   KMP_ASSERT(this_thr->th.th_team == team);
7176 }
7177 
7178 /* ------------------------------------------------------------------------ */
7179 
7180 #ifdef USE_LOAD_BALANCE
7181 
// Return the number of worker threads actively spinning in the hot team, if
// we are at the outermost level of parallelism. Otherwise, return 0.
7184 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7185   int i;
7186   int retval;
7187   kmp_team_t *hot_team;
7188 
7189   if (root->r.r_active) {
7190     return 0;
7191   }
7192   hot_team = root->r.r_hot_team;
7193   if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7194     return hot_team->t.t_nproc - 1; // Don't count master thread
7195   }
7196 
7197   // Skip the master thread - it is accounted for elsewhere.
7198   retval = 0;
7199   for (i = 1; i < hot_team->t.t_nproc; i++) {
7200     if (hot_team->t.t_threads[i]->th.th_active) {
7201       retval++;
7202     }
7203   }
7204   return retval;
7205 }
7206 
7207 // Perform an automatic adjustment to the number of
7208 // threads used by the next parallel region.
7209 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7210   int retval;
7211   int pool_active;
7212   int hot_team_active;
7213   int team_curr_active;
7214   int system_active;
7215 
7216   KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7217                 set_nproc));
7218   KMP_DEBUG_ASSERT(root);
7219   KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7220                        ->th.th_current_task->td_icvs.dynamic == TRUE);
7221   KMP_DEBUG_ASSERT(set_nproc > 1);
7222 
7223   if (set_nproc == 1) {
7224     KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7225     return 1;
7226   }
7227 
  // Threads that are active in the thread pool, active in the hot team for
  // this particular root (if we are at the outermost parallel level), and the
  // currently executing thread (which will become the master) are available to
  // add to the new team, but they are currently contributing to the system
  // load and must be accounted for.
7233   pool_active = TCR_4(__kmp_thread_pool_active_nth);
7234   hot_team_active = __kmp_active_hot_team_nproc(root);
7235   team_curr_active = pool_active + hot_team_active + 1;
7236 
7237   // Check the system load.
7238   system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7239   KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7240                 "hot team active = %d\n",
7241                 system_active, pool_active, hot_team_active));
7242 
7243   if (system_active < 0) {
7244     // There was an error reading the necessary info from /proc, so use the
7245     // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7246     // = dynamic_thread_limit, we shouldn't wind up getting back here.
7247     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7248     KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7249 
7250     // Make this call behave like the thread limit algorithm.
7251     retval = __kmp_avail_proc - __kmp_nth +
7252              (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7253     if (retval > set_nproc) {
7254       retval = set_nproc;
7255     }
7256     if (retval < KMP_MIN_NTH) {
7257       retval = KMP_MIN_NTH;
7258     }
7259 
7260     KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7261                   retval));
7262     return retval;
7263   }
7264 
  // There is a slight delay in the load balance algorithm in detecting new
  // running procs. The real system load at this instant should be at least as
  // large as the number of active OMP threads that are available to add to the
  // team.
7268   if (system_active < team_curr_active) {
7269     system_active = team_curr_active;
7270   }
7271   retval = __kmp_avail_proc - system_active + team_curr_active;
7272   if (retval > set_nproc) {
7273     retval = set_nproc;
7274   }
7275   if (retval < KMP_MIN_NTH) {
7276     retval = KMP_MIN_NTH;
7277   }
7278 
7279   KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7280   return retval;
7281 } // __kmp_load_balance_nproc()
7282 
7283 #endif /* USE_LOAD_BALANCE */
7284 
7285 /* ------------------------------------------------------------------------ */
7286 
7287 /* NOTE: this is called with the __kmp_init_lock held */
7288 void __kmp_cleanup(void) {
7289   int f;
7290 
7291   KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7292 
7293   if (TCR_4(__kmp_init_parallel)) {
7294 #if KMP_HANDLE_SIGNALS
7295     __kmp_remove_signals();
7296 #endif
7297     TCW_4(__kmp_init_parallel, FALSE);
7298   }
7299 
7300   if (TCR_4(__kmp_init_middle)) {
7301 #if KMP_AFFINITY_SUPPORTED
7302     __kmp_affinity_uninitialize();
7303 #endif /* KMP_AFFINITY_SUPPORTED */
7304     __kmp_cleanup_hierarchy();
7305     TCW_4(__kmp_init_middle, FALSE);
7306   }
7307 
7308   KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7309 
7310   if (__kmp_init_serial) {
7311     __kmp_runtime_destroy();
7312     __kmp_init_serial = FALSE;
7313   }
7314 
7315   __kmp_cleanup_threadprivate_caches();
7316 
7317   for (f = 0; f < __kmp_threads_capacity; f++) {
7318     if (__kmp_root[f] != NULL) {
7319       __kmp_free(__kmp_root[f]);
7320       __kmp_root[f] = NULL;
7321     }
7322   }
7323   __kmp_free(__kmp_threads);
  // __kmp_threads and __kmp_root were allocated at once, as a single block, so
  // there is no need to free __kmp_root separately.
7326   __kmp_threads = NULL;
7327   __kmp_root = NULL;
7328   __kmp_threads_capacity = 0;
7329 
7330 #if KMP_USE_DYNAMIC_LOCK
7331   __kmp_cleanup_indirect_user_locks();
7332 #else
7333   __kmp_cleanup_user_locks();
7334 #endif
7335 
7336 #if KMP_AFFINITY_SUPPORTED
7337   KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7338   __kmp_cpuinfo_file = NULL;
7339 #endif /* KMP_AFFINITY_SUPPORTED */
7340 
7341 #if KMP_USE_ADAPTIVE_LOCKS
7342 #if KMP_DEBUG_ADAPTIVE_LOCKS
7343   __kmp_print_speculative_stats();
7344 #endif
7345 #endif
7346   KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7347   __kmp_nested_nth.nth = NULL;
7348   __kmp_nested_nth.size = 0;
7349   __kmp_nested_nth.used = 0;
7350   KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7351   __kmp_nested_proc_bind.bind_types = NULL;
7352   __kmp_nested_proc_bind.size = 0;
7353   __kmp_nested_proc_bind.used = 0;
7354 
7355   __kmp_i18n_catclose();
7356 
7357 #if KMP_STATS_ENABLED
7358   __kmp_stats_fini();
7359 #endif
7360 
7361   KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7362 }
7363 
7364 /* ------------------------------------------------------------------------ */
7365 
7366 int __kmp_ignore_mppbeg(void) {
7367   char *env;
7368 
7369   if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7370     if (__kmp_str_match_false(env))
7371       return FALSE;
7372   }
  // By default, __kmpc_begin() is a no-op.
7374   return TRUE;
7375 }
7376 
7377 int __kmp_ignore_mppend(void) {
7378   char *env;
7379 
7380   if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7381     if (__kmp_str_match_false(env))
7382       return FALSE;
7383   }
  // By default, __kmpc_end() is a no-op.
7385   return TRUE;
7386 }
7387 
7388 void __kmp_internal_begin(void) {
7389   int gtid;
7390   kmp_root_t *root;
7391 
7392   /* this is a very important step as it will register new sibling threads
7393      and assign these new uber threads a new gtid */
7394   gtid = __kmp_entry_gtid();
7395   root = __kmp_threads[gtid]->th.th_root;
7396   KMP_ASSERT(KMP_UBER_GTID(gtid));
7397 
7398   if (root->r.r_begin)
7399     return;
7400   __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7401   if (root->r.r_begin) {
7402     __kmp_release_lock(&root->r.r_begin_lock, gtid);
7403     return;
7404   }
7405 
7406   root->r.r_begin = TRUE;
7407 
7408   __kmp_release_lock(&root->r.r_begin_lock, gtid);
7409 }
7410 
7411 /* ------------------------------------------------------------------------ */
7412 
7413 void __kmp_user_set_library(enum library_type arg) {
7414   int gtid;
7415   kmp_root_t *root;
7416   kmp_info_t *thread;
7417 
7418   /* first, make sure we are initialized so we can get our gtid */
7419 
7420   gtid = __kmp_entry_gtid();
7421   thread = __kmp_threads[gtid];
7422 
7423   root = thread->th.th_root;
7424 
7425   KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7426                 library_serial));
7427   if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7428                                   thread */
7429     KMP_WARNING(SetLibraryIncorrectCall);
7430     return;
7431   }
7432 
7433   switch (arg) {
7434   case library_serial:
7435     thread->th.th_set_nproc = 0;
7436     set__nproc(thread, 1);
7437     break;
7438   case library_turnaround:
7439     thread->th.th_set_nproc = 0;
7440     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7441                                            : __kmp_dflt_team_nth_ub);
7442     break;
7443   case library_throughput:
7444     thread->th.th_set_nproc = 0;
7445     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7446                                            : __kmp_dflt_team_nth_ub);
7447     break;
7448   default:
7449     KMP_FATAL(UnknownLibraryType, arg);
7450   }
7451 
7452   __kmp_aux_set_library(arg);
7453 }
7454 
7455 void __kmp_aux_set_stacksize(size_t arg) {
7456   if (!__kmp_init_serial)
7457     __kmp_serial_initialize();
7458 
7459 #if KMP_OS_DARWIN
7460   if (arg & (0x1000 - 1)) {
7461     arg &= ~(0x1000 - 1);
7462     if (arg + 0x1000) /* check for overflow if we round up */
7463       arg += 0x1000;
7464   }
7465 #endif
7466   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7467 
7468   /* only change the default stacksize before the first parallel region */
7469   if (!TCR_4(__kmp_init_parallel)) {
7470     size_t value = arg; /* argument is in bytes */
7471 
7472     if (value < __kmp_sys_min_stksize)
7473       value = __kmp_sys_min_stksize;
7474     else if (value > KMP_MAX_STKSIZE)
7475       value = KMP_MAX_STKSIZE;
7476 
7477     __kmp_stksize = value;
7478 
7479     __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
7480   }
7481 
7482   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7483 }
7484 
7485 /* set the behaviour of the runtime library */
7486 /* TODO this can cause some odd behaviour with sibling parallelism... */
7487 void __kmp_aux_set_library(enum library_type arg) {
7488   __kmp_library = arg;
7489 
7490   switch (__kmp_library) {
7491   case library_serial: {
7492     KMP_INFORM(LibraryIsSerial);
7493     (void)__kmp_change_library(TRUE);
7494   } break;
7495   case library_turnaround:
7496     (void)__kmp_change_library(TRUE);
7497     break;
7498   case library_throughput:
7499     (void)__kmp_change_library(FALSE);
7500     break;
7501   default:
7502     KMP_FATAL(UnknownLibraryType, arg);
7503   }
7504 }
7505 
7506 /* ------------------------------------------------------------------------ */
7507 
7508 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
7509   int blocktime = arg; /* argument is in milliseconds */
7510 #if KMP_USE_MONITOR
7511   int bt_intervals;
7512 #endif
7513   int bt_set;
7514 
7515   __kmp_save_internal_controls(thread);
7516 
7517   /* Normalize and set blocktime for the teams */
7518   if (blocktime < KMP_MIN_BLOCKTIME)
7519     blocktime = KMP_MIN_BLOCKTIME;
7520   else if (blocktime > KMP_MAX_BLOCKTIME)
7521     blocktime = KMP_MAX_BLOCKTIME;
7522 
7523   set__blocktime_team(thread->th.th_team, tid, blocktime);
7524   set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
7525 
7526 #if KMP_USE_MONITOR
7527   /* Calculate and set blocktime intervals for the teams */
7528   bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
7529 
7530   set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
7531   set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
7532 #endif
7533 
7534   /* Set whether blocktime has been set to "TRUE" */
7535   bt_set = TRUE;
7536 
7537   set__bt_set_team(thread->th.th_team, tid, bt_set);
7538   set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
7539 #if KMP_USE_MONITOR
7540   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
7541                 "bt_intervals=%d, monitor_updates=%d\n",
7542                 __kmp_gtid_from_tid(tid, thread->th.th_team),
7543                 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
7544                 __kmp_monitor_wakeups));
7545 #else
7546   KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
7547                 __kmp_gtid_from_tid(tid, thread->th.th_team),
7548                 thread->th.th_team->t.t_id, tid, blocktime));
7549 #endif
7550 }
7551 
7552 void __kmp_aux_set_defaults(char const *str, int len) {
7553   if (!__kmp_init_serial) {
7554     __kmp_serial_initialize();
7555   }
7556   __kmp_env_initialize(str);
7557 
7558   if (__kmp_settings
7559 #if OMP_40_ENABLED
7560       || __kmp_display_env || __kmp_display_env_verbose
7561 #endif // OMP_40_ENABLED
7562       ) {
7563     __kmp_env_print();
7564   }
7565 } // __kmp_aux_set_defaults
7566 
7567 /* ------------------------------------------------------------------------ */
7568 /* internal fast reduction routines */
7569 
7570 PACKED_REDUCTION_METHOD_T
7571 __kmp_determine_reduction_method(
7572     ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
7573     void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
7574     kmp_critical_name *lck) {
7575 
  // Default reduction method: critical construct (lck != NULL, like in current
  // PAROPT).
  // If (reduce_data != NULL && reduce_func != NULL): the tree-reduction method
  // can be selected by the RTL.
  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
  // can be selected by the RTL.
  // Finally, it is up to the OpenMP RTL to decide which method to select among
  // those generated by PAROPT.
7584 
7585   PACKED_REDUCTION_METHOD_T retval;
7586 
7587   int team_size;
7588 
7589   KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
7590   KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
7591 
7592 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
7593   ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
7594 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
7595 
7596   retval = critical_reduce_block;
7597 
  // Another way of getting the team size (with one dynamic dereference) is
  // slower.
7599   team_size = __kmp_get_team_num_threads(global_tid);
7600   if (team_size == 1) {
7601 
7602     retval = empty_reduce_block;
7603 
7604   } else {
7605 
7606     int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
7607     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
7608 
7609 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64
7610 
7611 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_WINDOWS ||       \
7612     KMP_OS_DARWIN
7613 
7614     int teamsize_cutoff = 4;
7615 
7616 #if KMP_MIC_SUPPORTED
7617     if (__kmp_mic_type != non_mic) {
7618       teamsize_cutoff = 8;
7619     }
7620 #endif
7621     if (tree_available) {
7622       if (team_size <= teamsize_cutoff) {
7623         if (atomic_available) {
7624           retval = atomic_reduce_block;
7625         }
7626       } else {
7627         retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
7628       }
7629     } else if (atomic_available) {
7630       retval = atomic_reduce_block;
7631     }
7632 #else
7633 #error "Unknown or unsupported OS"
7634 #endif // KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_WINDOWS ||
7635 // KMP_OS_DARWIN
7636 
7637 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
7638 
7639 #if KMP_OS_LINUX || KMP_OS_WINDOWS
7640 
7641     // basic tuning
7642 
7643     if (atomic_available) {
7644       if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
7645         retval = atomic_reduce_block;
7646       }
7647     } // otherwise: use critical section
7648 
7649 #elif KMP_OS_DARWIN
7650 
7651     if (atomic_available && (num_vars <= 3)) {
7652       retval = atomic_reduce_block;
7653     } else if (tree_available) {
7654       if ((reduce_size > (9 * sizeof(kmp_real64))) &&
7655           (reduce_size < (2000 * sizeof(kmp_real64)))) {
7656         retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
7657       }
7658     } // otherwise: use critical section
7659 
7660 #else
7661 #error "Unknown or unsupported OS"
7662 #endif
7663 
7664 #else
7665 #error "Unknown or unsupported architecture"
7666 #endif
7667   }
7668 
7669   // KMP_FORCE_REDUCTION
7670 
7671   // If the team is serialized (team_size == 1), ignore the forced reduction
7672   // method and stay with the unsynchronized method (empty_reduce_block)
7673   if (__kmp_force_reduction_method != reduction_method_not_defined &&
7674       team_size != 1) {
7675 
7676     PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
7677 
7678     int atomic_available, tree_available;
7679 
7680     switch ((forced_retval = __kmp_force_reduction_method)) {
7681     case critical_reduce_block:
7682       KMP_ASSERT(lck); // lck should be != 0
7683       break;
7684 
7685     case atomic_reduce_block:
7686       atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
7687       if (!atomic_available) {
7688         KMP_WARNING(RedMethodNotSupported, "atomic");
7689         forced_retval = critical_reduce_block;
7690       }
7691       break;
7692 
7693     case tree_reduce_block:
7694       tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
7695       if (!tree_available) {
7696         KMP_WARNING(RedMethodNotSupported, "tree");
7697         forced_retval = critical_reduce_block;
7698       } else {
7699 #if KMP_FAST_REDUCTION_BARRIER
7700         forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
7701 #endif
7702       }
7703       break;
7704 
7705     default:
7706       KMP_ASSERT(0); // "unsupported method specified"
7707     }
7708 
7709     retval = forced_retval;
7710   }
7711 
7712   KA_TRACE(10, ("reduction method selected=%08x\n", retval));
7713 
7714 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
7715 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
7716 
7717   return (retval);
7718 }
7719 
// This function is for testing the set/get/determine reduce-method machinery.
7721 kmp_int32 __kmp_get_reduce_method(void) {
7722   return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
7723 }
7724