1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 //                     The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "kmp.h"
17 #include "kmp_affinity.h"
18 #include "kmp_atomic.h"
19 #include "kmp_environment.h"
20 #include "kmp_error.h"
21 #include "kmp_i18n.h"
22 #include "kmp_io.h"
23 #include "kmp_itt.h"
24 #include "kmp_settings.h"
25 #include "kmp_stats.h"
26 #include "kmp_str.h"
27 #include "kmp_wait_release.h"
28 #include "kmp_wrapper_getpid.h"
29 
30 #if OMPT_SUPPORT
31 #include "ompt-specific.h"
32 #endif
33 
34 /* these are temporary issues to be dealt with */
35 #define KMP_USE_PRCTL 0
36 
37 #if KMP_OS_WINDOWS
38 #include <process.h>
39 #endif
40 
41 #include "tsan_annotations.h"
42 
43 #if defined(KMP_GOMP_COMPAT)
44 char const __kmp_version_alt_comp[] =
45     KMP_VERSION_PREFIX "alternative compiler support: yes";
46 #endif /* defined(KMP_GOMP_COMPAT) */
47 
48 char const __kmp_version_omp_api[] = KMP_VERSION_PREFIX "API version: "
49 #if OMP_50_ENABLED
50                                                         "5.0 (201611)";
51 #elif OMP_45_ENABLED
52                                                         "4.5 (201511)";
53 #elif OMP_40_ENABLED
54                                                         "4.0 (201307)";
55 #else
56                                                         "3.1 (201107)";
57 #endif
58 
59 #ifdef KMP_DEBUG
60 char const __kmp_version_lock[] =
61     KMP_VERSION_PREFIX "lock type: run time selectable";
62 #endif /* KMP_DEBUG */
63 
64 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
65 
66 /* ------------------------------------------------------------------------ */
67 
68 kmp_info_t __kmp_monitor;
69 
70 /* Forward declarations */
71 
72 void __kmp_cleanup(void);
73 
74 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
75                                   int gtid);
76 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
77                                   kmp_internal_control_t *new_icvs,
78                                   ident_t *loc);
79 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
80 static void __kmp_partition_places(kmp_team_t *team,
81                                    int update_master_only = 0);
82 #endif
83 static void __kmp_do_serial_initialize(void);
84 void __kmp_fork_barrier(int gtid, int tid);
85 void __kmp_join_barrier(int gtid);
86 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
87                           kmp_internal_control_t *new_icvs, ident_t *loc);
88 
89 #ifdef USE_LOAD_BALANCE
90 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
91 #endif
92 
93 static int __kmp_expand_threads(int nWish, int nNeed);
94 #if KMP_OS_WINDOWS
95 static int __kmp_unregister_root_other_thread(int gtid);
96 #endif
97 static void __kmp_unregister_library(void); // called by __kmp_internal_end()
98 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
99 static kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
100 
101 /* Calculate the identifier of the current thread */
102 /* fast (and somewhat portable) way to get unique identifier of executing
103    thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
104 
/* Compute the gtid of the calling thread without requiring registration.
   Tries, in order: the TDATA (__thread) cache, OS-keyed TLS, and finally a
   scan of registered threads' stack ranges.  Returns KMP_GTID_DNE if the
   gtid subsystem is not initialized yet. */
int __kmp_get_global_thread_id() {
  int i;
  kmp_info_t **other_threads;
  size_t stack_data; // dummy local: its address identifies our current stack
  char *stack_addr;
  size_t stack_size;
  char *stack_base;

  KA_TRACE(
      1000,
      ("*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
       __kmp_nth, __kmp_all_nth));

  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
     a parallel region, made it return KMP_GTID_DNE to force serial_initialize
     by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
     __kmp_init_gtid for this to work. */

  if (!TCR_4(__kmp_init_gtid))
    return KMP_GTID_DNE;

#ifdef KMP_TDATA_GTID
  // Mode 3: gtid cached in a dedicated thread-local variable -- fastest path.
  if (TCR_4(__kmp_gtid_mode) >= 3) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
    return __kmp_gtid;
  }
#endif
  // Mode 2: gtid stored via OS-keyed thread-specific data.
  if (TCR_4(__kmp_gtid_mode) >= 2) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
    return __kmp_gtid_get_specific();
  }
  // Mode <= 1: locate our stack address within some registered thread's
  // recorded stack window.
  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));

  stack_addr = (char *)&stack_data;
  other_threads = __kmp_threads;

  /* ATT: The code below is a source of potential bugs due to unsynchronized
     access to __kmp_threads array. For example:
     1. Current thread loads other_threads[i] to thr and checks it, it is
        non-NULL.
     2. Current thread is suspended by OS.
     3. Another thread unregisters and finishes (debug versions of free()
        may fill memory with something like 0xEF).
     4. Current thread is resumed.
     5. Current thread reads junk from *thr.
     TODO: Fix it.  --ln  */

  for (i = 0; i < __kmp_threads_capacity; i++) {

    kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
    if (!thr)
      continue;

    stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
    stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);

    /* stack grows down -- search through all of the active threads */

    if (stack_addr <= stack_base) {
      size_t stack_diff = stack_base - stack_addr;

      if (stack_diff <= stack_size) {
        /* The only way we can be closer than the allocated */
        /* stack size is if we are running on this thread. */
        KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
        return i;
      }
    }
  }

  /* get specific to try and determine our gtid */
  KA_TRACE(1000,
           ("*** __kmp_get_global_thread_id: internal alg. failed to find "
            "thread, using TLS\n"));
  i = __kmp_gtid_get_specific();

  /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */

  /* if we haven't been assigned a gtid, then return code */
  if (i < 0)
    return i;

  /* dynamically updated stack window for uber threads to avoid get_specific
     call */
  if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
    // Window may no longer grow: being outside it is a genuine overflow.
    KMP_FATAL(StackOverflow, i);
  }

  stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
  if (stack_addr > stack_base) {
    // Current address is above the recorded base: raise the base and widen
    // the recorded size by the same amount.
    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
            other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
                stack_base);
  } else {
    // Current address is below the window: extend the size downward.
    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
            stack_base - stack_addr);
  }

  /* Reprint stack bounds for ubermaster since they have been refined */
  if (__kmp_storage_map) {
    char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
    char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
    __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
                                 other_threads[i]->th.th_info.ds.ds_stacksize,
                                 "th_%d stack (refinement)", i);
  }
  return i;
}
214 
/* Like __kmp_get_global_thread_id(), but registers the calling thread as a
   new root (or performs serial initialization) when no gtid has been
   assigned yet.  Always returns a valid gtid (asserted >= 0). */
int __kmp_get_global_thread_id_reg() {
  int gtid;

  if (!__kmp_init_serial) {
    gtid = KMP_GTID_DNE;
  } else
#ifdef KMP_TDATA_GTID
      if (TCR_4(__kmp_gtid_mode) >= 3) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
    gtid = __kmp_gtid;
  } else
#endif
      if (TCR_4(__kmp_gtid_mode) >= 2) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
    gtid = __kmp_gtid_get_specific();
  } else {
    KA_TRACE(1000,
             ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
    gtid = __kmp_get_global_thread_id();
  }

  /* we must be a new uber master sibling thread */
  if (gtid == KMP_GTID_DNE) {
    KA_TRACE(10,
             ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
              "Registering a new gtid.\n"));
    // Serialize registration of new roots; re-check __kmp_init_serial under
    // the lock since another thread may have initialized in the meantime.
    __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
    if (!__kmp_init_serial) {
      __kmp_do_serial_initialize();
      gtid = __kmp_gtid_get_specific();
    } else {
      gtid = __kmp_register_root(FALSE);
    }
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
    /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
  }

  KMP_DEBUG_ASSERT(gtid >= 0);

  return gtid;
}
256 
257 /* caller must hold forkjoin_lock */
/* caller must hold forkjoin_lock */
/* Verify that th's stack does not overlap any other registered thread's
   stack; emits a fatal StackOverlap message if it does.  Also prints th's
   stack bounds when storage-map output is enabled. */
void __kmp_check_stack_overlap(kmp_info_t *th) {
  int f;
  char *stack_beg = NULL;
  char *stack_end = NULL;
  int gtid;

  KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
  if (__kmp_storage_map) {
    // ds_stackbase is the high end; the stack grows downward from it.
    stack_end = (char *)th->th.th_info.ds.ds_stackbase;
    stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;

    gtid = __kmp_gtid_from_thread(th);

    if (gtid == KMP_GTID_MONITOR) {
      __kmp_print_storage_map_gtid(
          gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
          "th_%s stack (%s)", "mon",
          (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
    } else {
      __kmp_print_storage_map_gtid(
          gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
          "th_%d stack (%s)", gtid,
          (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
    }
  }

  /* No point in checking ubermaster threads since they use refinement and
   * cannot overlap */
  gtid = __kmp_gtid_from_thread(th);
  if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
    KA_TRACE(10,
             ("__kmp_check_stack_overlap: performing extensive checking\n"));
    if (stack_beg == NULL) {
      // Bounds were not computed above (storage map disabled); compute now.
      stack_end = (char *)th->th.th_info.ds.ds_stackbase;
      stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
    }

    for (f = 0; f < __kmp_threads_capacity; f++) {
      kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);

      if (f_th && f_th != th) {
        char *other_stack_end =
            (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
        char *other_stack_beg =
            other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
        // Overlap iff either endpoint of our stack lies strictly inside the
        // other thread's [beg, end) range.
        if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
            (stack_end > other_stack_beg && stack_end < other_stack_end)) {

          /* Print the other stack values before the abort */
          if (__kmp_storage_map)
            __kmp_print_storage_map_gtid(
                -1, other_stack_beg, other_stack_end,
                (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
                "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));

          __kmp_msg(kmp_ms_fatal, KMP_MSG(StackOverlap),
                    KMP_HNT(ChangeStackLimit), __kmp_msg_null);
        }
      }
    }
  }
  KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
}
321 
322 /* ------------------------------------------------------------------------ */
323 
324 void __kmp_infinite_loop(void) {
325   static int done = FALSE;
326 
327   while (!done) {
328     KMP_YIELD(1);
329   }
330 }
331 
332 #define MAX_MESSAGE 512
333 
334 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
335                                   char const *format, ...) {
336   char buffer[MAX_MESSAGE];
337   va_list ap;
338 
339   va_start(ap, format);
340   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
341                p2, (unsigned long)size, format);
342   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
343   __kmp_vprintf(kmp_err, buffer, ap);
344 #if KMP_PRINT_DATA_PLACEMENT
345   int node;
346   if (gtid >= 0) {
347     if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
348       if (__kmp_storage_map_verbose) {
349         node = __kmp_get_host_node(p1);
350         if (node < 0) /* doesn't work, so don't try this next time */
351           __kmp_storage_map_verbose = FALSE;
352         else {
353           char *last;
354           int lastNode;
355           int localProc = __kmp_get_cpu_from_gtid(gtid);
356 
357           const int page_size = KMP_GET_PAGE_SIZE();
358 
359           p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
360           p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
361           if (localProc >= 0)
362             __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
363                                  localProc >> 1);
364           else
365             __kmp_printf_no_lock("  GTID %d\n", gtid);
366 #if KMP_USE_PRCTL
367           /* The more elaborate format is disabled for now because of the prctl
368            * hanging bug. */
369           do {
370             last = p1;
371             lastNode = node;
372             /* This loop collates adjacent pages with the same host node. */
373             do {
374               (char *)p1 += page_size;
375             } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
376             __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
377                                  lastNode);
378           } while (p1 <= p2);
379 #else
380           __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
381                                (char *)p1 + (page_size - 1),
382                                __kmp_get_host_node(p1));
383           if (p1 < p2) {
384             __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
385                                  (char *)p2 + (page_size - 1),
386                                  __kmp_get_host_node(p2));
387           }
388 #endif
389         }
390       }
391     } else
392       __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
393   }
394 #endif /* KMP_PRINT_DATA_PLACEMENT */
395   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
396 }
397 
398 void __kmp_warn(char const *format, ...) {
399   char buffer[MAX_MESSAGE];
400   va_list ap;
401 
402   if (__kmp_generate_warnings == kmp_warnings_off) {
403     return;
404   }
405 
406   va_start(ap, format);
407 
408   KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
409   __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
410   __kmp_vprintf(kmp_err, buffer, ap);
411   __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
412 
413   va_end(ap);
414 }
415 
416 void __kmp_abort_process() {
417   // Later threads may stall here, but that's ok because abort() will kill them.
418   __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
419 
420   if (__kmp_debug_buf) {
421     __kmp_dump_debug_buffer();
422   }; // if
423 
424   if (KMP_OS_WINDOWS) {
425     // Let other threads know of abnormal termination and prevent deadlock
426     // if abort happened during library initialization or shutdown
427     __kmp_global.g.g_abort = SIGABRT;
428 
429     /* On Windows* OS by default abort() causes pop-up error box, which stalls
430        nightly testing. Unfortunately, we cannot reliably suppress pop-up error
431        boxes. _set_abort_behavior() works well, but this function is not
432        available in VS7 (this is not problem for DLL, but it is a problem for
433        static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not
434        help, at least in some versions of MS C RTL.
435 
436        It seems following sequence is the only way to simulate abort() and
437        avoid pop-up error box. */
438     raise(SIGABRT);
439     _exit(3); // Just in case, if signal ignored, exit anyway.
440   } else {
441     abort();
442   }; // if
443 
444   __kmp_infinite_loop();
445   __kmp_release_bootstrap_lock(&__kmp_exit_lock);
446 
447 } // __kmp_abort_process
448 
/* Park the calling thread during an abort; the process-level abort path is
   responsible for actually taking the process down. */
void __kmp_abort_thread(void) {
  // TODO: Eliminate g_abort global variable and this function.
  // In case of abort just call abort(), it will kill all the threads.
  __kmp_infinite_loop();
} // __kmp_abort_thread
454 
455 /* Print out the storage map for the major kmp_info_t thread data structures
456    that are allocated together. */
457 
/* Emit storage-map lines for the kmp_info_t of thread `gtid`: the whole
   structure, then its th_info / th_local sub-ranges and each barrier slot. */
static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
  // Whole thread descriptor.
  __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
                               gtid);

  // [th_info, th_team) sub-range.
  __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
                               sizeof(kmp_desc_t), "th_%d.th_info", gtid);

  // [th_local, th_pri_head) sub-range.
  __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
                               sizeof(kmp_local_t), "th_%d.th_local", gtid);

  // All barrier slots as one range, then each slot individually.
  __kmp_print_storage_map_gtid(
      gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
      sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
                               &thr->th.th_bar[bs_plain_barrier + 1],
                               sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
                               gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
                               &thr->th.th_bar[bs_forkjoin_barrier + 1],
                               sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
                               gtid);

#if KMP_FAST_REDUCTION_BARRIER
  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
                               &thr->th.th_bar[bs_reduction_barrier + 1],
                               sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
                               gtid);
#endif // KMP_FAST_REDUCTION_BARRIER
}
489 
490 /* Print out the storage map for the major kmp_team_t team data structures
491    that are allocated together. */
492 
/* Emit storage-map lines for a kmp_team_t: the whole structure, the team
   barrier slots, and the per-thread dispatch/thread-pointer/buffer arrays.
   `header` prefixes every line (e.g. "team"); `num_thr` is the thread count
   used to size the per-thread arrays. */
static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
                                         int team_id, int num_thr) {
  // Serialized teams (t_max_nproc <= 1) only ever use 2 dispatch buffers.
  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
  __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
                               header, team_id);

  // All team-barrier slots as one range, then each slot individually.
  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
                               &team->t.t_bar[bs_last_barrier],
                               sizeof(kmp_balign_team_t) * bs_last_barrier,
                               "%s_%d.t_bar", header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
                               &team->t.t_bar[bs_plain_barrier + 1],
                               sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
                               header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
                               &team->t.t_bar[bs_forkjoin_barrier + 1],
                               sizeof(kmp_balign_team_t),
                               "%s_%d.t_bar[forkjoin]", header, team_id);

#if KMP_FAST_REDUCTION_BARRIER
  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
                               &team->t.t_bar[bs_reduction_barrier + 1],
                               sizeof(kmp_balign_team_t),
                               "%s_%d.t_bar[reduction]", header, team_id);
#endif // KMP_FAST_REDUCTION_BARRIER

  // Per-thread dispatch records and thread-pointer array.
  __kmp_print_storage_map_gtid(
      -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
      sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);

  __kmp_print_storage_map_gtid(
      -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
      sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
                               &team->t.t_disp_buffer[num_disp_buff],
                               sizeof(dispatch_shared_info_t) * num_disp_buff,
                               "%s_%d.t_disp_buffer", header, team_id);

  // [t_taskq, t_copypriv_data) sub-range.
  __kmp_print_storage_map_gtid(-1, &team->t.t_taskq, &team->t.t_copypriv_data,
                               sizeof(kmp_taskq_t), "%s_%d.t_taskq", header,
                               team_id);
}
538 
// Allocator subsystem init/fini hooks -- no-ops in this build configuration.
static void __kmp_init_allocator() {}
static void __kmp_fini_allocator() {}
541 
542 /* ------------------------------------------------------------------------ */
543 
544 #ifdef KMP_DYNAMIC_LIB
545 #if KMP_OS_WINDOWS
546 
/* Forcibly put a bootstrap lock back into the released state.  Used on
   process detach, when the owning thread may already have been killed by
   the OS without releasing the lock. */
static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
  // TODO: Change to __kmp_break_bootstrap_lock().
  __kmp_init_bootstrap_lock(lck); // make the lock released
}
551 
552 static void __kmp_reset_locks_on_process_detach(int gtid_req) {
553   int i;
554   int thread_count;
555 
556   // PROCESS_DETACH is expected to be called by a thread that executes
557   // ProcessExit() or FreeLibrary(). OS terminates other threads (except the one
558   // calling ProcessExit or FreeLibrary). So, it might be safe to access the
559   // __kmp_threads[] without taking the forkjoin_lock. However, in fact, some
560   // threads can be still alive here, although being about to be terminated. The
561   // threads in the array with ds_thread==0 are most suspicious. Actually, it
562   // can be not safe to access the __kmp_threads[].
563 
564   // TODO: does it make sense to check __kmp_roots[] ?
565 
566   // Let's check that there are no other alive threads registered with the OMP
567   // lib.
568   while (1) {
569     thread_count = 0;
570     for (i = 0; i < __kmp_threads_capacity; ++i) {
571       if (!__kmp_threads)
572         continue;
573       kmp_info_t *th = __kmp_threads[i];
574       if (th == NULL)
575         continue;
576       int gtid = th->th.th_info.ds.ds_gtid;
577       if (gtid == gtid_req)
578         continue;
579       if (gtid < 0)
580         continue;
581       DWORD exit_val;
582       int alive = __kmp_is_thread_alive(th, &exit_val);
583       if (alive) {
584         ++thread_count;
585       }
586     }
587     if (thread_count == 0)
588       break; // success
589   }
590 
591   // Assume that I'm alone. Now it might be safe to check and reset locks.
592   // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
593   __kmp_reset_lock(&__kmp_forkjoin_lock);
594 #ifdef KMP_DEBUG
595   __kmp_reset_lock(&__kmp_stdio_lock);
596 #endif // KMP_DEBUG
597 }
598 
/* Windows DLL entry point.  Shuts the library down on process/thread detach;
   on process termination (lpReserved != NULL) it first breaks any bootstrap
   locks that OS-killed worker threads may have left held. */
BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );

  switch (fdwReason) {

  case DLL_PROCESS_ATTACH:
    KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));

    return TRUE;

  case DLL_PROCESS_DETACH:
    KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));

    if (lpReserved != NULL) {
      // lpReserved is used for telling the difference:
      //   lpReserved == NULL when FreeLibrary() was called,
      //   lpReserved != NULL when the process terminates.
      // When FreeLibrary() is called, worker threads remain alive. So they will
      // release the forkjoin lock by themselves. When the process terminates,
      // worker threads disappear triggering the problem of unreleased forkjoin
      // lock as described below.

      // A worker thread can take the forkjoin lock. The problem comes up if
      // that worker thread becomes dead before it releases the forkjoin lock.
      // The forkjoin lock remains taken, while the thread executing
      // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try
      // to take the forkjoin lock and will always fail, so that the application
      // will never finish [normally]. This scenario is possible if
      // __kmpc_end() has not been executed. It looks like it's not a corner
      // case, but common cases:
      // - the main function was compiled by an alternative compiler;
      // - the main function was compiled by icl but without /Qopenmp
      //   (application with plugins);
      // - application terminates by calling C exit(), Fortran CALL EXIT() or
      //   Fortran STOP.
      // - alive foreign thread prevented __kmpc_end from doing cleanup.
      //
      // This is a hack to work around the problem.
      // TODO: !!! figure out something better.
      __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
    }

    __kmp_internal_end_library(__kmp_gtid_get_specific());

    return TRUE;

  case DLL_THREAD_ATTACH:
    KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));

    /* if we want to register new siblings all the time here call
     * __kmp_get_gtid(); */
    return TRUE;

  case DLL_THREAD_DETACH:
    KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));

    __kmp_internal_end_thread(__kmp_gtid_get_specific());
    return TRUE;
  }

  return TRUE;
}
661 
662 #endif /* KMP_OS_WINDOWS */
663 #endif /* KMP_DYNAMIC_LIB */
664 
665 /* Change the library type to "status" and return the old type */
666 /* called from within initialization routines where __kmp_initz_lock is held */
667 int __kmp_change_library(int status) {
668   int old_status;
669 
670   old_status = __kmp_yield_init &
671                1; // check whether KMP_LIBRARY=throughput (even init count)
672 
673   if (status) {
674     __kmp_yield_init |= 1; // throughput => turnaround (odd init count)
675   } else {
676     __kmp_yield_init &= ~1; // turnaround => throughput (even init count)
677   }
678 
679   return old_status; // return previous setting of whether
680   // KMP_LIBRARY=throughput
681 }
682 
/* __kmp_parallel_deo -- Wait until it's our turn in the ordered section.
   Pushes the ordered construct for consistency checking (if enabled), then
   spins until the team's ordered token equals this thread's tid. */
void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  int gtid = *gtid_ref;
#ifdef BUILD_PARALLEL_ORDERED
  kmp_team_t *team = __kmp_team_from_gtid(gtid);
#endif /* BUILD_PARALLEL_ORDERED */

  if (__kmp_env_consistency_check) {
    if (__kmp_threads[gtid]->th.th_root->r.r_active)
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
#else
      __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
#endif
  }
#ifdef BUILD_PARALLEL_ORDERED
  if (!team->t.t_serialized) {
    KMP_MB();
    // Spin until the ordered token reaches our tid.
    KMP_WAIT_YIELD(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid),
                   KMP_EQ, NULL);
    KMP_MB();
  }
#endif /* BUILD_PARALLEL_ORDERED */
}
707 
/* __kmp_parallel_dxo -- Signal the next thread in the ordered section.
   Pops the ordered construct from the consistency-check stack (if enabled)
   and passes the ordered token to the next tid in the team. */
void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  int gtid = *gtid_ref;
#ifdef BUILD_PARALLEL_ORDERED
  int tid = __kmp_tid_from_gtid(gtid);
  kmp_team_t *team = __kmp_team_from_gtid(gtid);
#endif /* BUILD_PARALLEL_ORDERED */

  if (__kmp_env_consistency_check) {
    if (__kmp_threads[gtid]->th.th_root->r.r_active)
      __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
  }
#ifdef BUILD_PARALLEL_ORDERED
  if (!team->t.t_serialized) {
    KMP_MB(); /* Flush all pending memory write invalidates.  */

    /* use the tid of the next thread in this team */
    /* TODO replace with general release procedure */
    team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);

#if OMPT_SUPPORT && OMPT_BLAME
    if (ompt_enabled &&
        ompt_callbacks.ompt_callback(ompt_event_release_ordered)) {
      /* accept blame for "ordered" waiting */
      kmp_info_t *this_thread = __kmp_threads[gtid];
      ompt_callbacks.ompt_callback(ompt_event_release_ordered)(
          this_thread->th.ompt_thread_info.wait_id);
    }
#endif

    KMP_MB(); /* Flush all pending memory write invalidates.  */
  }
#endif /* BUILD_PARALLEL_ORDERED */
}
742 
743 /* ------------------------------------------------------------------------ */
744 /* The BARRIER for a SINGLE process section is always explicit   */
745 
/* Enter a SINGLE region.  Returns nonzero iff the calling thread won the
   race to execute the single block (always 1 for a serialized team).
   push_ws controls whether the construct is pushed onto the consistency-
   check workshare stack or only checked against it. */
int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
  int status;
  kmp_info_t *th;
  kmp_team_t *team;

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  th = __kmp_threads[gtid];
  team = th->th.th_team;
  status = 0;

  th->th.th_ident = id_ref;

  if (team->t.t_serialized) {
    // Serialized team: the single thread trivially owns the single block.
    status = 1;
  } else {
    kmp_int32 old_this = th->th.th_local.this_construct;

    ++th->th.th_local.this_construct;
    /* try to set team count to thread count--success means thread got the
       single block */
    /* TODO: Should this be acquire or release? */
    if (team->t.t_construct == old_this) {
      status = KMP_COMPARE_AND_STORE_ACQ32(&team->t.t_construct, old_this,
                                           th->th.th_local.this_construct);
    }
#if USE_ITT_BUILD
    if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
        KMP_MASTER_GTID(gtid) &&
#if OMP_40_ENABLED
        th->th.th_teams_microtask == NULL &&
#endif
        team->t.t_active_level ==
            1) { // Only report metadata by master of active team at level 1
      __kmp_itt_metadata_single(id_ref);
    }
#endif /* USE_ITT_BUILD */
  }

  if (__kmp_env_consistency_check) {
    if (status && push_ws) {
      __kmp_push_workshare(gtid, ct_psingle, id_ref);
    } else {
      __kmp_check_workshare(gtid, ct_psingle, id_ref);
    }
  }
#if USE_ITT_BUILD
  if (status) {
    // Winner opens the ITT "single" frame; closed in __kmp_exit_single().
    __kmp_itt_single_start(gtid);
  }
#endif /* USE_ITT_BUILD */
  return status;
}
800 
/* Leave a SINGLE region: close the ITT "single" frame opened by
   __kmp_enter_single() and pop the construct from the consistency-check
   workshare stack. */
void __kmp_exit_single(int gtid) {
#if USE_ITT_BUILD
  __kmp_itt_single_end(gtid);
#endif /* USE_ITT_BUILD */
  if (__kmp_env_consistency_check)
    __kmp_pop_workshare(gtid, ct_psingle, NULL);
}
808 
809 /* determine if we can go parallel or must use a serialized parallel region and
810  * how many threads we can use
811  * set_nproc is the number of threads requested for the team
812  * returns 0 if we should serialize or only use one thread,
813  * otherwise the number of threads to use
814  * The forkjoin lock is held by the caller. */
// Decide how many threads the forthcoming team may actually get.
// set_nthreads is the number of threads requested; the return value is 1
// (meaning serialize / run with a single thread) or the granted thread count.
// Caller holds __kmp_forkjoin_lock, so the global counters read here
// (__kmp_nth, __kmp_threads_capacity, ...) are stable.
// NOTE(review): the enter_teams parameter is not referenced in this body.
static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
                                 int master_tid, int set_nthreads
#if OMP_40_ENABLED
                                 ,
                                 int enter_teams
#endif /* OMP_40_ENABLED */
                                 ) {
  int capacity;
  int new_nthreads;
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  KMP_DEBUG_ASSERT(root && parent_team);

  // If dyn-var is set, dynamically adjust the number of desired threads,
  // according to the method specified by dynamic_mode.
  new_nthreads = set_nthreads;
  if (!get__dynamic_2(parent_team, master_tid)) {
    ; // dyn-var is false: keep the requested count; the hard limits are
    // still enforced further below.
  }
#ifdef USE_LOAD_BALANCE
  else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
    // Ask the load-balance heuristic how many threads the machine can absorb.
    new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
    if (new_nthreads == 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
                    "reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    if (new_nthreads < set_nthreads) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
                    "reservation to %d threads\n",
                    master_tid, new_nthreads));
    }
  }
#endif /* USE_LOAD_BALANCE */
  else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
    // Headroom under the processor count: available procs minus live threads,
    // plus the slots this root gives back (1 if the root is active, otherwise
    // its whole hot team).
    new_nthreads = __kmp_avail_proc - __kmp_nth +
                   (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
    if (new_nthreads <= 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
                    "reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    if (new_nthreads < set_nthreads) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
                    "reservation to %d threads\n",
                    master_tid, new_nthreads));
    } else {
      // Never grant more than was requested.
      new_nthreads = set_nthreads;
    }
  } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
    // Randomized team size in [1, set_nthreads]; only meaningful when more
    // than 2 threads were requested.
    if (set_nthreads > 2) {
      new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
      new_nthreads = (new_nthreads % set_nthreads) + 1;
      if (new_nthreads == 1) {
        KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
                      "reservation to 1 thread\n",
                      master_tid));
        return 1;
      }
      if (new_nthreads < set_nthreads) {
        KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
                      "reservation to %d threads\n",
                      master_tid, new_nthreads));
      }
    }
  } else {
    // Unknown dynamic_mode value: internal error.
    KMP_ASSERT(0);
  }

  // Respect KMP_ALL_THREADS, KMP_MAX_THREADS, OMP_THREAD_LIMIT.
  // Clamp so that total live threads after the fork stays <= __kmp_max_nth.
  if (__kmp_nth + new_nthreads -
          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
      __kmp_max_nth) {
    int tl_nthreads = __kmp_max_nth - __kmp_nth +
                      (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
    if (tl_nthreads <= 0) {
      tl_nthreads = 1;
    }

    // If dyn-var is false, emit a 1-time warning.
    if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
      __kmp_reserve_warn = 1;
      __kmp_msg(kmp_ms_warning,
                KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
                KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
    }
    if (tl_nthreads == 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_ALL_THREADS reduced "
                    "reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_ALL_THREADS reduced "
                  "reservation to %d threads\n",
                  master_tid, tl_nthreads));
    new_nthreads = tl_nthreads;
  }

  // Check if the threads array is large enough, or needs expanding.
  //
  // See comment in __kmp_register_root() about the adjustment if
  // __kmp_threads[0] == NULL.
  capacity = __kmp_threads_capacity;
  if (TCR_PTR(__kmp_threads[0]) == NULL) {
    --capacity;
  }
  if (__kmp_nth + new_nthreads -
          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
      capacity) {
    // Expand the threads array.
    int slotsRequired = __kmp_nth + new_nthreads -
                        (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
                        capacity;
    int slotsAdded = __kmp_expand_threads(slotsRequired, slotsRequired);
    if (slotsAdded < slotsRequired) {
      // The threads array was not expanded enough. Shrink the grant to what
      // actually fits and warn once (when dyn-var is false).
      new_nthreads -= (slotsRequired - slotsAdded);
      KMP_ASSERT(new_nthreads >= 1);

      // If dyn-var is false, emit a 1-time warning.
      if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
        __kmp_reserve_warn = 1;
        if (__kmp_tp_cached) {
          // Threadprivate cache is in use, so suggest raising its capacity
          // in addition to the system thread limit.
          __kmp_msg(kmp_ms_warning,
                    KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
                    KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
                    KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
        } else {
          __kmp_msg(kmp_ms_warning,
                    KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
                    KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
        }
      }
    }
  }

  if (new_nthreads == 1) {
    KC_TRACE(10,
             ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
              "dead roots and rechecking; requested %d threads\n",
              __kmp_get_gtid(), set_nthreads));
    return 1;
  }

  KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested "
                "%d threads\n",
                __kmp_get_gtid(), new_nthreads, set_nthreads));
  return new_nthreads;
}
965 
966 /* Allocate threads from the thread pool and assign them to the new team. We are
967    assured that there are enough threads available, because we checked on that
968    earlier within critical section forkjoin */
// Populate the new team with threads: slot 0 gets the master, slots 1..nproc-1
// are filled from the thread pool (or newly forked). The caller has already
// reserved enough capacity under the forkjoin lock, so allocation here cannot
// fail for lack of slots. For a hot team the workers are already in place and
// only the master's cached fields are refreshed.
static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
                                    kmp_info_t *master_th, int master_gtid) {
  int i;
  int use_hot_team;

  KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
  KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
  KMP_MB();

  /* first, let's setup the master thread */
  master_th->th.th_info.ds.ds_tid = 0;
  master_th->th.th_team = team;
  master_th->th.th_team_nproc = team->t.t_nproc;
  master_th->th.th_team_master = master_th;
  master_th->th.th_team_serialized = FALSE;
  master_th->th.th_dispatch = &team->t.t_dispatch[0];

/* make sure we are not the optimized hot team */
#if KMP_NESTED_HOT_TEAMS
  use_hot_team = 0;
  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
  if (hot_teams) { // hot teams array is not allocated if
    // KMP_HOT_TEAMS_MAX_LEVEL=0
    int level = team->t.t_active_level - 1; // index in array of hot teams
    if (master_th->th.th_teams_microtask) { // are we inside the teams?
      if (master_th->th.th_teams_size.nteams > 1) {
        ++level; // level was not increased in teams construct for
        // team_of_masters
      }
      if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
          master_th->th.th_teams_level == team->t.t_level) {
        ++level; // level was not increased in teams construct for
        // team_of_workers before the parallel
      } // team->t.t_level will be increased inside parallel
    }
    if (level < __kmp_hot_teams_max_level) {
      if (hot_teams[level].hot_team) {
        // hot team has already been allocated for given level
        KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
        use_hot_team = 1; // the team is ready to use
      } else {
        use_hot_team = 0; // AC: threads are not allocated yet
        hot_teams[level].hot_team = team; // remember new hot team
        hot_teams[level].hot_team_nth = team->t.t_nproc;
      }
    } else {
      // Nesting deeper than the configured hot-team depth: no caching.
      use_hot_team = 0;
    }
  }
#else
  use_hot_team = team == root->r.r_hot_team;
#endif
  if (!use_hot_team) {

    /* install the master thread */
    team->t.t_threads[0] = master_th;
    __kmp_initialize_info(master_th, team, 0, master_gtid);

    /* now, install the worker threads */
    for (i = 1; i < team->t.t_nproc; i++) {

      /* fork or reallocate a new thread and install it in team */
      kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
      team->t.t_threads[i] = thr;
      KMP_DEBUG_ASSERT(thr);
      KMP_DEBUG_ASSERT(thr->th.th_team == team);
      /* align team and thread arrived states */
      KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
                    "T#%d(%d:%d) join =%llu, plain=%llu\n",
                    __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
                    __kmp_gtid_from_tid(i, team), team->t.t_id, i,
                    team->t.t_bar[bs_forkjoin_barrier].b_arrived,
                    team->t.t_bar[bs_plain_barrier].b_arrived));
#if OMP_40_ENABLED
      // Workers inherit the master's teams-construct context.
      thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
      thr->th.th_teams_level = master_th->th.th_teams_level;
      thr->th.th_teams_size = master_th->th.th_teams_size;
#endif
      { // Initialize threads' barrier data.
        // Sync each worker's per-barrier arrived counter with the team's, so
        // a recycled thread doesn't carry a stale count into the first barrier.
        int b;
        kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
        for (b = 0; b < bs_last_barrier; ++b) {
          balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
          KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
#if USE_DEBUGGER
          balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
#endif
        }; // for b
      }
    }

#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
    __kmp_partition_places(team);
#endif
  }

  KMP_MB();
}
1067 
1068 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1069 // Propagate any changes to the floating point control registers out to the team
1070 // We try to avoid unnecessary writes to the relevant cache line in the team
1071 // structure, so we don't make changes unless they are needed.
1072 inline static void propagateFPControl(kmp_team_t *team) {
1073   if (__kmp_inherit_fp_control) {
1074     kmp_int16 x87_fpu_control_word;
1075     kmp_uint32 mxcsr;
1076 
1077     // Get master values of FPU control flags (both X87 and vector)
1078     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1079     __kmp_store_mxcsr(&mxcsr);
1080     mxcsr &= KMP_X86_MXCSR_MASK;
1081 
1082 // There is no point looking at t_fp_control_saved here.
1083 // If it is TRUE, we still have to update the values if they are different from
1084 // those we now have.
1085 // If it is FALSE we didn't save anything yet, but our objective is the same. We
1086 // have to ensure that the values in the team are the same as those we have.
1087 // So, this code achieves what we need whether or not t_fp_control_saved is
1088 // true. By checking whether the value needs updating we avoid unnecessary
1089 // writes that would put the cache-line into a written state, causing all
1090 // threads in the team to have to read it again.
1091     KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1092     KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1093     // Although we don't use this value, other code in the runtime wants to know
1094     // whether it should restore them. So we must ensure it is correct.
1095     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1096   } else {
1097     // Similarly here. Don't write to this cache-line in the team structure
1098     // unless we have to.
1099     KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1100   }
1101 }
1102 
1103 // Do the opposite, setting the hardware registers to the updated values from
1104 // the team.
1105 inline static void updateHWFPControl(kmp_team_t *team) {
1106   if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1107     // Only reset the fp control regs if they have been changed in the team.
1108     // the parallel region that we are exiting.
1109     kmp_int16 x87_fpu_control_word;
1110     kmp_uint32 mxcsr;
1111     __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1112     __kmp_store_mxcsr(&mxcsr);
1113     mxcsr &= KMP_X86_MXCSR_MASK;
1114 
1115     if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1116       __kmp_clear_x87_fpu_status_word();
1117       __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1118     }
1119 
1120     if (team->t.t_mxcsr != mxcsr) {
1121       __kmp_load_mxcsr(&team->t.t_mxcsr);
1122     }
1123   }
1124 }
1125 #else
1126 #define propagateFPControl(x) ((void)0)
1127 #define updateHWFPControl(x) ((void)0)
1128 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1129 
1130 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1131                                      int realloc); // forward declaration
1132 
1133 /* Run a parallel region that has been serialized, so runs only in a team of the
1134    single master thread. */
1135 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1136   kmp_info_t *this_thr;
1137   kmp_team_t *serial_team;
1138 
1139   KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1140 
1141   /* Skip all this code for autopar serialized loops since it results in
1142      unacceptable overhead */
1143   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1144     return;
1145 
1146   if (!TCR_4(__kmp_init_parallel))
1147     __kmp_parallel_initialize();
1148 
1149   this_thr = __kmp_threads[global_tid];
1150   serial_team = this_thr->th.th_serial_team;
1151 
1152   /* utilize the serialized team held by this thread */
1153   KMP_DEBUG_ASSERT(serial_team);
1154   KMP_MB();
1155 
1156   if (__kmp_tasking_mode != tskm_immediate_exec) {
1157     KMP_DEBUG_ASSERT(
1158         this_thr->th.th_task_team ==
1159         this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1160     KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1161                      NULL);
1162     KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1163                   "team %p, new task_team = NULL\n",
1164                   global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1165     this_thr->th.th_task_team = NULL;
1166   }
1167 
1168 #if OMP_40_ENABLED
1169   kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1170   if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1171     proc_bind = proc_bind_false;
1172   } else if (proc_bind == proc_bind_default) {
1173     // No proc_bind clause was specified, so use the current value
1174     // of proc-bind-var for this parallel region.
1175     proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1176   }
1177   // Reset for next parallel region
1178   this_thr->th.th_set_proc_bind = proc_bind_default;
1179 #endif /* OMP_40_ENABLED */
1180 
1181   if (this_thr->th.th_team != serial_team) {
1182     // Nested level will be an index in the nested nthreads array
1183     int level = this_thr->th.th_team->t.t_level;
1184 
1185     if (serial_team->t.t_serialized) {
1186       /* this serial team was already used
1187          TODO increase performance by making this locks more specific */
1188       kmp_team_t *new_team;
1189 
1190       __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1191 
1192 #if OMPT_SUPPORT
1193       ompt_parallel_id_t ompt_parallel_id = __ompt_parallel_id_new(global_tid);
1194 #endif
1195 
1196       new_team = __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1197 #if OMPT_SUPPORT
1198                                      ompt_parallel_id,
1199 #endif
1200 #if OMP_40_ENABLED
1201                                      proc_bind,
1202 #endif
1203                                      &this_thr->th.th_current_task->td_icvs,
1204                                      0 USE_NESTED_HOT_ARG(NULL));
1205       __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1206       KMP_ASSERT(new_team);
1207 
1208       /* setup new serialized team and install it */
1209       new_team->t.t_threads[0] = this_thr;
1210       new_team->t.t_parent = this_thr->th.th_team;
1211       serial_team = new_team;
1212       this_thr->th.th_serial_team = serial_team;
1213 
1214       KF_TRACE(
1215           10,
1216           ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1217            global_tid, serial_team));
1218 
1219       /* TODO the above breaks the requirement that if we run out of resources,
1220          then we can still guarantee that serialized teams are ok, since we may
1221          need to allocate a new one */
1222     } else {
1223       KF_TRACE(
1224           10,
1225           ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1226            global_tid, serial_team));
1227     }
1228 
1229     /* we have to initialize this serial team */
1230     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1231     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1232     KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1233     serial_team->t.t_ident = loc;
1234     serial_team->t.t_serialized = 1;
1235     serial_team->t.t_nproc = 1;
1236     serial_team->t.t_parent = this_thr->th.th_team;
1237     serial_team->t.t_sched = this_thr->th.th_team->t.t_sched;
1238     this_thr->th.th_team = serial_team;
1239     serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1240 
1241     KF_TRACE(10, ("__kmpc_serialized_parallel: T#d curtask=%p\n", global_tid,
1242                   this_thr->th.th_current_task));
1243     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1244     this_thr->th.th_current_task->td_flags.executing = 0;
1245 
1246     __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1247 
1248     /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1249        implicit task for each serialized task represented by
1250        team->t.t_serialized? */
1251     copy_icvs(&this_thr->th.th_current_task->td_icvs,
1252               &this_thr->th.th_current_task->td_parent->td_icvs);
1253 
1254     // Thread value exists in the nested nthreads array for the next nested
1255     // level
1256     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1257       this_thr->th.th_current_task->td_icvs.nproc =
1258           __kmp_nested_nth.nth[level + 1];
1259     }
1260 
1261 #if OMP_40_ENABLED
1262     if (__kmp_nested_proc_bind.used &&
1263         (level + 1 < __kmp_nested_proc_bind.used)) {
1264       this_thr->th.th_current_task->td_icvs.proc_bind =
1265           __kmp_nested_proc_bind.bind_types[level + 1];
1266     }
1267 #endif /* OMP_40_ENABLED */
1268 
1269 #if USE_DEBUGGER
1270     serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1271 #endif
1272     this_thr->th.th_info.ds.ds_tid = 0;
1273 
1274     /* set thread cache values */
1275     this_thr->th.th_team_nproc = 1;
1276     this_thr->th.th_team_master = this_thr;
1277     this_thr->th.th_team_serialized = 1;
1278 
1279     serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1280     serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1281 
1282     propagateFPControl(serial_team);
1283 
1284     /* check if we need to allocate dispatch buffers stack */
1285     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1286     if (!serial_team->t.t_dispatch->th_disp_buffer) {
1287       serial_team->t.t_dispatch->th_disp_buffer =
1288           (dispatch_private_info_t *)__kmp_allocate(
1289               sizeof(dispatch_private_info_t));
1290     }
1291     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1292 
1293 #if OMPT_SUPPORT
1294     ompt_parallel_id_t ompt_parallel_id = __ompt_parallel_id_new(global_tid);
1295     __ompt_team_assign_id(serial_team, ompt_parallel_id);
1296 #endif
1297 
1298     KMP_MB();
1299 
1300   } else {
1301     /* this serialized team is already being used,
1302      * that's fine, just add another nested level */
1303     KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1304     KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1305     KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1306     ++serial_team->t.t_serialized;
1307     this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1308 
1309     // Nested level will be an index in the nested nthreads array
1310     int level = this_thr->th.th_team->t.t_level;
1311     // Thread value exists in the nested nthreads array for the next nested
1312     // level
1313     if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1314       this_thr->th.th_current_task->td_icvs.nproc =
1315           __kmp_nested_nth.nth[level + 1];
1316     }
1317     serial_team->t.t_level++;
1318     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1319                   "of serial team %p to %d\n",
1320                   global_tid, serial_team, serial_team->t.t_level));
1321 
1322     /* allocate/push dispatch buffers stack */
1323     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1324     {
1325       dispatch_private_info_t *disp_buffer =
1326           (dispatch_private_info_t *)__kmp_allocate(
1327               sizeof(dispatch_private_info_t));
1328       disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1329       serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1330     }
1331     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1332 
1333     KMP_MB();
1334   }
1335 #if OMP_40_ENABLED
1336   KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1337 #endif
1338 
1339   if (__kmp_env_consistency_check)
1340     __kmp_push_parallel(global_tid, NULL);
1341 }
1342 
1343 /* most of the work for a fork */
1344 /* return true if we really went parallel, false if serialized */
1345 int __kmp_fork_call(ident_t *loc, int gtid,
1346                     enum fork_context_e call_context, // Intel, GNU, ...
1347                     kmp_int32 argc,
1348 #if OMPT_SUPPORT
1349                     void *unwrapped_task,
1350 #endif
1351                     microtask_t microtask, launch_t invoker,
1352 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1353 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1354                     va_list *ap
1355 #else
1356                     va_list ap
1357 #endif
1358                     ) {
1359   void **argv;
1360   int i;
1361   int master_tid;
1362   int master_this_cons;
1363   kmp_team_t *team;
1364   kmp_team_t *parent_team;
1365   kmp_info_t *master_th;
1366   kmp_root_t *root;
1367   int nthreads;
1368   int master_active;
1369   int master_set_numthreads;
1370   int level;
1371 #if OMP_40_ENABLED
1372   int active_level;
1373   int teams_level;
1374 #endif
1375 #if KMP_NESTED_HOT_TEAMS
1376   kmp_hot_team_ptr_t **p_hot_teams;
1377 #endif
1378   { // KMP_TIME_BLOCK
1379     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1380     KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1381 
1382     KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1383     if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1384       /* Some systems prefer the stack for the root thread(s) to start with */
1385       /* some gap from the parent stack to prevent false sharing. */
1386       void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1387       /* These 2 lines below are so this does not get optimized out */
1388       if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1389         __kmp_stkpadding += (short)((kmp_int64)dummy);
1390     }
1391 
1392     /* initialize if needed */
1393     KMP_DEBUG_ASSERT(
1394         __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1395     if (!TCR_4(__kmp_init_parallel))
1396       __kmp_parallel_initialize();
1397 
1398     /* setup current data */
1399     master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1400     // shutdown
1401     parent_team = master_th->th.th_team;
1402     master_tid = master_th->th.th_info.ds.ds_tid;
1403     master_this_cons = master_th->th.th_local.this_construct;
1404     root = master_th->th.th_root;
1405     master_active = root->r.r_active;
1406     master_set_numthreads = master_th->th.th_set_nproc;
1407 
1408 #if OMPT_SUPPORT
1409     ompt_parallel_id_t ompt_parallel_id;
1410     ompt_task_id_t ompt_task_id;
1411     ompt_frame_t *ompt_frame;
1412     ompt_task_id_t my_task_id;
1413     ompt_parallel_id_t my_parallel_id;
1414 
1415     if (ompt_enabled) {
1416       ompt_parallel_id = __ompt_parallel_id_new(gtid);
1417       ompt_task_id = __ompt_get_task_id_internal(0);
1418       ompt_frame = __ompt_get_task_frame_internal(0);
1419     }
1420 #endif
1421 
1422     // Nested level will be an index in the nested nthreads array
1423     level = parent_team->t.t_level;
1424     // used to launch non-serial teams even if nested is not allowed
1425     active_level = parent_team->t.t_active_level;
1426 #if OMP_40_ENABLED
1427     teams_level =
1428         master_th->th
1429             .th_teams_level; // needed to check nesting inside the teams
1430 #endif
1431 #if KMP_NESTED_HOT_TEAMS
1432     p_hot_teams = &master_th->th.th_hot_teams;
1433     if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1434       *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1435           sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1436       (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1437       (*p_hot_teams)[0].hot_team_nth =
1438           1; // it is either actual or not needed (when active_level > 0)
1439     }
1440 #endif
1441 
1442 #if OMPT_SUPPORT
1443     if (ompt_enabled &&
1444         ompt_callbacks.ompt_callback(ompt_event_parallel_begin)) {
1445       int team_size = master_set_numthreads;
1446 
1447       ompt_callbacks.ompt_callback(ompt_event_parallel_begin)(
1448           ompt_task_id, ompt_frame, ompt_parallel_id, team_size, unwrapped_task,
1449           OMPT_INVOKER(call_context));
1450     }
1451 #endif
1452 
1453     master_th->th.th_ident = loc;
1454 
1455 #if OMP_40_ENABLED
1456     if (master_th->th.th_teams_microtask && ap &&
1457         microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1458       // AC: This is start of parallel that is nested inside teams construct.
1459       // The team is actual (hot), all workers are ready at the fork barrier.
1460       // No lock needed to initialize the team a bit, then free workers.
1461       parent_team->t.t_ident = loc;
1462       __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1463       parent_team->t.t_argc = argc;
1464       argv = (void **)parent_team->t.t_argv;
1465       for (i = argc - 1; i >= 0; --i)
1466 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1467 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1468         *argv++ = va_arg(*ap, void *);
1469 #else
1470         *argv++ = va_arg(ap, void *);
1471 #endif
1472       // Increment our nested depth levels, but not increase the serialization
1473       if (parent_team == master_th->th.th_serial_team) {
1474         // AC: we are in serialized parallel
1475         __kmpc_serialized_parallel(loc, gtid);
1476         KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1477         // AC: need this in order enquiry functions work
1478         // correctly, will restore at join time
1479         parent_team->t.t_serialized--;
1480 #if OMPT_SUPPORT
1481         void *dummy;
1482         void **exit_runtime_p;
1483 
1484         ompt_lw_taskteam_t lw_taskteam;
1485 
1486         if (ompt_enabled) {
1487           __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, unwrapped_task,
1488                                   ompt_parallel_id);
1489           lw_taskteam.ompt_task_info.task_id = __ompt_task_id_new(gtid);
1490           exit_runtime_p =
1491               &(lw_taskteam.ompt_task_info.frame.exit_runtime_frame);
1492 
1493           __ompt_lw_taskteam_link(&lw_taskteam, master_th);
1494 
1495 #if OMPT_TRACE
1496           /* OMPT implicit task begin */
1497           my_task_id = lw_taskteam.ompt_task_info.task_id;
1498           my_parallel_id = parent_team->t.ompt_team_info.parallel_id;
1499           if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
1500             ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
1501                 my_parallel_id, my_task_id);
1502           }
1503 #endif
1504 
1505           /* OMPT state */
1506           master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1507         } else {
1508           exit_runtime_p = &dummy;
1509         }
1510 #endif
1511 
1512         {
1513           KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1514           KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1515           __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1516 #if OMPT_SUPPORT
1517                                  ,
1518                                  exit_runtime_p
1519 #endif
1520                                  );
1521         }
1522 
1523 #if OMPT_SUPPORT
1524         *exit_runtime_p = NULL;
1525         if (ompt_enabled) {
1526 #if OMPT_TRACE
1527           lw_taskteam.ompt_task_info.frame.exit_runtime_frame = NULL;
1528 
1529           if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
1530             ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
1531                 ompt_parallel_id, ompt_task_id);
1532           }
1533 
1534           __ompt_lw_taskteam_unlink(master_th);
1535           // reset clear the task id only after unlinking the task
1536           lw_taskteam.ompt_task_info.task_id = ompt_task_id_none;
1537 #endif
1538 
1539           if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
1540             ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
1541                 ompt_parallel_id, ompt_task_id, OMPT_INVOKER(call_context));
1542           }
1543           master_th->th.ompt_thread_info.state = ompt_state_overhead;
1544         }
1545 #endif
1546         return TRUE;
1547       }
1548 
1549       parent_team->t.t_pkfn = microtask;
1550 #if OMPT_SUPPORT
1551       parent_team->t.ompt_team_info.microtask = unwrapped_task;
1552 #endif
1553       parent_team->t.t_invoke = invoker;
1554       KMP_TEST_THEN_INC32((kmp_int32 *)&root->r.r_in_parallel);
1555       parent_team->t.t_active_level++;
1556       parent_team->t.t_level++;
1557 
1558       /* Change number of threads in the team if requested */
1559       if (master_set_numthreads) { // The parallel has num_threads clause
1560         if (master_set_numthreads < master_th->th.th_teams_size.nth) {
1561           // AC: only can reduce number of threads dynamically, can't increase
1562           kmp_info_t **other_threads = parent_team->t.t_threads;
1563           parent_team->t.t_nproc = master_set_numthreads;
1564           for (i = 0; i < master_set_numthreads; ++i) {
1565             other_threads[i]->th.th_team_nproc = master_set_numthreads;
1566           }
1567           // Keep extra threads hot in the team for possible next parallels
1568         }
1569         master_th->th.th_set_nproc = 0;
1570       }
1571 
1572 #if USE_DEBUGGER
1573       if (__kmp_debugging) { // Let debugger override number of threads.
1574         int nth = __kmp_omp_num_threads(loc);
1575         if (nth >
1576             0) { // 0 means debugger does not want to change number of threads.
1577           master_set_numthreads = nth;
1578         }; // if
1579       }; // if
1580 #endif
1581 
1582       KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1583                     "master_th=%p, gtid=%d\n",
1584                     root, parent_team, master_th, gtid));
1585       __kmp_internal_fork(loc, gtid, parent_team);
1586       KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1587                     "master_th=%p, gtid=%d\n",
1588                     root, parent_team, master_th, gtid));
1589 
1590       /* Invoke microtask for MASTER thread */
1591       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1592                     parent_team->t.t_id, parent_team->t.t_pkfn));
1593 
1594       {
1595         KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1596         KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1597         if (!parent_team->t.t_invoke(gtid)) {
1598           KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
1599         }
1600       }
1601       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1602                     parent_team->t.t_id, parent_team->t.t_pkfn));
1603       KMP_MB(); /* Flush all pending memory write invalidates.  */
1604 
1605       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1606 
1607       return TRUE;
1608     } // Parallel closely nested in teams construct
1609 #endif /* OMP_40_ENABLED */
1610 
1611 #if KMP_DEBUG
1612     if (__kmp_tasking_mode != tskm_immediate_exec) {
1613       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1614                        parent_team->t.t_task_team[master_th->th.th_task_state]);
1615     }
1616 #endif
1617 
1618     if (parent_team->t.t_active_level >=
1619         master_th->th.th_current_task->td_icvs.max_active_levels) {
1620       nthreads = 1;
1621     } else {
1622 #if OMP_40_ENABLED
1623       int enter_teams = ((ap == NULL && active_level == 0) ||
1624                          (ap && teams_level > 0 && teams_level == level));
1625 #endif
1626       nthreads =
1627           master_set_numthreads
1628               ? master_set_numthreads
1629               : get__nproc_2(
1630                     parent_team,
1631                     master_tid); // TODO: get nproc directly from current task
1632 
1633       // Check if we need to take forkjoin lock? (no need for serialized
1634       // parallel out of teams construct). This code moved here from
1635       // __kmp_reserve_threads() to speedup nested serialized parallels.
1636       if (nthreads > 1) {
1637         if ((!get__nested(master_th) && (root->r.r_in_parallel
1638 #if OMP_40_ENABLED
1639                                          && !enter_teams
1640 #endif /* OMP_40_ENABLED */
1641                                          )) ||
1642             (__kmp_library == library_serial)) {
1643           KC_TRACE(
1644               10,
1645               ("__kmp_fork_call: T#%d serializing team; requested %d threads\n",
1646                gtid, nthreads));
1647           nthreads = 1;
1648         }
1649       }
1650       if (nthreads > 1) {
1651         /* determine how many new threads we can use */
1652         __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1653 
1654         nthreads = __kmp_reserve_threads(
1655             root, parent_team, master_tid, nthreads
1656 #if OMP_40_ENABLED
1657             /* AC: If we execute teams from parallel region (on host), then
1658                teams should be created but each can only have 1 thread if
1659                nesting is disabled. If teams called from serial region, then
1660                teams and their threads should be created regardless of the
1661                nesting setting. */
1662             ,
1663             enter_teams
1664 #endif /* OMP_40_ENABLED */
1665             );
1666         if (nthreads == 1) {
1667           // Free lock for single thread execution here; for multi-thread
1668           // execution it will be freed later after team of threads created
1669           // and initialized
1670           __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1671         }
1672       }
1673     }
1674     KMP_DEBUG_ASSERT(nthreads > 0);
1675 
1676     // If we temporarily changed the set number of threads then restore it now
1677     master_th->th.th_set_nproc = 0;
1678 
1679     /* create a serialized parallel region? */
1680     if (nthreads == 1) {
1681 /* josh todo: hypothetical question: what do we do for OS X*? */
1682 #if KMP_OS_LINUX &&                                                            \
1683     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1684       void *args[argc];
1685 #else
1686       void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1687 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1688           KMP_ARCH_AARCH64) */
1689 
1690       KA_TRACE(20,
1691                ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1692 
1693       __kmpc_serialized_parallel(loc, gtid);
1694 
1695       if (call_context == fork_context_intel) {
1696         /* TODO this sucks, use the compiler itself to pass args! :) */
1697         master_th->th.th_serial_team->t.t_ident = loc;
1698 #if OMP_40_ENABLED
1699         if (!ap) {
1700           // revert change made in __kmpc_serialized_parallel()
1701           master_th->th.th_serial_team->t.t_level--;
1702 // Get args from parent team for teams construct
1703 
1704 #if OMPT_SUPPORT
1705           void *dummy;
1706           void **exit_runtime_p;
1707 
1708           ompt_lw_taskteam_t lw_taskteam;
1709 
1710           if (ompt_enabled) {
1711             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1712                                     unwrapped_task, ompt_parallel_id);
1713             lw_taskteam.ompt_task_info.task_id = __ompt_task_id_new(gtid);
1714             exit_runtime_p =
1715                 &(lw_taskteam.ompt_task_info.frame.exit_runtime_frame);
1716 
1717             __ompt_lw_taskteam_link(&lw_taskteam, master_th);
1718 
1719 #if OMPT_TRACE
1720             my_task_id = lw_taskteam.ompt_task_info.task_id;
1721             if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
1722               ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
1723                   ompt_parallel_id, my_task_id);
1724             }
1725 #endif
1726 
1727             /* OMPT state */
1728             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1729           } else {
1730             exit_runtime_p = &dummy;
1731           }
1732 #endif
1733 
1734           {
1735             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1736             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1737             __kmp_invoke_microtask(microtask, gtid, 0, argc,
1738                                    parent_team->t.t_argv
1739 #if OMPT_SUPPORT
1740                                    ,
1741                                    exit_runtime_p
1742 #endif
1743                                    );
1744           }
1745 
1746 #if OMPT_SUPPORT
1747           *exit_runtime_p = NULL;
1748           if (ompt_enabled) {
1749             lw_taskteam.ompt_task_info.frame.exit_runtime_frame = NULL;
1750 
1751 #if OMPT_TRACE
1752             if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
1753               ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
1754                   ompt_parallel_id, ompt_task_id);
1755             }
1756 #endif
1757 
1758             __ompt_lw_taskteam_unlink(master_th);
            // reset (clear) the task id only after unlinking the task
1760             lw_taskteam.ompt_task_info.task_id = ompt_task_id_none;
1761 
1762             if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
1763               ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
1764                   ompt_parallel_id, ompt_task_id, OMPT_INVOKER(call_context));
1765             }
1766             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1767           }
1768 #endif
1769         } else if (microtask == (microtask_t)__kmp_teams_master) {
1770           KMP_DEBUG_ASSERT(master_th->th.th_team ==
1771                            master_th->th.th_serial_team);
1772           team = master_th->th.th_team;
1773           // team->t.t_pkfn = microtask;
1774           team->t.t_invoke = invoker;
1775           __kmp_alloc_argv_entries(argc, team, TRUE);
1776           team->t.t_argc = argc;
1777           argv = (void **)team->t.t_argv;
1778           if (ap) {
1779             for (i = argc - 1; i >= 0; --i)
1780 // TODO: revert workaround for Intel(R) 64 tracker #96
1781 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1782               *argv++ = va_arg(*ap, void *);
1783 #else
1784               *argv++ = va_arg(ap, void *);
1785 #endif
1786           } else {
1787             for (i = 0; i < argc; ++i)
1788               // Get args from parent team for teams construct
1789               argv[i] = parent_team->t.t_argv[i];
1790           }
1791           // AC: revert change made in __kmpc_serialized_parallel()
1792           //     because initial code in teams should have level=0
1793           team->t.t_level--;
1794           // AC: call special invoker for outer "parallel" of teams construct
1795           {
1796             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1797             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1798             invoker(gtid);
1799           }
1800         } else {
1801 #endif /* OMP_40_ENABLED */
1802           argv = args;
1803           for (i = argc - 1; i >= 0; --i)
1804 // TODO: revert workaround for Intel(R) 64 tracker #96
1805 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1806             *argv++ = va_arg(*ap, void *);
1807 #else
1808           *argv++ = va_arg(ap, void *);
1809 #endif
1810           KMP_MB();
1811 
1812 #if OMPT_SUPPORT
1813           void *dummy;
1814           void **exit_runtime_p;
1815 
1816           ompt_lw_taskteam_t lw_taskteam;
1817 
1818           if (ompt_enabled) {
1819             __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1820                                     unwrapped_task, ompt_parallel_id);
1821             lw_taskteam.ompt_task_info.task_id = __ompt_task_id_new(gtid);
1822             exit_runtime_p =
1823                 &(lw_taskteam.ompt_task_info.frame.exit_runtime_frame);
1824 
1825             __ompt_lw_taskteam_link(&lw_taskteam, master_th);
1826 
1827 #if OMPT_TRACE
1828             /* OMPT implicit task begin */
1829             my_task_id = lw_taskteam.ompt_task_info.task_id;
1830             my_parallel_id = ompt_parallel_id;
1831             if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
1832               ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
1833                   my_parallel_id, my_task_id);
1834             }
1835 #endif
1836 
1837             /* OMPT state */
1838             master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1839           } else {
1840             exit_runtime_p = &dummy;
1841           }
1842 #endif
1843 
1844           {
1845             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1846             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1847             __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1848 #if OMPT_SUPPORT
1849                                    ,
1850                                    exit_runtime_p
1851 #endif
1852                                    );
1853           }
1854 
1855 #if OMPT_SUPPORT
1856           *exit_runtime_p = NULL;
1857           if (ompt_enabled) {
1858 #if OMPT_TRACE
1859             lw_taskteam.ompt_task_info.frame.exit_runtime_frame = NULL;
1860 
1861             if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
1862               ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
1863                   my_parallel_id, my_task_id);
1864             }
1865 #endif
1866 
1867             __ompt_lw_taskteam_unlink(master_th);
            // reset (clear) the task id only after unlinking the task
1869             lw_taskteam.ompt_task_info.task_id = ompt_task_id_none;
1870 
1871             if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
1872               ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
1873                   ompt_parallel_id, ompt_task_id, OMPT_INVOKER(call_context));
1874             }
1875             master_th->th.ompt_thread_info.state = ompt_state_overhead;
1876           }
1877 #endif
1878 #if OMP_40_ENABLED
1879         }
1880 #endif /* OMP_40_ENABLED */
1881       } else if (call_context == fork_context_gnu) {
1882 #if OMPT_SUPPORT
1883         ompt_lw_taskteam_t *lwt =
1884             (ompt_lw_taskteam_t *)__kmp_allocate(sizeof(ompt_lw_taskteam_t));
1885         __ompt_lw_taskteam_init(lwt, master_th, gtid, unwrapped_task,
1886                                 ompt_parallel_id);
1887 
1888         lwt->ompt_task_info.task_id = __ompt_task_id_new(gtid);
1889         lwt->ompt_task_info.frame.exit_runtime_frame = NULL;
1890         __ompt_lw_taskteam_link(lwt, master_th);
1891 #endif
1892 
1893         // we were called from GNU native code
1894         KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1895         return FALSE;
1896       }
1897       else {
1898         KMP_ASSERT2(call_context < fork_context_last,
1899                     "__kmp_fork_call: unknown fork_context parameter");
1900       }
1901 
1902       KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1903       KMP_MB();
1904       return FALSE;
1905     }
1906 
1907     // GEH: only modify the executing flag in the case when not serialized
1908     //      serialized case is handled in kmpc_serialized_parallel
1909     KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1910                   "curtask=%p, curtask_max_aclevel=%d\n",
1911                   parent_team->t.t_active_level, master_th,
1912                   master_th->th.th_current_task,
1913                   master_th->th.th_current_task->td_icvs.max_active_levels));
1914     // TODO: GEH - cannot do this assertion because root thread not set up as
1915     // executing
1916     // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1917     master_th->th.th_current_task->td_flags.executing = 0;
1918 
1919 #if OMP_40_ENABLED
1920     if (!master_th->th.th_teams_microtask || level > teams_level)
1921 #endif /* OMP_40_ENABLED */
1922     {
1923       /* Increment our nested depth level */
1924       KMP_TEST_THEN_INC32((kmp_int32 *)&root->r.r_in_parallel);
1925     }
1926 
1927     // See if we need to make a copy of the ICVs.
1928     int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1929     if ((level + 1 < __kmp_nested_nth.used) &&
1930         (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1931       nthreads_icv = __kmp_nested_nth.nth[level + 1];
1932     } else {
1933       nthreads_icv = 0; // don't update
1934     }
1935 
1936 #if OMP_40_ENABLED
1937     // Figure out the proc_bind_policy for the new team.
1938     kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1939     kmp_proc_bind_t proc_bind_icv =
1940         proc_bind_default; // proc_bind_default means don't update
1941     if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1942       proc_bind = proc_bind_false;
1943     } else {
1944       if (proc_bind == proc_bind_default) {
1945         // No proc_bind clause specified; use current proc-bind-var for this
1946         // parallel region
1947         proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1948       }
1949       /* else: The proc_bind policy was specified explicitly on parallel clause.
1950          This overrides proc-bind-var for this parallel region, but does not
1951          change proc-bind-var. */
1952       // Figure the value of proc-bind-var for the child threads.
1953       if ((level + 1 < __kmp_nested_proc_bind.used) &&
1954           (__kmp_nested_proc_bind.bind_types[level + 1] !=
1955            master_th->th.th_current_task->td_icvs.proc_bind)) {
1956         proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1957       }
1958     }
1959 
1960     // Reset for next parallel region
1961     master_th->th.th_set_proc_bind = proc_bind_default;
1962 #endif /* OMP_40_ENABLED */
1963 
1964     if ((nthreads_icv > 0)
1965 #if OMP_40_ENABLED
1966         || (proc_bind_icv != proc_bind_default)
1967 #endif /* OMP_40_ENABLED */
1968             ) {
1969       kmp_internal_control_t new_icvs;
1970       copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
1971       new_icvs.next = NULL;
1972       if (nthreads_icv > 0) {
1973         new_icvs.nproc = nthreads_icv;
1974       }
1975 
1976 #if OMP_40_ENABLED
1977       if (proc_bind_icv != proc_bind_default) {
1978         new_icvs.proc_bind = proc_bind_icv;
1979       }
1980 #endif /* OMP_40_ENABLED */
1981 
1982       /* allocate a new parallel team */
1983       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
1984       team = __kmp_allocate_team(root, nthreads, nthreads,
1985 #if OMPT_SUPPORT
1986                                  ompt_parallel_id,
1987 #endif
1988 #if OMP_40_ENABLED
1989                                  proc_bind,
1990 #endif
1991                                  &new_icvs, argc USE_NESTED_HOT_ARG(master_th));
1992     } else {
1993       /* allocate a new parallel team */
1994       KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
1995       team = __kmp_allocate_team(root, nthreads, nthreads,
1996 #if OMPT_SUPPORT
1997                                  ompt_parallel_id,
1998 #endif
1999 #if OMP_40_ENABLED
2000                                  proc_bind,
2001 #endif
2002                                  &master_th->th.th_current_task->td_icvs,
2003                                  argc USE_NESTED_HOT_ARG(master_th));
2004     }
2005     KF_TRACE(
2006         10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2007 
2008     /* setup the new team */
2009     KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2010     KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2011     KMP_CHECK_UPDATE(team->t.t_ident, loc);
2012     KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2013     KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2014 #if OMPT_SUPPORT
2015     KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.microtask, unwrapped_task);
2016 #endif
2017     KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2018 // TODO: parent_team->t.t_level == INT_MAX ???
2019 #if OMP_40_ENABLED
2020     if (!master_th->th.th_teams_microtask || level > teams_level) {
2021 #endif /* OMP_40_ENABLED */
2022       int new_level = parent_team->t.t_level + 1;
2023       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2024       new_level = parent_team->t.t_active_level + 1;
2025       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2026 #if OMP_40_ENABLED
2027     } else {
2028       // AC: Do not increase parallel level at start of the teams construct
2029       int new_level = parent_team->t.t_level;
2030       KMP_CHECK_UPDATE(team->t.t_level, new_level);
2031       new_level = parent_team->t.t_active_level;
2032       KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2033     }
2034 #endif /* OMP_40_ENABLED */
2035     kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2036     if (team->t.t_sched.r_sched_type != new_sched.r_sched_type ||
2037         team->t.t_sched.chunk != new_sched.chunk)
2038       team->t.t_sched =
2039           new_sched; // set master's schedule as new run-time schedule
2040 
2041 #if OMP_40_ENABLED
2042     KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2043 #endif
2044 
2045     // Update the floating point rounding in the team if required.
2046     propagateFPControl(team);
2047 
2048     if (__kmp_tasking_mode != tskm_immediate_exec) {
2049       // Set master's task team to team's task team. Unless this is hot team, it
2050       // should be NULL.
2051 #if 0
2052       // Patch out an assertion that trips while the runtime seems to operate
2053       // correctly. Avoiding the preconditions that cause the assertion to trip
2054       // has been promised as a forthcoming patch.
2055       KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2056                        parent_team->t.t_task_team[master_th->th.th_task_state]);
2057 #endif
2058       KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
2059                     "%p, new task_team %p / team %p\n",
2060                     __kmp_gtid_from_thread(master_th),
2061                     master_th->th.th_task_team, parent_team,
2062                     team->t.t_task_team[master_th->th.th_task_state], team));
2063 
2064       if (active_level || master_th->th.th_task_team) {
2065         // Take a memo of master's task_state
2066         KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2067         if (master_th->th.th_task_state_top >=
2068             master_th->th.th_task_state_stack_sz) { // increase size
2069           kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2070           kmp_uint8 *old_stack, *new_stack;
2071           kmp_uint32 i;
2072           new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2073           for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2074             new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2075           }
2076           for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2077                ++i) { // zero-init rest of stack
2078             new_stack[i] = 0;
2079           }
2080           old_stack = master_th->th.th_task_state_memo_stack;
2081           master_th->th.th_task_state_memo_stack = new_stack;
2082           master_th->th.th_task_state_stack_sz = new_size;
2083           __kmp_free(old_stack);
2084         }
2085         // Store master's task_state on stack
2086         master_th->th
2087             .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2088             master_th->th.th_task_state;
2089         master_th->th.th_task_state_top++;
2090 #if KMP_NESTED_HOT_TEAMS
2091         if (team ==
2092             master_th->th.th_hot_teams[active_level]
2093                 .hot_team) { // Restore master's nested state if nested hot team
2094           master_th->th.th_task_state =
2095               master_th->th
2096                   .th_task_state_memo_stack[master_th->th.th_task_state_top];
2097         } else {
2098 #endif
2099           master_th->th.th_task_state = 0;
2100 #if KMP_NESTED_HOT_TEAMS
2101         }
2102 #endif
2103       }
2104 #if !KMP_NESTED_HOT_TEAMS
2105       KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2106                        (team == root->r.r_hot_team));
2107 #endif
2108     }
2109 
2110     KA_TRACE(
2111         20,
2112         ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2113          gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2114          team->t.t_nproc));
2115     KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2116                      (team->t.t_master_tid == 0 &&
2117                       (team->t.t_parent == root->r.r_root_team ||
2118                        team->t.t_parent->t.t_serialized)));
2119     KMP_MB();
2120 
2121     /* now, setup the arguments */
2122     argv = (void **)team->t.t_argv;
2123 #if OMP_40_ENABLED
2124     if (ap) {
2125 #endif /* OMP_40_ENABLED */
2126       for (i = argc - 1; i >= 0; --i) {
2127 // TODO: revert workaround for Intel(R) 64 tracker #96
2128 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
2129         void *new_argv = va_arg(*ap, void *);
2130 #else
2131       void *new_argv = va_arg(ap, void *);
2132 #endif
2133         KMP_CHECK_UPDATE(*argv, new_argv);
2134         argv++;
2135       }
2136 #if OMP_40_ENABLED
2137     } else {
2138       for (i = 0; i < argc; ++i) {
2139         // Get args from parent team for teams construct
2140         KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2141       }
2142     }
2143 #endif /* OMP_40_ENABLED */
2144 
2145     /* now actually fork the threads */
2146     KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2147     if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2148       root->r.r_active = TRUE;
2149 
2150     __kmp_fork_team_threads(root, team, master_th, gtid);
2151     __kmp_setup_icv_copy(team, nthreads,
2152                          &master_th->th.th_current_task->td_icvs, loc);
2153 
2154 #if OMPT_SUPPORT
2155     master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2156 #endif
2157 
2158     __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2159 
2160 #if USE_ITT_BUILD
2161     if (team->t.t_active_level == 1 // only report frames at level 1
2162 #if OMP_40_ENABLED
2163         && !master_th->th.th_teams_microtask // not in teams construct
2164 #endif /* OMP_40_ENABLED */
2165         ) {
2166 #if USE_ITT_NOTIFY
2167       if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2168           (__kmp_forkjoin_frames_mode == 3 ||
2169            __kmp_forkjoin_frames_mode == 1)) {
2170         kmp_uint64 tmp_time = 0;
2171         if (__itt_get_timestamp_ptr)
2172           tmp_time = __itt_get_timestamp();
2173         // Internal fork - report frame begin
2174         master_th->th.th_frame_time = tmp_time;
2175         if (__kmp_forkjoin_frames_mode == 3)
2176           team->t.t_region_time = tmp_time;
2177       } else // only one notification scheme (either "submit" or
2178 // "forking/joined", not both)
2179 #endif /* USE_ITT_NOTIFY */
2180           if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2181               __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2182         // Mark start of "parallel" region for VTune.
2183         __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2184       }
2185     }
2186 #endif /* USE_ITT_BUILD */
2187 
2188     /* now go on and do the work */
2189     KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2190     KMP_MB();
2191     KF_TRACE(10,
2192              ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2193               root, team, master_th, gtid));
2194 
2195 #if USE_ITT_BUILD
2196     if (__itt_stack_caller_create_ptr) {
2197       team->t.t_stack_id =
2198           __kmp_itt_stack_caller_create(); // create new stack stitching id
2199       // before entering fork barrier
2200     }
2201 #endif /* USE_ITT_BUILD */
2202 
2203 #if OMP_40_ENABLED
2204     if (ap) // AC: skip __kmp_internal_fork at teams construct, let only master
2205 // threads execute
2206 #endif /* OMP_40_ENABLED */
2207     {
2208       __kmp_internal_fork(loc, gtid, team);
2209       KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2210                     "master_th=%p, gtid=%d\n",
2211                     root, team, master_th, gtid));
2212     }
2213 
2214     if (call_context == fork_context_gnu) {
2215       KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2216       return TRUE;
2217     }
2218 
2219     /* Invoke microtask for MASTER thread */
2220     KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2221                   team->t.t_id, team->t.t_pkfn));
2222   } // END of timer KMP_fork_call block
2223 
2224   {
2225     KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
2226     KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
2227     if (!team->t.t_invoke(gtid)) {
2228       KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
2229     }
2230   }
2231   KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2232                 team->t.t_id, team->t.t_pkfn));
2233   KMP_MB(); /* Flush all pending memory write invalidates.  */
2234 
2235   KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2236 
2237 #if OMPT_SUPPORT
2238   if (ompt_enabled) {
2239     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2240   }
2241 #endif
2242 
2243   return TRUE;
2244 }
2245 
2246 #if OMPT_SUPPORT
2247 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2248                                             kmp_team_t *team) {
2249   // restore state outside the region
2250   thread->th.ompt_thread_info.state =
2251       ((team->t.t_serialized) ? ompt_state_work_serial
2252                               : ompt_state_work_parallel);
2253 }
2254 
2255 static inline void __kmp_join_ompt(kmp_info_t *thread, kmp_team_t *team,
2256                                    ompt_parallel_id_t parallel_id,
2257                                    fork_context_e fork_context) {
2258   ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
2259   if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
2260     ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
2261         parallel_id, task_info->task_id, OMPT_INVOKER(fork_context));
2262   }
2263 
2264   task_info->frame.reenter_runtime_frame = NULL;
2265   __kmp_join_restore_state(thread, team);
2266 }
2267 #endif
2268 
2269 void __kmp_join_call(ident_t *loc, int gtid
2270 #if OMPT_SUPPORT
2271                      ,
2272                      enum fork_context_e fork_context
2273 #endif
2274 #if OMP_40_ENABLED
2275                      ,
2276                      int exit_teams
2277 #endif /* OMP_40_ENABLED */
2278                      ) {
2279   KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2280   kmp_team_t *team;
2281   kmp_team_t *parent_team;
2282   kmp_info_t *master_th;
2283   kmp_root_t *root;
2284   int master_active;
2285   int i;
2286 
2287   KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2288 
2289   /* setup current data */
2290   master_th = __kmp_threads[gtid];
2291   root = master_th->th.th_root;
2292   team = master_th->th.th_team;
2293   parent_team = team->t.t_parent;
2294 
2295   master_th->th.th_ident = loc;
2296 
2297 #if OMPT_SUPPORT
2298   if (ompt_enabled) {
2299     master_th->th.ompt_thread_info.state = ompt_state_overhead;
2300   }
2301 #endif
2302 
2303 #if KMP_DEBUG
2304   if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2305     KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2306                   "th_task_team = %p\n",
2307                   __kmp_gtid_from_thread(master_th), team,
2308                   team->t.t_task_team[master_th->th.th_task_state],
2309                   master_th->th.th_task_team));
2310     KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2311                      team->t.t_task_team[master_th->th.th_task_state]);
2312   }
2313 #endif
2314 
2315   if (team->t.t_serialized) {
2316 #if OMP_40_ENABLED
2317     if (master_th->th.th_teams_microtask) {
2318       // We are in teams construct
2319       int level = team->t.t_level;
2320       int tlevel = master_th->th.th_teams_level;
2321       if (level == tlevel) {
2322         // AC: we haven't incremented it earlier at start of teams construct,
2323         //     so do it here - at the end of teams construct
2324         team->t.t_level++;
2325       } else if (level == tlevel + 1) {
2326         // AC: we are exiting parallel inside teams, need to increment
2327         // serialization in order to restore it in the next call to
2328         // __kmpc_end_serialized_parallel
2329         team->t.t_serialized++;
2330       }
2331     }
2332 #endif /* OMP_40_ENABLED */
2333     __kmpc_end_serialized_parallel(loc, gtid);
2334 
2335 #if OMPT_SUPPORT
2336     if (ompt_enabled) {
2337       __kmp_join_restore_state(master_th, parent_team);
2338     }
2339 #endif
2340 
2341     return;
2342   }
2343 
2344   master_active = team->t.t_master_active;
2345 
2346 #if OMP_40_ENABLED
2347   if (!exit_teams)
2348 #endif /* OMP_40_ENABLED */
2349   {
2350     // AC: No barrier for internal teams at exit from teams construct.
2351     //     But there is barrier for external team (league).
2352     __kmp_internal_join(loc, gtid, team);
2353   }
2354 #if OMP_40_ENABLED
2355   else {
2356     master_th->th.th_task_state =
2357         0; // AC: no tasking in teams (out of any parallel)
2358   }
2359 #endif /* OMP_40_ENABLED */
2360 
2361   KMP_MB();
2362 
2363 #if OMPT_SUPPORT
2364   ompt_parallel_id_t parallel_id = team->t.ompt_team_info.parallel_id;
2365 #endif
2366 
2367 #if USE_ITT_BUILD
2368   if (__itt_stack_caller_create_ptr) {
2369     __kmp_itt_stack_caller_destroy(
2370         (__itt_caller)team->t
2371             .t_stack_id); // destroy the stack stitching id after join barrier
2372   }
2373 
2374   // Mark end of "parallel" region for VTune.
2375   if (team->t.t_active_level == 1
2376 #if OMP_40_ENABLED
2377       && !master_th->th.th_teams_microtask /* not in teams construct */
2378 #endif /* OMP_40_ENABLED */
2379       ) {
2380     master_th->th.th_ident = loc;
2381     // only one notification scheme (either "submit" or "forking/joined", not
2382     // both)
2383     if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2384         __kmp_forkjoin_frames_mode == 3)
2385       __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2386                              master_th->th.th_frame_time, 0, loc,
2387                              master_th->th.th_team_nproc, 1);
2388     else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2389              !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2390       __kmp_itt_region_joined(gtid);
2391   } // active_level == 1
2392 #endif /* USE_ITT_BUILD */
2393 
2394 #if OMP_40_ENABLED
2395   if (master_th->th.th_teams_microtask && !exit_teams &&
2396       team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2397       team->t.t_level == master_th->th.th_teams_level + 1) {
2398     // AC: We need to leave the team structure intact at the end of parallel
2399     // inside the teams construct, so that at the next parallel same (hot) team
2400     // works, only adjust nesting levels
2401 
2402     /* Decrement our nested depth level */
2403     team->t.t_level--;
2404     team->t.t_active_level--;
2405     KMP_TEST_THEN_DEC32((kmp_int32 *)&root->r.r_in_parallel);
2406 
2407     /* Restore number of threads in the team if needed */
2408     if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2409       int old_num = master_th->th.th_team_nproc;
2410       int new_num = master_th->th.th_teams_size.nth;
2411       kmp_info_t **other_threads = team->t.t_threads;
2412       team->t.t_nproc = new_num;
2413       for (i = 0; i < old_num; ++i) {
2414         other_threads[i]->th.th_team_nproc = new_num;
2415       }
2416       // Adjust states of non-used threads of the team
2417       for (i = old_num; i < new_num; ++i) {
2418         // Re-initialize thread's barrier data.
2419         int b;
2420         kmp_balign_t *balign = other_threads[i]->th.th_bar;
2421         for (b = 0; b < bs_last_barrier; ++b) {
2422           balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2423           KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2424 #if USE_DEBUGGER
2425           balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2426 #endif
2427         }
2428         if (__kmp_tasking_mode != tskm_immediate_exec) {
2429           // Synchronize thread's task state
2430           other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2431         }
2432       }
2433     }
2434 
2435 #if OMPT_SUPPORT
2436     if (ompt_enabled) {
2437       __kmp_join_ompt(master_th, parent_team, parallel_id, fork_context);
2438     }
2439 #endif
2440 
2441     return;
2442   }
2443 #endif /* OMP_40_ENABLED */
2444 
2445   /* do cleanup and restore the parent team */
2446   master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2447   master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2448 
2449   master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2450 
2451   /* jc: The following lock has instructions with REL and ACQ semantics,
2452      separating the parallel user code called in this parallel region
2453      from the serial user code called after this function returns. */
2454   __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2455 
2456 #if OMP_40_ENABLED
2457   if (!master_th->th.th_teams_microtask ||
2458       team->t.t_level > master_th->th.th_teams_level)
2459 #endif /* OMP_40_ENABLED */
2460   {
2461     /* Decrement our nested depth level */
2462     KMP_TEST_THEN_DEC32((kmp_int32 *)&root->r.r_in_parallel);
2463   }
2464   KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2465 
2466 #if OMPT_SUPPORT && OMPT_TRACE
2467   if (ompt_enabled) {
2468     ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
2469     if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
2470       ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
2471           parallel_id, task_info->task_id);
2472     }
2473     task_info->frame.exit_runtime_frame = NULL;
2474     task_info->task_id = 0;
2475   }
2476 #endif
2477 
2478   KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2479                 master_th, team));
2480   __kmp_pop_current_task_from_thread(master_th);
2481 
2482 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
2483   // Restore master thread's partition.
2484   master_th->th.th_first_place = team->t.t_first_place;
2485   master_th->th.th_last_place = team->t.t_last_place;
2486 #endif /* OMP_40_ENABLED */
2487 
2488   updateHWFPControl(team);
2489 
2490   if (root->r.r_active != master_active)
2491     root->r.r_active = master_active;
2492 
2493   __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2494                             master_th)); // this will free worker threads
2495 
2496   /* this race was fun to find. make sure the following is in the critical
2497      region otherwise assertions may fail occasionally since the old team may be
2498      reallocated and the hierarchy appears inconsistent. it is actually safe to
2499      run and won't cause any bugs, but will cause those assertion failures. it's
2500      only one deref&assign so might as well put this in the critical region */
2501   master_th->th.th_team = parent_team;
2502   master_th->th.th_team_nproc = parent_team->t.t_nproc;
2503   master_th->th.th_team_master = parent_team->t.t_threads[0];
2504   master_th->th.th_team_serialized = parent_team->t.t_serialized;
2505 
2506   /* restore serialized team, if need be */
2507   if (parent_team->t.t_serialized &&
2508       parent_team != master_th->th.th_serial_team &&
2509       parent_team != root->r.r_root_team) {
2510     __kmp_free_team(root,
2511                     master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2512     master_th->th.th_serial_team = parent_team;
2513   }
2514 
2515   if (__kmp_tasking_mode != tskm_immediate_exec) {
2516     if (master_th->th.th_task_state_top >
2517         0) { // Restore task state from memo stack
2518       KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2519       // Remember master's state if we re-use this nested hot team
2520       master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2521           master_th->th.th_task_state;
2522       --master_th->th.th_task_state_top; // pop
2523       // Now restore state at this level
2524       master_th->th.th_task_state =
2525           master_th->th
2526               .th_task_state_memo_stack[master_th->th.th_task_state_top];
2527     }
2528     // Copy the task team from the parent team to the master thread
2529     master_th->th.th_task_team =
2530         parent_team->t.t_task_team[master_th->th.th_task_state];
2531     KA_TRACE(20,
2532              ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2533               __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2534               parent_team));
2535   }
2536 
2537   // TODO: GEH - cannot do this assertion because root thread not set up as
2538   // executing
2539   // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2540   master_th->th.th_current_task->td_flags.executing = 1;
2541 
2542   __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2543 
2544 #if OMPT_SUPPORT
2545   if (ompt_enabled) {
2546     __kmp_join_ompt(master_th, parent_team, parallel_id, fork_context);
2547   }
2548 #endif
2549 
2550   KMP_MB();
2551   KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2552 }
2553 
2554 /* Check whether we should push an internal control record onto the
2555    serial team stack.  If so, do it.  */
2556 void __kmp_save_internal_controls(kmp_info_t *thread) {
2557 
2558   if (thread->th.th_team != thread->th.th_serial_team) {
2559     return;
2560   }
2561   if (thread->th.th_team->t.t_serialized > 1) {
2562     int push = 0;
2563 
2564     if (thread->th.th_team->t.t_control_stack_top == NULL) {
2565       push = 1;
2566     } else {
2567       if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2568           thread->th.th_team->t.t_serialized) {
2569         push = 1;
2570       }
2571     }
2572     if (push) { /* push a record on the serial team's stack */
2573       kmp_internal_control_t *control =
2574           (kmp_internal_control_t *)__kmp_allocate(
2575               sizeof(kmp_internal_control_t));
2576 
2577       copy_icvs(control, &thread->th.th_current_task->td_icvs);
2578 
2579       control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2580 
2581       control->next = thread->th.th_team->t.t_control_stack_top;
2582       thread->th.th_team->t.t_control_stack_top = control;
2583     }
2584   }
2585 }
2586 
/* Changes the nproc ICV for the calling thread (omp_set_num_threads), and
   eagerly shrinks the root's hot team when the new value is smaller. */
void __kmp_set_num_threads(int new_nth, int gtid) {
  kmp_info_t *thread;
  kmp_root_t *root;

  KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
  KMP_DEBUG_ASSERT(__kmp_init_serial);

  // Clamp the request into the supported range [1, __kmp_max_nth].
  if (new_nth < 1)
    new_nth = 1;
  else if (new_nth > __kmp_max_nth)
    new_nth = __kmp_max_nth;

  KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
  thread = __kmp_threads[gtid];

  // Preserve the current ICVs first if we are inside a serialized region.
  __kmp_save_internal_controls(thread);

  set__nproc(thread, new_nth);

  // If this omp_set_num_threads() call will cause the hot team size to be
  // reduced (in the absence of a num_threads clause), then reduce it now,
  // rather than waiting for the next parallel region.
  root = thread->th.th_root;
  if (__kmp_init_parallel && (!root->r.r_active) &&
      (root->r.r_hot_team->t.t_nproc > new_nth)
#if KMP_NESTED_HOT_TEAMS
      && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
#endif
      ) {
    kmp_team_t *hot_team = root->r.r_hot_team;
    int f;

    // Thread release must be serialized with fork/join activity.
    __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);

    // Release the extra threads we don't need any more.
    for (f = new_nth; f < hot_team->t.t_nproc; f++) {
      KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
      if (__kmp_tasking_mode != tskm_immediate_exec) {
        // When decreasing team size, threads no longer in the team should unref
        // task team.
        hot_team->t.t_threads[f]->th.th_task_team = NULL;
      }
      __kmp_free_thread(hot_team->t.t_threads[f]);
      hot_team->t.t_threads[f] = NULL;
    }
    hot_team->t.t_nproc = new_nth;
#if KMP_NESTED_HOT_TEAMS
    if (thread->th.th_hot_teams) {
      KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
      thread->th.th_hot_teams[0].hot_team_nth = new_nth;
    }
#endif

    __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);

    // Update the t_nproc field in the threads that are still active.
    for (f = 0; f < new_nth; f++) {
      KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
      hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
    }
    // Special flag in case omp_set_num_threads() call
    hot_team->t.t_size_changed = -1;
  }
}
2652 
2653 /* Changes max_active_levels */
2654 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2655   kmp_info_t *thread;
2656 
2657   KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2658                 "%d = (%d)\n",
2659                 gtid, max_active_levels));
2660   KMP_DEBUG_ASSERT(__kmp_init_serial);
2661 
2662   // validate max_active_levels
2663   if (max_active_levels < 0) {
2664     KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2665     // We ignore this call if the user has specified a negative value.
2666     // The current setting won't be changed. The last valid setting will be
2667     // used. A warning will be issued (if warnings are allowed as controlled by
2668     // the KMP_WARNINGS env var).
2669     KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2670                   "max_active_levels for thread %d = (%d)\n",
2671                   gtid, max_active_levels));
2672     return;
2673   }
2674   if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2675     // it's OK, the max_active_levels is within the valid range: [ 0;
2676     // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2677     // We allow a zero value. (implementation defined behavior)
2678   } else {
2679     KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2680                 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2681     max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2682     // Current upper limit is MAX_INT. (implementation defined behavior)
2683     // If the input exceeds the upper limit, we correct the input to be the
2684     // upper limit. (implementation defined behavior)
2685     // Actually, the flow should never get here until we use MAX_INT limit.
2686   }
2687   KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2688                 "max_active_levels for thread %d = (%d)\n",
2689                 gtid, max_active_levels));
2690 
2691   thread = __kmp_threads[gtid];
2692 
2693   __kmp_save_internal_controls(thread);
2694 
2695   set__max_active_levels(thread, max_active_levels);
2696 }
2697 
2698 /* Gets max_active_levels */
2699 int __kmp_get_max_active_levels(int gtid) {
2700   kmp_info_t *thread;
2701 
2702   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2703   KMP_DEBUG_ASSERT(__kmp_init_serial);
2704 
2705   thread = __kmp_threads[gtid];
2706   KMP_DEBUG_ASSERT(thread->th.th_current_task);
2707   KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2708                 "curtask_maxaclevel=%d\n",
2709                 gtid, thread->th.th_current_task,
2710                 thread->th.th_current_task->td_icvs.max_active_levels));
2711   return thread->th.th_current_task->td_icvs.max_active_levels;
2712 }
2713 
2714 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2715 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2716   kmp_info_t *thread;
2717   //    kmp_team_t *team;
2718 
2719   KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2720                 gtid, (int)kind, chunk));
2721   KMP_DEBUG_ASSERT(__kmp_init_serial);
2722 
2723   // Check if the kind parameter is valid, correct if needed.
2724   // Valid parameters should fit in one of two intervals - standard or extended:
2725   //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2726   // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
2727   if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2728       (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2729     // TODO: Hint needs attention in case we change the default schedule.
2730     __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2731               KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2732               __kmp_msg_null);
2733     kind = kmp_sched_default;
2734     chunk = 0; // ignore chunk value in case of bad kind
2735   }
2736 
2737   thread = __kmp_threads[gtid];
2738 
2739   __kmp_save_internal_controls(thread);
2740 
2741   if (kind < kmp_sched_upper_std) {
2742     if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2743       // differ static chunked vs. unchunked:  chunk should be invalid to
2744       // indicate unchunked schedule (which is the default)
2745       thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2746     } else {
2747       thread->th.th_current_task->td_icvs.sched.r_sched_type =
2748           __kmp_sch_map[kind - kmp_sched_lower - 1];
2749     }
2750   } else {
2751     //    __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2752     //    kmp_sched_lower - 2 ];
2753     thread->th.th_current_task->td_icvs.sched.r_sched_type =
2754         __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2755                       kmp_sched_lower - 2];
2756   }
2757   if (kind == kmp_sched_auto) {
2758     // ignore parameter chunk for schedule auto
2759     thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2760   } else {
2761     thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2762   }
2763 }
2764 
2765 /* Gets def_sched_var ICV values */
2766 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2767   kmp_info_t *thread;
2768   enum sched_type th_type;
2769 
2770   KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2771   KMP_DEBUG_ASSERT(__kmp_init_serial);
2772 
2773   thread = __kmp_threads[gtid];
2774 
2775   th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2776 
2777   switch (th_type) {
2778   case kmp_sch_static:
2779   case kmp_sch_static_greedy:
2780   case kmp_sch_static_balanced:
2781     *kind = kmp_sched_static;
2782     *chunk = 0; // chunk was not set, try to show this fact via zero value
2783     return;
2784   case kmp_sch_static_chunked:
2785     *kind = kmp_sched_static;
2786     break;
2787   case kmp_sch_dynamic_chunked:
2788     *kind = kmp_sched_dynamic;
2789     break;
2790   case kmp_sch_guided_chunked:
2791   case kmp_sch_guided_iterative_chunked:
2792   case kmp_sch_guided_analytical_chunked:
2793     *kind = kmp_sched_guided;
2794     break;
2795   case kmp_sch_auto:
2796     *kind = kmp_sched_auto;
2797     break;
2798   case kmp_sch_trapezoidal:
2799     *kind = kmp_sched_trapezoidal;
2800     break;
2801 #if KMP_STATIC_STEAL_ENABLED
2802   case kmp_sch_static_steal:
2803     *kind = kmp_sched_static_steal;
2804     break;
2805 #endif
2806   default:
2807     KMP_FATAL(UnknownSchedulingType, th_type);
2808   }
2809 
2810   *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2811 }
2812 
// Return the thread number this thread's ancestor had at nesting depth
// `level` (depth 0 is the outermost), or -1 when `level` is out of range.
int __kmp_get_ancestor_thread_num(int gtid, int level) {

  int ii, dd;
  kmp_team_t *team;
  kmp_info_t *thr;

  KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
  KMP_DEBUG_ASSERT(__kmp_init_serial);

  // validate level: depth 0 is always thread 0; negative depths are invalid.
  if (level == 0)
    return 0;
  if (level < 0)
    return -1;
  thr = __kmp_threads[gtid];
  team = thr->th.th_team;
  ii = team->t.t_level; // current nesting depth of this thread's team
  if (level > ii)
    return -1; // asked about a level deeper than we currently are

#if OMP_40_ENABLED
  if (thr->th.th_teams_microtask) {
    // AC: we are in teams region where multiple nested teams have same level
    int tlevel = thr->th.th_teams_level; // the level of the teams construct
    if (level <=
        tlevel) { // otherwise usual algorithm works (will not touch the teams)
      KMP_DEBUG_ASSERT(ii >= tlevel);
      // AC: As we need to pass by the teams league, we need to artificially
      // increase ii
      if (ii == tlevel) {
        ii += 2; // three teams have same level
      } else {
        ii++; // two teams have same level
      }
    }
  }
#endif

  if (ii == level)
    return __kmp_tid_from_gtid(gtid); // asking about our own level

  // Walk up the hierarchy: serialized regions share one team structure, so
  // first consume up to t_serialized levels before moving to a parent team.
  dd = team->t.t_serialized;
  level++;
  while (ii > level) {
    for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
    }
    // All serialized levels of this team consumed; go to the parent.
    if ((team->t.t_serialized) && (!dd)) {
      team = team->t.t_parent;
      continue;
    }
    if (ii > level) {
      team = team->t.t_parent;
      dd = team->t.t_serialized;
      ii--;
    }
  }

  // Inside a serialized region (dd > 1) the ancestor is thread 0; otherwise
  // it is the master tid recorded in the team we stopped at.
  return (dd > 1) ? (0) : (team->t.t_master_tid);
}
2872 
// Return the size of the team this thread belonged to at nesting depth
// `level` (depth 0 is the outermost, which always has size 1), or -1 when
// `level` is out of range.
int __kmp_get_team_size(int gtid, int level) {

  int ii, dd;
  kmp_team_t *team;
  kmp_info_t *thr;

  KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
  KMP_DEBUG_ASSERT(__kmp_init_serial);

  // validate level: the outermost level has exactly one thread; negative
  // depths are invalid.
  if (level == 0)
    return 1;
  if (level < 0)
    return -1;
  thr = __kmp_threads[gtid];
  team = thr->th.th_team;
  ii = team->t.t_level; // current nesting depth of this thread's team
  if (level > ii)
    return -1; // asked about a level deeper than we currently are

#if OMP_40_ENABLED
  if (thr->th.th_teams_microtask) {
    // AC: we are in teams region where multiple nested teams have same level
    int tlevel = thr->th.th_teams_level; // the level of the teams construct
    if (level <=
        tlevel) { // otherwise usual algorithm works (will not touch the teams)
      KMP_DEBUG_ASSERT(ii >= tlevel);
      // AC: As we need to pass by the teams league, we need to artificially
      // increase ii
      if (ii == tlevel) {
        ii += 2; // three teams have same level
      } else {
        ii++; // two teams have same level
      }
    }
  }
#endif

  // Walk up the hierarchy: serialized regions share one team structure, so
  // first consume up to t_serialized levels before moving to a parent team.
  while (ii > level) {
    for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
    }
    // All serialized levels of this team consumed; go to the parent.
    if (team->t.t_serialized && (!dd)) {
      team = team->t.t_parent;
      continue;
    }
    if (ii > level) {
      team = team->t.t_parent;
      ii--;
    }
  }

  return team->t.t_nproc;
}
2926 
2927 kmp_r_sched_t __kmp_get_schedule_global() {
2928   // This routine created because pairs (__kmp_sched, __kmp_chunk) and
2929   // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
2930   // independently. So one can get the updated schedule here.
2931 
2932   kmp_r_sched_t r_sched;
2933 
2934   // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
2935   // __kmp_guided. __kmp_sched should keep original value, so that user can set
2936   // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
2937   // different roots (even in OMP 2.5)
2938   if (__kmp_sched == kmp_sch_static) {
2939     r_sched.r_sched_type = __kmp_static; // replace STATIC with more detailed
2940     // schedule (balanced or greedy)
2941   } else if (__kmp_sched == kmp_sch_guided_chunked) {
2942     r_sched.r_sched_type = __kmp_guided; // replace GUIDED with more detailed
2943     // schedule (iterative or analytical)
2944   } else {
2945     r_sched.r_sched_type =
2946         __kmp_sched; // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2947   }
2948 
2949   if (__kmp_chunk < KMP_DEFAULT_CHUNK) { // __kmp_chunk may be wrong here (if it
2950     // was not ever set)
2951     r_sched.chunk = KMP_DEFAULT_CHUNK;
2952   } else {
2953     r_sched.chunk = __kmp_chunk;
2954   }
2955 
2956   return r_sched;
2957 }
2958 
2959 /* Allocate (realloc == FALSE) * or reallocate (realloc == TRUE)
2960    at least argc number of *t_argv entries for the requested team. */
2961 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
2962 
2963   KMP_DEBUG_ASSERT(team);
2964   if (!realloc || argc > team->t.t_max_argc) {
2965 
2966     KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
2967                    "current entries=%d\n",
2968                    team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
2969     /* if previously allocated heap space for args, free them */
2970     if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
2971       __kmp_free((void *)team->t.t_argv);
2972 
2973     if (argc <= KMP_INLINE_ARGV_ENTRIES) {
2974       /* use unused space in the cache line for arguments */
2975       team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
2976       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
2977                      "argv entries\n",
2978                      team->t.t_id, team->t.t_max_argc));
2979       team->t.t_argv = &team->t.t_inline_argv[0];
2980       if (__kmp_storage_map) {
2981         __kmp_print_storage_map_gtid(
2982             -1, &team->t.t_inline_argv[0],
2983             &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
2984             (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
2985             team->t.t_id);
2986       }
2987     } else {
2988       /* allocate space for arguments in the heap */
2989       team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
2990                                ? KMP_MIN_MALLOC_ARGV_ENTRIES
2991                                : 2 * argc;
2992       KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
2993                      "argv entries\n",
2994                      team->t.t_id, team->t.t_max_argc));
2995       team->t.t_argv =
2996           (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
2997       if (__kmp_storage_map) {
2998         __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
2999                                      &team->t.t_argv[team->t.t_max_argc],
3000                                      sizeof(void *) * team->t.t_max_argc,
3001                                      "team_%d.t_argv", team->t.t_id);
3002       }
3003     }
3004   }
3005 }
3006 
3007 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3008   int i;
3009   int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3010   team->t.t_threads =
3011       (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3012   team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3013       sizeof(dispatch_shared_info_t) * num_disp_buff);
3014   team->t.t_dispatch =
3015       (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3016   team->t.t_implicit_task_taskdata =
3017       (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3018   team->t.t_max_nproc = max_nth;
3019 
3020   /* setup dispatch buffers */
3021   for (i = 0; i < num_disp_buff; ++i) {
3022     team->t.t_disp_buffer[i].buffer_index = i;
3023 #if OMP_45_ENABLED
3024     team->t.t_disp_buffer[i].doacross_buf_idx = i;
3025 #endif
3026   }
3027 }
3028 
3029 static void __kmp_free_team_arrays(kmp_team_t *team) {
3030   /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3031   int i;
3032   for (i = 0; i < team->t.t_max_nproc; ++i) {
3033     if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3034       __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3035       team->t.t_dispatch[i].th_disp_buffer = NULL;
3036     }; // if
3037   }; // for
3038   __kmp_free(team->t.t_threads);
3039   __kmp_free(team->t.t_disp_buffer);
3040   __kmp_free(team->t.t_dispatch);
3041   __kmp_free(team->t.t_implicit_task_taskdata);
3042   team->t.t_threads = NULL;
3043   team->t.t_disp_buffer = NULL;
3044   team->t.t_dispatch = NULL;
3045   team->t.t_implicit_task_taskdata = 0;
3046 }
3047 
3048 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3049   kmp_info_t **oldThreads = team->t.t_threads;
3050 
3051   __kmp_free(team->t.t_disp_buffer);
3052   __kmp_free(team->t.t_dispatch);
3053   __kmp_free(team->t.t_implicit_task_taskdata);
3054   __kmp_allocate_team_arrays(team, max_nth);
3055 
3056   KMP_MEMCPY(team->t.t_threads, oldThreads,
3057              team->t.t_nproc * sizeof(kmp_info_t *));
3058 
3059   __kmp_free(oldThreads);
3060 }
3061 
// Assemble a kmp_internal_control_t snapshot from the current global ICV
// settings (used e.g. by __kmp_initialize_root to seed new root/hot teams).
static kmp_internal_control_t __kmp_get_global_icvs(void) {

  kmp_r_sched_t r_sched =
      __kmp_get_schedule_global(); // get current state of scheduling globals

#if OMP_40_ENABLED
  KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
#endif /* OMP_40_ENABLED */

  // NOTE: field order must match the declaration of kmp_internal_control_t,
  // including the conditionally-compiled members.
  kmp_internal_control_t g_icvs = {
    0, // int serial_nesting_level; //corresponds to value of th_team_serialized
    (kmp_int8)__kmp_dflt_nested, // int nested; //internal control
    // for nested parallelism (per thread)
    (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
    // adjustment of threads (per thread)
    (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
    // whether blocktime is explicitly set
    __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
#if KMP_USE_MONITOR
    __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
// intervals
#endif
    __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
    // next parallel region (per thread)
    // (use a max ub on value if __kmp_parallel_initialize not called yet)
    __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
    // for max_active_levels
    r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
// {sched,chunk} pair
#if OMP_40_ENABLED
    __kmp_nested_proc_bind.bind_types[0],
    __kmp_default_device,
#endif /* OMP_40_ENABLED */
    NULL // struct kmp_internal_control *next;
  };

  return g_icvs;
}
3100 
// Derive an ICV record from an existing team by copying the master thread's
// current-task ICVs, detached from any control stack (next = NULL).
// NOTE(review): serial_nesting_level is assigned before copy_icvs(); whether
// that value survives depends on what copy_icvs copies -- confirm against
// its definition.
static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {

  kmp_internal_control_t gx_icvs;
  gx_icvs.serial_nesting_level =
      0; // probably =team->t.t_serial like in save_inter_controls
  copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
  gx_icvs.next = NULL;

  return gx_icvs;
}
3111 
// One-time setup of a root structure: initializes its locks and flags, then
// allocates the single-threaded root team and the reusable hot team for this
// root. Must be called exactly once per root (asserts !r_begin).
static void __kmp_initialize_root(kmp_root_t *root) {
  int f;
  kmp_team_t *root_team;
  kmp_team_t *hot_team;
  int hot_team_max_nth;
  kmp_r_sched_t r_sched =
      __kmp_get_schedule_global(); // get current state of scheduling globals
  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
  KMP_DEBUG_ASSERT(root);
  KMP_ASSERT(!root->r.r_begin); // root must not already be initialized

  /* setup the root state structure */
  __kmp_init_lock(&root->r.r_begin_lock);
  root->r.r_begin = FALSE;
  root->r.r_active = FALSE;
  root->r.r_in_parallel = 0;
  root->r.r_blocktime = __kmp_dflt_blocktime;
  root->r.r_nested = __kmp_dflt_nested;

  /* setup the root team for this task */
  /* allocate the root team structure */
  KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));

  // The root team is always serialized: one thread, capacity one.
  root_team =
      __kmp_allocate_team(root,
                          1, // new_nproc
                          1, // max_nproc
#if OMPT_SUPPORT
                          0, // root parallel id
#endif
#if OMP_40_ENABLED
                          __kmp_nested_proc_bind.bind_types[0],
#endif
                          &r_icvs,
                          0 // argc
                          USE_NESTED_HOT_ARG(NULL) // master thread is unknown
                          );
#if USE_DEBUGGER
  // Non-NULL value should be assigned to make the debugger display the root
  // team.
  TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
#endif

  KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));

  root->r.r_root_team = root_team;
  root_team->t.t_control_stack_top = NULL;

  /* initialize root team */
  root_team->t.t_threads[0] = NULL;
  root_team->t.t_nproc = 1;
  root_team->t.t_serialized = 1;
  // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
  root_team->t.t_sched.r_sched_type = r_sched.r_sched_type;
  root_team->t.t_sched.chunk = r_sched.chunk;
  KA_TRACE(
      20,
      ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
       root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));

  /* setup the  hot team for this task */
  /* allocate the hot team structure */
  KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));

  // The hot team is sized for reuse by parallel regions; capacity leaves
  // headroom (twice the default team-size upper bound).
  hot_team =
      __kmp_allocate_team(root,
                          1, // new_nproc
                          __kmp_dflt_team_nth_ub * 2, // max_nproc
#if OMPT_SUPPORT
                          0, // root parallel id
#endif
#if OMP_40_ENABLED
                          __kmp_nested_proc_bind.bind_types[0],
#endif
                          &r_icvs,
                          0 // argc
                          USE_NESTED_HOT_ARG(NULL) // master thread is unknown
                          );
  KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));

  root->r.r_hot_team = hot_team;
  // NOTE(review): this resets root_team's stack top a second time; hot_team's
  // own t_control_stack_top is not touched here — confirm this is intended.
  root_team->t.t_control_stack_top = NULL;

  /* first-time initialization */
  hot_team->t.t_parent = root_team;

  /* initialize hot team */
  hot_team_max_nth = hot_team->t.t_max_nproc;
  for (f = 0; f < hot_team_max_nth; ++f) {
    hot_team->t.t_threads[f] = NULL;
  }; // for
  hot_team->t.t_nproc = 1;
  // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
  hot_team->t.t_sched.r_sched_type = r_sched.r_sched_type;
  hot_team->t.t_sched.chunk = r_sched.chunk;
  hot_team->t.t_size_changed = 0;
}
3209 
3210 #ifdef KMP_DEBUG
3211 
// Singly-linked accumulator of distinct teams used by __kmp_print_structure.
// The list is kept sorted by ascending team id and terminates with a sentinel
// item whose entry and next are both NULL.
typedef struct kmp_team_list_item {
  kmp_team_p const *entry;
  struct kmp_team_list_item *next;
} kmp_team_list_item_t;
typedef kmp_team_list_item_t *kmp_team_list_t;
3217 
3218 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3219     kmp_team_list_t list, // List of teams.
3220     kmp_team_p const *team // Team to add.
3221     ) {
3222 
3223   // List must terminate with item where both entry and next are NULL.
3224   // Team is added to the list only once.
3225   // List is sorted in ascending order by team id.
3226   // Team id is *not* a key.
3227 
3228   kmp_team_list_t l;
3229 
3230   KMP_DEBUG_ASSERT(list != NULL);
3231   if (team == NULL) {
3232     return;
3233   }; // if
3234 
3235   __kmp_print_structure_team_accum(list, team->t.t_parent);
3236   __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3237 
3238   // Search list for the team.
3239   l = list;
3240   while (l->next != NULL && l->entry != team) {
3241     l = l->next;
3242   }; // while
3243   if (l->next != NULL) {
3244     return; // Team has been added before, exit.
3245   }; // if
3246 
3247   // Team is not found. Search list again for insertion point.
3248   l = list;
3249   while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3250     l = l->next;
3251   }; // while
3252 
3253   // Insert team.
3254   {
3255     kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3256         sizeof(kmp_team_list_item_t));
3257     *item = *l;
3258     l->entry = team;
3259     l->next = item;
3260   }
3261 }
3262 
3263 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team
3264 
3265                                        ) {
3266   __kmp_printf("%s", title);
3267   if (team != NULL) {
3268     __kmp_printf("%2x %p\n", team->t.t_id, team);
3269   } else {
3270     __kmp_printf(" - (nil)\n");
3271   }; // if
3272 }
3273 
3274 static void __kmp_print_structure_thread(char const *title,
3275                                          kmp_info_p const *thread) {
3276   __kmp_printf("%s", title);
3277   if (thread != NULL) {
3278     __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3279   } else {
3280     __kmp_printf(" - (nil)\n");
3281   }; // if
3282 }
3283 
// Debug-only dump of the runtime's global structure: the global thread table,
// the __kmp_threads and __kmp_root arrays, every team reachable from them
// (accumulated via __kmp_print_structure_team_accum), and the thread/team
// pools. Output goes through __kmp_printf.
void __kmp_print_structure(void) {

  kmp_team_list_t list;

  // Initialize list of teams; starts as just the NULL/NULL sentinel item.
  list =
      (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
  list->entry = NULL;
  list->next = NULL;

  __kmp_printf("\n------------------------------\nGlobal Thread "
               "Table\n------------------------------\n");
  {
    int gtid;
    for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
      __kmp_printf("%2d", gtid);
      if (__kmp_threads != NULL) {
        __kmp_printf(" %p", __kmp_threads[gtid]);
      }; // if
      if (__kmp_root != NULL) {
        __kmp_printf(" %p", __kmp_root[gtid]);
      }; // if
      __kmp_printf("\n");
    }; // for gtid
  }

  // Print out __kmp_threads array; teams encountered are accumulated into
  // `list` for the Teams section below.
  __kmp_printf("\n------------------------------\nThreads\n--------------------"
               "----------\n");
  if (__kmp_threads != NULL) {
    int gtid;
    for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
      kmp_info_t const *thread = __kmp_threads[gtid];
      if (thread != NULL) {
        __kmp_printf("GTID %2d %p:\n", gtid, thread);
        __kmp_printf("    Our Root:        %p\n", thread->th.th_root);
        __kmp_print_structure_team("    Our Team:     ", thread->th.th_team);
        __kmp_print_structure_team("    Serial Team:  ",
                                   thread->th.th_serial_team);
        __kmp_printf("    Threads:      %2d\n", thread->th.th_team_nproc);
        __kmp_print_structure_thread("    Master:       ",
                                     thread->th.th_team_master);
        __kmp_printf("    Serialized?:  %2d\n", thread->th.th_team_serialized);
        __kmp_printf("    Set NProc:    %2d\n", thread->th.th_set_nproc);
#if OMP_40_ENABLED
        __kmp_printf("    Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
#endif
        __kmp_print_structure_thread("    Next in pool: ",
                                     thread->th.th_next_pool);
        __kmp_printf("\n");
        __kmp_print_structure_team_accum(list, thread->th.th_team);
        __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
      }; // if
    }; // for gtid
  } else {
    __kmp_printf("Threads array is not allocated.\n");
  }; // if

  // Print out __kmp_root array; root and hot teams are accumulated too.
  __kmp_printf("\n------------------------------\nUbers\n----------------------"
               "--------\n");
  if (__kmp_root != NULL) {
    int gtid;
    for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
      kmp_root_t const *root = __kmp_root[gtid];
      if (root != NULL) {
        __kmp_printf("GTID %2d %p:\n", gtid, root);
        __kmp_print_structure_team("    Root Team:    ", root->r.r_root_team);
        __kmp_print_structure_team("    Hot Team:     ", root->r.r_hot_team);
        __kmp_print_structure_thread("    Uber Thread:  ",
                                     root->r.r_uber_thread);
        __kmp_printf("    Active?:      %2d\n", root->r.r_active);
        __kmp_printf("    Nested?:      %2d\n", root->r.r_nested);
        __kmp_printf("    In Parallel:  %2d\n", root->r.r_in_parallel);
        __kmp_printf("\n");
        __kmp_print_structure_team_accum(list, root->r.r_root_team);
        __kmp_print_structure_team_accum(list, root->r.r_hot_team);
      }; // if
    }; // for gtid
  } else {
    __kmp_printf("Ubers array is not allocated.\n");
  }; // if

  // Walk the accumulated team list (stops at the sentinel, whose next is
  // NULL) and dump each team.
  __kmp_printf("\n------------------------------\nTeams\n----------------------"
               "--------\n");
  while (list->next != NULL) {
    kmp_team_p const *team = list->entry;
    int i;
    __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
    __kmp_print_structure_team("    Parent Team:      ", team->t.t_parent);
    __kmp_printf("    Master TID:       %2d\n", team->t.t_master_tid);
    __kmp_printf("    Max threads:      %2d\n", team->t.t_max_nproc);
    __kmp_printf("    Levels of serial: %2d\n", team->t.t_serialized);
    __kmp_printf("    Number threads:   %2d\n", team->t.t_nproc);
    for (i = 0; i < team->t.t_nproc; ++i) {
      __kmp_printf("    Thread %2d:      ", i);
      __kmp_print_structure_thread("", team->t.t_threads[i]);
    }; // for i
    __kmp_print_structure_team("    Next in pool:     ", team->t.t_next_pool);
    __kmp_printf("\n");
    list = list->next;
  }; // while

  // Print out __kmp_thread_pool and __kmp_team_pool.
  __kmp_printf("\n------------------------------\nPools\n----------------------"
               "--------\n");
  __kmp_print_structure_thread("Thread pool:          ",
                               (kmp_info_t *)__kmp_thread_pool);
  __kmp_print_structure_team("Team pool:            ",
                             (kmp_team_t *)__kmp_team_pool);
  __kmp_printf("\n");

  // Free team list. Note `list` was advanced above, so this frees only the
  // remaining items from the current position to the end (including the
  // sentinel).
  while (list != NULL) {
    kmp_team_list_item_t *item = list;
    list = list->next;
    KMP_INTERNAL_FREE(item);
  }; // while
}
3403 
3404 #endif
3405 
3406 //---------------------------------------------------------------------------
3407 //  Stuff for per-thread fast random number generator
3408 //  Table of primes
// Multiplier table for the per-thread linear congruential generator; an entry
// is chosen per thread by seed % (table size) in __kmp_init_random.
static const unsigned __kmp_primes[] = {
    0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
    0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
    0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
    0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
    0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
    0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
    0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
    0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
    0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
    0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
    0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f}; // 64 entries
3421 
3422 //---------------------------------------------------------------------------
3423 //  __kmp_get_random: Get a random number using a linear congruential method.
3424 unsigned short __kmp_get_random(kmp_info_t *thread) {
3425   unsigned x = thread->th.th_x;
3426   unsigned short r = x >> 16;
3427 
3428   thread->th.th_x = x * thread->th.th_a + 1;
3429 
3430   KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3431                 thread->th.th_info.ds.ds_tid, r));
3432 
3433   return r;
3434 }
3435 //--------------------------------------------------------
3436 // __kmp_init_random: Initialize a random number generator
3437 void __kmp_init_random(kmp_info_t *thread) {
3438   unsigned seed = thread->th.th_info.ds.ds_tid;
3439 
3440   thread->th.th_a =
3441       __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3442   thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3443   KA_TRACE(30,
3444            ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3445 }
3446 
3447 #if KMP_OS_WINDOWS
3448 /* reclaim array entries for root threads that are already dead, returns number
3449  * reclaimed */
3450 static int __kmp_reclaim_dead_roots(void) {
3451   int i, r = 0;
3452 
3453   for (i = 0; i < __kmp_threads_capacity; ++i) {
3454     if (KMP_UBER_GTID(i) &&
3455         !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3456         !__kmp_root[i]
3457              ->r.r_active) { // AC: reclaim only roots died in non-active state
3458       r += __kmp_unregister_root_other_thread(i);
3459     }
3460   }
3461   return r;
3462 }
3463 #endif
3464 
3465 /* This function attempts to create free entries in __kmp_threads and
3466    __kmp_root, and returns the number of free entries generated.
3467 
3468    For Windows* OS static library, the first mechanism used is to reclaim array
3469    entries for root threads that are already dead.
3470 
3471    On all platforms, expansion is attempted on the arrays __kmp_threads_ and
3472    __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3473    capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3474    threadprivate cache array has been created. Synchronization with
3475    __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3476 
3477    After any dead root reclamation, if the clipping value allows array expansion
3478    to result in the generation of a total of nWish free slots, the function does
3479    that expansion. If not, but the clipping value allows array expansion to
3480    result in the generation of a total of nNeed free slots, the function does
3481    that expansion. Otherwise, nothing is done beyond the possible initial root
3482    thread reclamation. However, if nNeed is zero, a best-effort attempt is made
3483    to fulfil nWish as far as possible, i.e. the function will attempt to create
3484    as many free slots as possible up to nWish.
3485 
3486    If any argument is negative, the behavior is undefined. */
// Grow the __kmp_threads/__kmp_root arrays (see the contract comment above).
// Returns the number of free entries created. Uses a double-checked retry
// loop against __kmpc_threadprivate_cached, synchronized via
// __kmp_tp_cached_lock.
static int __kmp_expand_threads(int nWish, int nNeed) {
  int added = 0;
  int old_tp_cached;
  int __kmp_actual_max_nth;

  if (nNeed > nWish) /* normalize the arguments */
    nWish = nNeed;
#if KMP_OS_WINDOWS && !defined KMP_DYNAMIC_LIB
  /* only for Windows static library */
  /* reclaim array entries for root threads that are already dead */
  added = __kmp_reclaim_dead_roots();

  // Reclaimed slots count toward both targets.
  if (nNeed) {
    nNeed -= added;
    if (nNeed < 0)
      nNeed = 0;
  }
  if (nWish) {
    nWish -= added;
    if (nWish < 0)
      nWish = 0;
  }
#endif
  if (nWish <= 0)
    return added;

  while (1) {
    int nTarget;
    int minimumRequiredCapacity;
    int newCapacity;
    kmp_info_t **newThreads;
    kmp_root_t **newRoot;

    // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
    // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
    // user via OMP_THREAD_LIMIT, then __kmp_threads_capacity may become
    // > __kmp_max_nth in one of two ways:
    //
    // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
    //    may not be reused by another thread, so we may need to increase
    //    __kmp_threads_capacity to __kmp_max_threads + 1.
    //
    // 2) New foreign root(s) are encountered.  We always register new foreign
    //    roots. This may cause a smaller # of threads to be allocated at
    //    subsequent parallel regions, but the worker threads hang around (and
    //    eventually go to sleep) and need slots in the __kmp_threads[] array.
    //
    // Anyway, that is the reason for moving the check to see if
    // __kmp_max_threads was exceeded into __kmp_reserve_threads()
    // instead of having it performed here. -BB
    old_tp_cached = __kmp_tp_cached;
    __kmp_actual_max_nth =
        old_tp_cached ? __kmp_tp_capacity : __kmp_sys_max_nth;
    KMP_DEBUG_ASSERT(__kmp_actual_max_nth >= __kmp_threads_capacity);

    /* compute expansion headroom to check if we can expand and whether to aim
       for nWish or nNeed */
    nTarget = nWish;
    if (__kmp_actual_max_nth - __kmp_threads_capacity < nTarget) {
      /* can't fulfil nWish, so try nNeed */
      if (nNeed) {
        nTarget = nNeed;
        if (__kmp_actual_max_nth - __kmp_threads_capacity < nTarget) {
          /* possible expansion too small -- give up */
          break;
        }
      } else {
        /* best-effort */
        nTarget = __kmp_actual_max_nth - __kmp_threads_capacity;
        if (!nTarget) {
          /* can't expand at all -- give up */
          break;
        }
      }
    }
    minimumRequiredCapacity = __kmp_threads_capacity + nTarget;

    // Double the capacity until it meets the requirement, clipped to the
    // actual maximum.
    newCapacity = __kmp_threads_capacity;
    do {
      newCapacity = newCapacity <= (__kmp_actual_max_nth >> 1)
                        ? (newCapacity << 1)
                        : __kmp_actual_max_nth;
    } while (newCapacity < minimumRequiredCapacity);
    // Both arrays live in a single allocation: threads first, roots after.
    newThreads = (kmp_info_t **)__kmp_allocate(
        (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity +
        CACHE_LINE);
    newRoot = (kmp_root_t **)((char *)newThreads +
                              sizeof(kmp_info_t *) * newCapacity);
    KMP_MEMCPY(newThreads, __kmp_threads,
               __kmp_threads_capacity * sizeof(kmp_info_t *));
    KMP_MEMCPY(newRoot, __kmp_root,
               __kmp_threads_capacity * sizeof(kmp_root_t *));
    memset(newThreads + __kmp_threads_capacity, 0,
           (newCapacity - __kmp_threads_capacity) * sizeof(kmp_info_t *));
    memset(newRoot + __kmp_threads_capacity, 0,
           (newCapacity - __kmp_threads_capacity) * sizeof(kmp_root_t *));

    if (!old_tp_cached && __kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
      /* __kmp_tp_cached has changed, i.e. __kmpc_threadprivate_cached has
         allocated a threadprivate cache while we were allocating the expanded
         array, and our new capacity is larger than the threadprivate cache
         capacity, so we should deallocate the expanded arrays and try again.
         This is the first check of a double-check pair. */
      __kmp_free(newThreads);
      continue; /* start over and try again */
    }
    __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
    if (!old_tp_cached && __kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
      /* Same check as above, but this time with the lock so we can be sure if
         we can succeed. */
      __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
      __kmp_free(newThreads);
      continue; /* start over and try again */
    } else {
      /* success */
      // __kmp_free( __kmp_threads ); // ATT: It leads to crash. Need to be
      // investigated.
      // Publish the new arrays through volatile stores so concurrent readers
      // see consistent pointers; the old allocation is deliberately leaked
      // (see the comment above).
      *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
      *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
      added += newCapacity - __kmp_threads_capacity;
      *(volatile int *)&__kmp_threads_capacity = newCapacity;
      __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
      break; /* succeeded, so we can exit the loop */
    }
  }
  return added;
}
3614 
3615 /* Register the current thread as a root thread and obtain our gtid. We must
3616    have the __kmp_initz_lock held at this point. Argument TRUE only if are the
3617    thread that calls from __kmp_do_serial_initialize() */
// Register the calling thread as a root thread: find (or create) a slot in
// __kmp_threads, set up the root structure, root/hot/serial teams, and the
// root kmp_info_t, then return the assigned gtid. Caller must hold
// __kmp_initz_lock; this function takes __kmp_forkjoin_lock itself.
// `initial_thread` is TRUE only for the thread doing serial initialization
// (which is allowed to claim slot 0).
int __kmp_register_root(int initial_thread) {
  kmp_info_t *root_thread;
  kmp_root_t *root;
  int gtid;
  int capacity;
  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
  KA_TRACE(20, ("__kmp_register_root: entered\n"));
  KMP_MB();

  /* 2007-03-02:
     If initial thread did not invoke OpenMP RTL yet, and this thread is not an
     initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not
     work as expected -- it may return false (that means there is at least one
     empty slot in __kmp_threads array), but it is possible the only free slot
     is #0, which is reserved for initial thread and so cannot be used for this
     one. Following code workarounds this bug.

     However, right solution seems to be not reserving slot #0 for initial
     thread because:
     (1) there is no magic in slot #0,
     (2) we cannot detect initial thread reliably (the first thread which does
        serial initialization may be not a real initial thread).
  */
  capacity = __kmp_threads_capacity;
  if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
    --capacity;
  }; // if

  /* see if there are too many threads */
  // __kmp_msg with kmp_ms_fatal does not return here.
  if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1, 1)) {
    if (__kmp_tp_cached) {
      __kmp_msg(kmp_ms_fatal, KMP_MSG(CantRegisterNewThread),
                KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
                KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
    } else {
      __kmp_msg(kmp_ms_fatal, KMP_MSG(CantRegisterNewThread),
                KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
    }
  }; // if

  /* find an available thread slot */
  /* Don't reassign the zero slot since we need that to only be used by initial
     thread */
  for (gtid = (initial_thread ? 0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL;
       gtid++)
    ;
  KA_TRACE(1,
           ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
  KMP_ASSERT(gtid < __kmp_threads_capacity);

  /* update global accounting */
  __kmp_all_nth++;
  TCW_4(__kmp_nth, __kmp_nth + 1);

  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
  // numbers of procs, and method #2 (keyed API call) for higher numbers.
  if (__kmp_adjust_gtid_mode) {
    if (__kmp_all_nth >= __kmp_tls_gtid_min) {
      if (TCR_4(__kmp_gtid_mode) != 2) {
        TCW_4(__kmp_gtid_mode, 2);
      }
    } else {
      if (TCR_4(__kmp_gtid_mode) != 1) {
        TCW_4(__kmp_gtid_mode, 1);
      }
    }
  }

#ifdef KMP_ADJUST_BLOCKTIME
  /* Adjust blocktime to zero if necessary            */
  /* Middle initialization might not have occurred yet */
  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
    if (__kmp_nth > __kmp_avail_proc) {
      __kmp_zero_bt = TRUE;
    }
  }
#endif /* KMP_ADJUST_BLOCKTIME */

  /* setup this new hierarchy */
  // Allocate the root structure for this slot on first use; it is reused on
  // subsequent registrations at the same gtid.
  if (!(root = __kmp_root[gtid])) {
    root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
    KMP_DEBUG_ASSERT(!root->r.r_root_team);
  }

#if KMP_STATS_ENABLED
  // Initialize stats as soon as possible (right after gtid assignment).
  __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
  KMP_START_EXPLICIT_TIMER(OMP_worker_thread_life);
  KMP_SET_THREAD_STATE(SERIAL_REGION);
  KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
#endif
  __kmp_initialize_root(root);

  /* setup new root thread structure */
  // Reuse the uber thread if this root was registered before; otherwise
  // allocate and initialize a fresh kmp_info_t.
  if (root->r.r_uber_thread) {
    root_thread = root->r.r_uber_thread;
  } else {
    root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
    if (__kmp_storage_map) {
      __kmp_print_thread_storage_map(root_thread, gtid);
    }
    root_thread->th.th_info.ds.ds_gtid = gtid;
    root_thread->th.th_root = root;
    if (__kmp_env_consistency_check) {
      root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
    }
#if USE_FAST_MEMORY
    __kmp_initialize_fast_memory(root_thread);
#endif /* USE_FAST_MEMORY */

#if KMP_USE_BGET
    KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
    __kmp_initialize_bget(root_thread);
#endif
    __kmp_init_random(root_thread); // Initialize random number generator
  }

  /* setup the serial team held in reserve by the root thread */
  if (!root_thread->th.th_serial_team) {
    kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
    KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
    root_thread->th.th_serial_team =
        __kmp_allocate_team(root, 1, 1,
#if OMPT_SUPPORT
                            0, // root parallel id
#endif
#if OMP_40_ENABLED
                            proc_bind_default,
#endif
                            &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
  }
  KMP_ASSERT(root_thread->th.th_serial_team);
  KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
                root_thread->th.th_serial_team));

  /* drop root_thread into place */
  TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);

  root->r.r_root_team->t.t_threads[0] = root_thread;
  root->r.r_hot_team->t.t_threads[0] = root_thread;
  root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
  // AC: the team created in reserve, not for execution (it is unused for now).
  root_thread->th.th_serial_team->t.t_serialized = 0;
  root->r.r_uber_thread = root_thread;

  /* initialize the thread, get it ready to go */
  __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
  TCW_4(__kmp_init_gtid, TRUE);

  /* prepare the master thread for get_gtid() */
  __kmp_gtid_set_specific(gtid);

#if USE_ITT_BUILD
  __kmp_itt_thread_name(gtid);
#endif /* USE_ITT_BUILD */

#ifdef KMP_TDATA_GTID
  __kmp_gtid = gtid;
#endif
  __kmp_create_worker(gtid, root_thread, __kmp_stksize);
  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);

  KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
                "plain=%u\n",
                gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
                root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
                KMP_INIT_BARRIER_STATE));
  { // Initialize barrier data.
    int b;
    for (b = 0; b < bs_last_barrier; ++b) {
      root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
#if USE_DEBUGGER
      root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
#endif
    }; // for
  }
  KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
                   KMP_INIT_BARRIER_STATE);

#if KMP_AFFINITY_SUPPORTED
#if OMP_40_ENABLED
  root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
  root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
  root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
  root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
#endif

  // Affinity masks can only be applied once middle initialization has run.
  if (TCR_4(__kmp_init_middle)) {
    __kmp_affinity_set_init_mask(gtid, TRUE);
  }
#endif /* KMP_AFFINITY_SUPPORTED */

  __kmp_root_counter++;

  KMP_MB();
  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);

  return gtid;
}
3817 
3818 #if KMP_NESTED_HOT_TEAMS
// Recursively free the nested hot teams hanging off `thr` starting at
// `level`, and return the number of threads released (the master at each
// level is not counted, since it is not freed here).
static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
                                const int max_level) {
  int i, n, nth;
  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
  if (!hot_teams || !hot_teams[level].hot_team) {
    return 0; // nothing recorded at this nesting level
  }
  KMP_DEBUG_ASSERT(level < max_level);
  kmp_team_t *team = hot_teams[level].hot_team;
  nth = hot_teams[level].hot_team_nth;
  n = nth - 1; // master is not freed
  if (level < max_level - 1) {
    // Descend into each worker's own nested hot teams before freeing this
    // team; free the workers' hot-team arrays too (but not the master's,
    // i == 0, which is owned one level up).
    for (i = 0; i < nth; ++i) {
      kmp_info_t *th = team->t.t_threads[i];
      n += __kmp_free_hot_teams(root, th, level + 1, max_level);
      if (i > 0 && th->th.th_hot_teams) {
        __kmp_free(th->th.th_hot_teams);
        th->th.th_hot_teams = NULL;
      }
    }
  }
  __kmp_free_team(root, team, NULL);
  return n;
}
3843 #endif
3844 
3845 // Resets a root thread and clear its root and hot teams.
3846 // Returns the number of __kmp_threads entries directly and indirectly freed.
// Resets a root thread and clear its root and hot teams.
// Returns the number of __kmp_threads entries directly and indirectly freed.
static int __kmp_reset_root(int gtid, kmp_root_t *root) {
  kmp_team_t *root_team = root->r.r_root_team;
  kmp_team_t *hot_team = root->r.r_hot_team;
  int n = hot_team->t.t_nproc;
  int i;

  KMP_DEBUG_ASSERT(!root->r.r_active); // root must be quiescent

  root->r.r_root_team = NULL;
  root->r.r_hot_team = NULL;
  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
  // before call to __kmp_free_team().
  __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
#if KMP_NESTED_HOT_TEAMS
  if (__kmp_hot_teams_max_level >
      0) { // need to free nested hot teams and their threads if any
    for (i = 0; i < hot_team->t.t_nproc; ++i) {
      kmp_info_t *th = hot_team->t.t_threads[i];
      if (__kmp_hot_teams_max_level > 1) {
        n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
      }
      if (th->th.th_hot_teams) {
        __kmp_free(th->th.th_hot_teams);
        th->th.th_hot_teams = NULL;
      }
    }
  }
#endif
  __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));

  // Before we can reap the thread, we need to make certain that all other
  // threads in the teams that had this root as ancestor have stopped trying to
  // steal tasks.
  if (__kmp_tasking_mode != tskm_immediate_exec) {
    __kmp_wait_to_unref_task_teams();
  }

#if KMP_OS_WINDOWS
  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
  KA_TRACE(
      10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
           "\n",
           (LPVOID) & (root->r.r_uber_thread->th),
           root->r.r_uber_thread->th.th_info.ds.ds_thread));
  __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
#endif /* KMP_OS_WINDOWS */

#if OMPT_SUPPORT
  if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_thread_end)) {
    int gtid = __kmp_get_gtid();
    __ompt_thread_end(ompt_thread_initial, gtid);
  }
#endif

  TCW_4(__kmp_nth,
        __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
  __kmp_reap_thread(root->r.r_uber_thread, 1);

  // We cannot put root thread to __kmp_thread_pool, so we have to reap it
  // instead of freeing.
  root->r.r_uber_thread = NULL;
  /* mark root as no longer in use */
  root->r.r_begin = FALSE;

  return n;
}
3913 
// Unregister the calling root thread: wait out pending proxy tasks (OMP 4.5),
// reset the root structure, and release the thread-specific gtid. No-op if
// the runtime is already shut down.
void __kmp_unregister_root_current_thread(int gtid) {
  KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
  /* this lock should be ok, since unregister_root_current_thread is never
     called during an abort, only during a normal close. furthermore, if you
     have the forkjoin lock, you should never try to get the initz lock */
  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
    KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
                  "exiting T#%d\n",
                  gtid));
    __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
    return;
  }
  kmp_root_t *root = __kmp_root[gtid];

  // Sanity: the slot must hold a live root thread that owns this root.
  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
  KMP_ASSERT(KMP_UBER_GTID(gtid));
  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
  KMP_ASSERT(root->r.r_active == FALSE);

  KMP_MB();

#if OMP_45_ENABLED
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_team_t *team = thread->th.th_team;
  kmp_task_team_t *task_team = thread->th.th_task_team;

  // we need to wait for the proxy tasks before finishing the thread
  if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
#if OMPT_SUPPORT
    // the runtime is shutting down so we won't report any events
    thread->th.ompt_thread_info.state = ompt_state_undefined;
#endif
    __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
  }
#endif

  __kmp_reset_root(gtid, root);

  /* free up this thread slot */
  __kmp_gtid_set_specific(KMP_GTID_DNE);
#ifdef KMP_TDATA_GTID
  __kmp_gtid = KMP_GTID_DNE;
#endif

  KMP_MB();
  KC_TRACE(10,
           ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));

  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
}
3965 
3966 #if KMP_OS_WINDOWS
3967 /* __kmp_forkjoin_lock must be already held
3968    Unregisters a root thread that is not the current thread.  Returns the number
3969    of __kmp_threads entries freed as a result. */
3970 static int __kmp_unregister_root_other_thread(int gtid) {
3971   kmp_root_t *root = __kmp_root[gtid];
3972   int r;
3973 
3974   KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
3975   KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3976   KMP_ASSERT(KMP_UBER_GTID(gtid));
3977   KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3978   KMP_ASSERT(root->r.r_active == FALSE);
3979 
3980   r = __kmp_reset_root(gtid, root);
3981   KC_TRACE(10,
3982            ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
3983   return r;
3984 }
3985 #endif
3986 
3987 #if KMP_DEBUG
3988 void __kmp_task_info() {
3989 
3990   kmp_int32 gtid = __kmp_entry_gtid();
3991   kmp_int32 tid = __kmp_tid_from_gtid(gtid);
3992   kmp_info_t *this_thr = __kmp_threads[gtid];
3993   kmp_team_t *steam = this_thr->th.th_serial_team;
3994   kmp_team_t *team = this_thr->th.th_team;
3995 
3996   __kmp_printf("__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p curtask=%p "
3997                "ptask=%p\n",
3998                gtid, tid, this_thr, team, this_thr->th.th_current_task,
3999                team->t.t_implicit_task_taskdata[tid].td_parent);
4000 }
4001 #endif // KMP_DEBUG
4002 
/* TODO optimize with one big memclr, take out what isn't needed, split
   responsibility to workers as much as possible, and delay initialization of
   features as much as possible  */
// Bind thread this_thr into slot tid of team: cache team fields in the
// thread, set up its implicit task, private common table, dynamic-dispatch
// buffers and task-state stack. Used both for threads reused from the pool
// and for freshly forked ones (see __kmp_allocate_thread).
static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
                                  int tid, int gtid) {
  /* this_thr->th.th_info.ds.ds_gtid is setup in
     kmp_allocate_thread/create_worker.
     this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
  kmp_info_t *master = team->t.t_threads[0];
  KMP_DEBUG_ASSERT(this_thr != NULL);
  KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
  KMP_DEBUG_ASSERT(team);
  KMP_DEBUG_ASSERT(team->t.t_threads);
  KMP_DEBUG_ASSERT(team->t.t_dispatch);
  KMP_DEBUG_ASSERT(master);
  KMP_DEBUG_ASSERT(master->th.th_root);

  KMP_MB();

  TCW_SYNC_PTR(this_thr->th.th_team, team);

  this_thr->th.th_info.ds.ds_tid = tid;
  this_thr->th.th_set_nproc = 0;
  if (__kmp_tasking_mode != tskm_immediate_exec)
    // When tasking is possible, threads are not safe to reap until they are
    // done tasking; this will be set when tasking code is exited in wait
    this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
  else // no tasking --> always safe to reap
    this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
#if OMP_40_ENABLED
  this_thr->th.th_set_proc_bind = proc_bind_default;
#if KMP_AFFINITY_SUPPORTED
  this_thr->th.th_new_place = this_thr->th.th_current_place;
#endif
#endif
  this_thr->th.th_root = master->th.th_root;

  /* setup the thread's cache of the team structure */
  this_thr->th.th_team_nproc = team->t.t_nproc;
  this_thr->th.th_team_master = master;
  this_thr->th.th_team_serialized = team->t.t_serialized;
  TCW_PTR(this_thr->th.th_sleep_loc, NULL);

  KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);

  KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
                tid, gtid, this_thr, this_thr->th.th_current_task));

  __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
                           team, tid, TRUE);

  KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
                tid, gtid, this_thr, this_thr->th.th_current_task));
  // TODO: Initialize ICVs from parent; GEH - isn't that already done in
  // __kmp_initialize_team()?

  /* TODO no worksharing in speculative threads */
  this_thr->th.th_dispatch = &team->t.t_dispatch[tid];

  this_thr->th.th_local.this_construct = 0;

#ifdef BUILD_TV
  this_thr->th.th_local.tv_data = 0;
#endif

  // Lazily allocate the per-thread private common table (threadprivate data)
  // the first time this thread is initialized.
  if (!this_thr->th.th_pri_common) {
    this_thr->th.th_pri_common =
        (struct common_table *)__kmp_allocate(sizeof(struct common_table));
    if (__kmp_storage_map) {
      __kmp_print_storage_map_gtid(
          gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
          sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
    }; // if
    this_thr->th.th_pri_head = NULL;
  }; // if

  /* Initialize dynamic dispatch */
  {
    volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
    // Use team max_nproc since this will never change for the team.
    size_t disp_size =
        sizeof(dispatch_private_info_t) *
        (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
    KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
                  team->t.t_max_nproc));
    KMP_ASSERT(dispatch);
    KMP_DEBUG_ASSERT(team->t.t_dispatch);
    KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);

    dispatch->th_disp_index = 0;
#if OMP_45_ENABLED
    dispatch->th_doacross_buf_idx = 0;
#endif
    // Allocate the dispatch buffer on first use; on reuse just zero it.
    if (!dispatch->th_disp_buffer) {
      dispatch->th_disp_buffer =
          (dispatch_private_info_t *)__kmp_allocate(disp_size);

      if (__kmp_storage_map) {
        __kmp_print_storage_map_gtid(
            gtid, &dispatch->th_disp_buffer[0],
            &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
                                          ? 1
                                          : __kmp_dispatch_num_buffers],
            disp_size, "th_%d.th_dispatch.th_disp_buffer "
                       "(team_%d.t_dispatch[%d].th_disp_buffer)",
            gtid, team->t.t_id, gtid);
      }
    } else {
      memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
    }

    dispatch->th_dispatch_pr_current = 0;
    dispatch->th_dispatch_sh_current = 0;

    dispatch->th_deo_fcn = 0; /* ORDERED     */
    dispatch->th_dxo_fcn = 0; /* END ORDERED */
  }

  this_thr->th.th_next_pool = NULL;

  // Lazily allocate and zero the task-state memo stack (initial depth 4)
  // the first time this thread is initialized.
  if (!this_thr->th.th_task_state_memo_stack) {
    size_t i;
    this_thr->th.th_task_state_memo_stack =
        (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
    this_thr->th.th_task_state_top = 0;
    this_thr->th.th_task_state_stack_sz = 4;
    for (i = 0; i < this_thr->th.th_task_state_stack_sz;
         ++i) // zero init the stack
      this_thr->th.th_task_state_memo_stack[i] = 0;
  }

  KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
  KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);

  KMP_MB();
}
4139 
/* allocate a new thread for the requesting team. this is only called from
   within a forkjoin critical section. we will first try to get an available
   thread from the thread pool. if none is available, we will fork a new one
   assuming we are able to create a new one. this should be assured, as the
   caller should check on this first. */
kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
                                  int new_tid) {
  kmp_team_t *serial_team;
  kmp_info_t *new_thr;
  int new_gtid;

  KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
  KMP_DEBUG_ASSERT(root && team);
#if !KMP_NESTED_HOT_TEAMS
  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
#endif
  KMP_MB();

  /* first, try to get one from the thread pool */
  if (__kmp_thread_pool) {

    // Pop the head of the pool and repair the insertion point if it pointed
    // at the thread we just took.
    new_thr = (kmp_info_t *)__kmp_thread_pool;
    __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
    if (new_thr == __kmp_thread_pool_insert_pt) {
      __kmp_thread_pool_insert_pt = NULL;
    }
    TCW_4(new_thr->th.th_in_pool, FALSE);
    // Don't touch th_active_in_pool or th_active.
    // The worker thread adjusts those flags as it sleeps/awakens.
    __kmp_thread_pool_nth--;

    KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
                  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
    KMP_ASSERT(!new_thr->th.th_team);
    KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
    KMP_DEBUG_ASSERT(__kmp_thread_pool_nth >= 0);

    /* setup the thread structure */
    __kmp_initialize_info(new_thr, team, new_tid,
                          new_thr->th.th_info.ds.ds_gtid);
    KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);

    TCW_4(__kmp_nth, __kmp_nth + 1);

    // Reset the per-thread task-state stack for the new team assignment.
    new_thr->th.th_task_state = 0;
    new_thr->th.th_task_state_top = 0;
    new_thr->th.th_task_state_stack_sz = 4;

#ifdef KMP_ADJUST_BLOCKTIME
    /* Adjust blocktime back to zero if necessary */
    /* Middle initialization might not have occurred yet */
    if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
      if (__kmp_nth > __kmp_avail_proc) {
        __kmp_zero_bt = TRUE;
      }
    }
#endif /* KMP_ADJUST_BLOCKTIME */

#if KMP_DEBUG
    // If thread entered pool via __kmp_free_thread, wait_flag should !=
    // KMP_BARRIER_PARENT_FLAG.
    int b;
    kmp_balign_t *balign = new_thr->th.th_bar;
    for (b = 0; b < bs_last_barrier; ++b)
      KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
#endif

    KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
                  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));

    KMP_MB();
    return new_thr;
  }

  /* no, we'll fork a new one */
  KMP_ASSERT(__kmp_nth == __kmp_all_nth);
  KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);

#if KMP_USE_MONITOR
  // If this is the first worker thread the RTL is creating, then also
  // launch the monitor thread.  We try to do this as early as possible.
  if (!TCR_4(__kmp_init_monitor)) {
    __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
    if (!TCR_4(__kmp_init_monitor)) {
      KF_TRACE(10, ("before __kmp_create_monitor\n"));
      TCW_4(__kmp_init_monitor, 1);
      __kmp_create_monitor(&__kmp_monitor);
      KF_TRACE(10, ("after __kmp_create_monitor\n"));
#if KMP_OS_WINDOWS
      // AC: wait until monitor has started. This is a fix for CQ232808.
      // The reason is that if the library is loaded/unloaded in a loop with
      // small (parallel) work in between, then there is high probability that
      // monitor thread started after the library shutdown. At shutdown it is
      // too late to cope with the problem, because when the master is in
      // DllMain (process detach) the monitor has no chances to start (it is
      // blocked), and master has no means to inform the monitor that the
      // library has gone, because all the memory which the monitor can access
      // is going to be released/reset.
      while (TCR_4(__kmp_init_monitor) < 2) {
        KMP_YIELD(TRUE);
      }
      KF_TRACE(10, ("after monitor thread has started\n"));
#endif
    }
    __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
  }
#endif

  KMP_MB();
  // Find the first free gtid slot; the caller guaranteed capacity, which the
  // assert re-checks. Slot 0 is reserved (loop starts at 1).
  for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) {
    KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
  }

  /* allocate space for it. */
  new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));

  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);

  if (__kmp_storage_map) {
    __kmp_print_thread_storage_map(new_thr, new_gtid);
  }

  // add the reserve serialized team, initialized from the team's master thread
  {
    kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
    KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
    new_thr->th.th_serial_team = serial_team =
        (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
#if OMPT_SUPPORT
                                          0, // root parallel id
#endif
#if OMP_40_ENABLED
                                          proc_bind_default,
#endif
                                          &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
  }
  KMP_ASSERT(serial_team);
  serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for
  // execution (it is unused for now).
  serial_team->t.t_threads[0] = new_thr;
  KF_TRACE(10,
           ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
            new_thr));

  /* setup the thread structures */
  __kmp_initialize_info(new_thr, team, new_tid, new_gtid);

#if USE_FAST_MEMORY
  __kmp_initialize_fast_memory(new_thr);
#endif /* USE_FAST_MEMORY */

#if KMP_USE_BGET
  KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
  __kmp_initialize_bget(new_thr);
#endif

  __kmp_init_random(new_thr); // Initialize random number generator

  /* Initialize these only once when thread is grabbed for a team allocation */
  KA_TRACE(20,
           ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
            __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));

  // Reset all barrier state to "not waiting" for each barrier type.
  int b;
  kmp_balign_t *balign = new_thr->th.th_bar;
  for (b = 0; b < bs_last_barrier; ++b) {
    balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
    balign[b].bb.team = NULL;
    balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
    balign[b].bb.use_oncore_barrier = 0;
  }

  new_thr->th.th_spin_here = FALSE;
  new_thr->th.th_next_waiting = 0;

#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
#endif

  TCW_4(new_thr->th.th_in_pool, FALSE);
  new_thr->th.th_active_in_pool = FALSE;
  TCW_4(new_thr->th.th_active, TRUE);

  /* adjust the global counters */
  __kmp_all_nth++;
  __kmp_nth++;

  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
  // numbers of procs, and method #2 (keyed API call) for higher numbers.
  if (__kmp_adjust_gtid_mode) {
    if (__kmp_all_nth >= __kmp_tls_gtid_min) {
      if (TCR_4(__kmp_gtid_mode) != 2) {
        TCW_4(__kmp_gtid_mode, 2);
      }
    } else {
      if (TCR_4(__kmp_gtid_mode) != 1) {
        TCW_4(__kmp_gtid_mode, 1);
      }
    }
  }

#ifdef KMP_ADJUST_BLOCKTIME
  /* Adjust blocktime back to zero if necessary       */
  /* Middle initialization might not have occurred yet */
  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
    if (__kmp_nth > __kmp_avail_proc) {
      __kmp_zero_bt = TRUE;
    }
  }
#endif /* KMP_ADJUST_BLOCKTIME */

  /* actually fork it and create the new worker thread */
  KF_TRACE(
      10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
  __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
  KF_TRACE(10,
           ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));

  KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
                new_gtid));
  KMP_MB();
  return new_thr;
}
4366 
/* Reinitialize team for reuse.
   The hot team code calls this case at every fork barrier, so EPCC barrier
   tests are extremely sensitive to changes in it, esp. writes to the team
   struct, which cause a cache invalidation in all threads.
   IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
static void __kmp_reinitialize_team(kmp_team_t *team,
                                    kmp_internal_control_t *new_icvs,
                                    ident_t *loc) {
  KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
                team->t.t_threads[0], team));
  KMP_DEBUG_ASSERT(team && new_icvs);
  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
  // KMP_CHECK_UPDATE avoids writing (and thus invalidating cache lines) when
  // the value is unchanged -- see the performance note above.
  KMP_CHECK_UPDATE(team->t.t_ident, loc);

  KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());

  // Copy ICVs to the master thread's implicit taskdata
  __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
  copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);

  KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
                team->t.t_threads[0], team));
}
4390 
/* Initialize the team data structure.
   This assumes the t_threads and t_max_nproc are already set.
   Also, we don't touch the arguments */
static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
                                  kmp_internal_control_t *new_icvs,
                                  ident_t *loc) {
  KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));

  /* verify */
  KMP_DEBUG_ASSERT(team);
  KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
  KMP_DEBUG_ASSERT(team->t.t_threads);
  KMP_MB();

  team->t.t_master_tid = 0; /* not needed */
  /* team->t.t_master_bar;        not needed */
  // A single-thread team is marked serialized.
  team->t.t_serialized = new_nproc > 1 ? 0 : 1;
  team->t.t_nproc = new_nproc;

  /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
  team->t.t_next_pool = NULL;
  /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
   * up hot team */

  TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
  team->t.t_invoke = NULL; /* not needed */

  // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
  team->t.t_sched = new_icvs->sched;

#if KMP_ARCH_X86 || KMP_ARCH_X86_64
  team->t.t_fp_control_saved = FALSE; /* not needed */
  team->t.t_x87_fpu_control_word = 0; /* not needed */
  team->t.t_mxcsr = 0; /* not needed */
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

  team->t.t_construct = 0;
  __kmp_init_lock(&team->t.t_single_lock);

  team->t.t_ordered.dt.t_value = 0;
  team->t.t_master_active = FALSE;

  memset(&team->t.t_taskq, '\0', sizeof(kmp_taskq_t));

#ifdef KMP_DEBUG
  team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
#endif
  team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */

  team->t.t_control_stack_top = NULL;

  // Finish with the reuse path, which copies ICVs and sets t_ident/t_id.
  __kmp_reinitialize_team(team, new_icvs, loc);

  KMP_MB();
  KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
}
4447 
4448 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4449 /* Sets full mask for thread and returns old mask, no changes to structures. */
4450 static void
4451 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4452   if (KMP_AFFINITY_CAPABLE()) {
4453     int status;
4454     if (old_mask != NULL) {
4455       status = __kmp_get_system_affinity(old_mask, TRUE);
4456       int error = errno;
4457       if (status != 0) {
4458         __kmp_msg(kmp_ms_fatal, KMP_MSG(ChangeThreadAffMaskError),
4459                   KMP_ERR(error), __kmp_msg_null);
4460       }
4461     }
4462     __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4463   }
4464 }
4465 #endif
4466 
4467 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4468 
// __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
// It calculates the worker + master thread's partition based upon the parent
// thread's partition, and binds each worker to a place in their partition.
// The master thread's partition should already include its current binding.
static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
  // Copy the master thread's place partition to the team struct
  kmp_info_t *master_th = team->t.t_threads[0];
  KMP_DEBUG_ASSERT(master_th != NULL);
  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
  int first_place = master_th->th.th_first_place;
  int last_place = master_th->th.th_last_place;
  int masters_place = master_th->th.th_current_place;
  team->t.t_first_place = first_place;
  team->t.t_last_place = last_place;

  KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
                "bound to place %d partition = [%d,%d]\n",
                proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
                team->t.t_id, masters_place, first_place, last_place));

  switch (proc_bind) {

  case proc_bind_default:
    // serial teams might have the proc_bind policy set to proc_bind_default. It
    // doesn't matter, as we don't rebind master thread for any proc_bind policy
    KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
    break;

  case proc_bind_master: {
    // Every worker gets the master's place and the full partition.
    int f;
    int n_th = team->t.t_nproc;
    for (f = 1; f < n_th; f++) {
      kmp_info_t *th = team->t.t_threads[f];
      KMP_DEBUG_ASSERT(th != NULL);
      th->th.th_first_place = first_place;
      th->th.th_last_place = last_place;
      th->th.th_new_place = masters_place;

      KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
                     "partition = [%d,%d]\n",
                     __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
                     f, masters_place, first_place, last_place));
    }
  } break;

  case proc_bind_close: {
    int f;
    int n_th = team->t.t_nproc;
    int n_places;
    // The partition may wrap around the end of the place list.
    if (first_place <= last_place) {
      n_places = last_place - first_place + 1;
    } else {
      n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
    }
    if (n_th <= n_places) {
      // Fewer threads than places: assign consecutive places starting after
      // the master's place, wrapping within the partition.
      int place = masters_place;
      for (f = 1; f < n_th; f++) {
        kmp_info_t *th = team->t.t_threads[f];
        KMP_DEBUG_ASSERT(th != NULL);

        if (place == last_place) {
          place = first_place;
        } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
          place = 0;
        } else {
          place++;
        }
        th->th.th_first_place = first_place;
        th->th.th_last_place = last_place;
        th->th.th_new_place = place;

        KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
                       "partition = [%d,%d]\n",
                       __kmp_gtid_from_thread(team->t.t_threads[f]),
                       team->t.t_id, f, place, first_place, last_place));
      }
    } else {
      // More threads than places: distribute S = n_th/n_places threads per
      // place, spreading the 'rem' leftover threads every 'gap' places.
      int S, rem, gap, s_count;
      S = n_th / n_places;
      s_count = 0;
      rem = n_th - (S * n_places);
      gap = rem > 0 ? n_places / rem : n_places;
      int place = masters_place;
      int gap_ct = gap;
      for (f = 0; f < n_th; f++) {
        kmp_info_t *th = team->t.t_threads[f];
        KMP_DEBUG_ASSERT(th != NULL);

        th->th.th_first_place = first_place;
        th->th.th_last_place = last_place;
        th->th.th_new_place = place;
        s_count++;

        if ((s_count == S) && rem && (gap_ct == gap)) {
          // do nothing, add an extra thread to place on next iteration
        } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
          // we added an extra thread to this place; move to next place
          if (place == last_place) {
            place = first_place;
          } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
            place = 0;
          } else {
            place++;
          }
          s_count = 0;
          gap_ct = 1;
          rem--;
        } else if (s_count == S) { // place full; don't add extra
          if (place == last_place) {
            place = first_place;
          } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
            place = 0;
          } else {
            place++;
          }
          gap_ct++;
          s_count = 0;
        }

        KA_TRACE(100,
                 ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
                  "partition = [%d,%d]\n",
                  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
                  th->th.th_new_place, first_place, last_place));
      }
      // After wrapping the whole partition we must land back on the master's
      // place.
      KMP_DEBUG_ASSERT(place == masters_place);
    }
  } break;

  case proc_bind_spread: {
    int f;
    int n_th = team->t.t_nproc;
    int n_places;
    int thidx;
    // The partition may wrap around the end of the place list.
    if (first_place <= last_place) {
      n_places = last_place - first_place + 1;
    } else {
      n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
    }
    if (n_th <= n_places) {
      // Fewer threads than places: give each thread a sub-partition of about
      // S = n_places/n_th places, distributing the remainder every 'gap'
      // threads.
      int place = masters_place;
      int S = n_places / n_th;
      int s_count, rem, gap, gap_ct;
      rem = n_places - n_th * S;
      gap = rem ? n_th / rem : 1;
      gap_ct = gap;
      thidx = n_th;
      if (update_master_only == 1)
        thidx = 1;
      for (f = 0; f < thidx; f++) {
        kmp_info_t *th = team->t.t_threads[f];
        KMP_DEBUG_ASSERT(th != NULL);

        th->th.th_first_place = place;
        th->th.th_new_place = place;
        s_count = 1;
        // Advance to the last place of this thread's sub-partition.
        while (s_count < S) {
          if (place == last_place) {
            place = first_place;
          } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
            place = 0;
          } else {
            place++;
          }
          s_count++;
        }
        if (rem && (gap_ct == gap)) {
          // This sub-partition absorbs one of the leftover places.
          if (place == last_place) {
            place = first_place;
          } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
            place = 0;
          } else {
            place++;
          }
          rem--;
          gap_ct = 0;
        }
        th->th.th_last_place = place;
        gap_ct++;

        // Step past this sub-partition to the start of the next one.
        if (place == last_place) {
          place = first_place;
        } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
          place = 0;
        } else {
          place++;
        }

        KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
                       "partition = [%d,%d]\n",
                       __kmp_gtid_from_thread(team->t.t_threads[f]),
                       team->t.t_id, f, th->th.th_new_place,
                       th->th.th_first_place, th->th.th_last_place));
      }
      KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
    } else {
      // More threads than places: each thread is pinned to a single place
      // (first == last == new), packing S = n_th/n_places threads per place
      // and spreading the remainder every 'gap' places.
      int S, rem, gap, s_count;
      S = n_th / n_places;
      s_count = 0;
      rem = n_th - (S * n_places);
      gap = rem > 0 ? n_places / rem : n_places;
      int place = masters_place;
      int gap_ct = gap;
      thidx = n_th;
      if (update_master_only == 1)
        thidx = 1;
      for (f = 0; f < thidx; f++) {
        kmp_info_t *th = team->t.t_threads[f];
        KMP_DEBUG_ASSERT(th != NULL);

        th->th.th_first_place = place;
        th->th.th_last_place = place;
        th->th.th_new_place = place;
        s_count++;

        if ((s_count == S) && rem && (gap_ct == gap)) {
          // do nothing, add an extra thread to place on next iteration
        } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
          // we added an extra thread to this place; move on to next place
          if (place == last_place) {
            place = first_place;
          } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
            place = 0;
          } else {
            place++;
          }
          s_count = 0;
          gap_ct = 1;
          rem--;
        } else if (s_count == S) { // place is full; don't add extra thread
          if (place == last_place) {
            place = first_place;
          } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
            place = 0;
          } else {
            place++;
          }
          gap_ct++;
          s_count = 0;
        }

        KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
                       "partition = [%d,%d]\n",
                       __kmp_gtid_from_thread(team->t.t_threads[f]),
                       team->t.t_id, f, th->th.th_new_place,
                       th->th.th_first_place, th->th.th_last_place));
      }
      KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
    }
  } break;

  default:
    break;
  }

  KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
}
4726 
4727 #endif /* OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED */
4728 
/* Allocate a team data structure for use by a fork.  Reuse, in order of
   preference:
     1) the root's "hot" team (or, with KMP_NESTED_HOT_TEAMS, the hot team
        kept for the current nesting level) when the root is not already
        inside an active parallel region,
     2) a sufficiently large team taken from the global team free pool
        (undersized pool teams are reaped along the way),
     3) a freshly heap-allocated team.
   The returned team has its ICVs, barrier state, proc-bind, and argv storage
   (re)initialized for new_nproc threads and argc arguments. */
kmp_team_t *
__kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
#if OMPT_SUPPORT
                    ompt_parallel_id_t ompt_parallel_id,
#endif
#if OMP_40_ENABLED
                    kmp_proc_bind_t new_proc_bind,
#endif
                    kmp_internal_control_t *new_icvs,
                    int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
  int f;
  kmp_team_t *team;
  // Hot team is only reusable when the root is not already in an active
  // parallel region (may be refined below for nested hot teams).
  int use_hot_team = !root->r.r_active;
  int level = 0;

  KA_TRACE(20, ("__kmp_allocate_team: called\n"));
  KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
  KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
  KMP_MB();

#if KMP_NESTED_HOT_TEAMS
  kmp_hot_team_ptr_t *hot_teams;
  if (master) {
    team = master->th.th_team;
    level = team->t.t_active_level;
    if (master->th.th_teams_microtask) { // in teams construct?
      if (master->th.th_teams_size.nteams > 1 &&
          ( // #teams > 1
              team->t.t_pkfn ==
                  (microtask_t)__kmp_teams_master || // inner fork of the teams
              master->th.th_teams_level <
                  team->t.t_level)) { // or nested parallel inside the teams
        ++level; // not increment if #teams==1, or for outer fork of the teams;
        // increment otherwise
      }
    }
    hot_teams = master->th.th_hot_teams;
    if (level < __kmp_hot_teams_max_level && hot_teams &&
        hot_teams[level]
            .hot_team) { // hot team has already been allocated for given level
      use_hot_team = 1;
    } else {
      use_hot_team = 0;
    }
  }
#endif
  // Optimization to use a "hot" team
  if (use_hot_team && new_nproc > 1) {
    KMP_DEBUG_ASSERT(new_nproc == max_nproc);
#if KMP_NESTED_HOT_TEAMS
    team = hot_teams[level].hot_team;
#else
    team = root->r.r_hot_team;
#endif
#if KMP_DEBUG
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
                    "task_team[1] = %p before reinit\n",
                    team->t.t_task_team[0], team->t.t_task_team[1]));
    }
#endif

    // Has the number of threads changed?
    /* Let's assume the most common case is that the number of threads is
       unchanged, and put that case first. */
    if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
      KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
      // This case can mean that omp_set_num_threads() was called and the hot
      // team size
      // was already reduced, so we check the special flag
      if (team->t.t_size_changed == -1) {
        team->t.t_size_changed = 1;
      } else {
        KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
      }

      // TODO???: team->t.t_max_active_levels = new_max_active_levels;
      kmp_r_sched_t new_sched = new_icvs->sched;
      // Only write the schedule when it actually changed (avoids dirtying
      // a shared cache line unnecessarily).
      if (team->t.t_sched.r_sched_type != new_sched.r_sched_type ||
          team->t.t_sched.chunk != new_sched.chunk)
        team->t.t_sched =
            new_sched; // set master's schedule as new run-time schedule

      __kmp_reinitialize_team(team, new_icvs,
                              root->r.r_uber_thread->th.th_ident);

      KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
                    team->t.t_threads[0], team));
      __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);

#if OMP_40_ENABLED
#if KMP_AFFINITY_SUPPORTED
      // If size and binding policy are unchanged, the existing place
      // partition is still valid; only the master may need an update for
      // the "spread" policy.
      if ((team->t.t_size_changed == 0) &&
          (team->t.t_proc_bind == new_proc_bind)) {
        if (new_proc_bind == proc_bind_spread) {
          __kmp_partition_places(
              team, 1); // add flag to update only master for spread
        }
        KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
                       "proc_bind = %d, partition = [%d,%d]\n",
                       team->t.t_id, new_proc_bind, team->t.t_first_place,
                       team->t.t_last_place));
      } else {
        KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
        __kmp_partition_places(team);
      }
#else
      KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
#endif /* KMP_AFFINITY_SUPPORTED */
#endif /* OMP_40_ENABLED */
    } else if (team->t.t_nproc > new_nproc) {
      // Hot team is shrinking.
      KA_TRACE(20,
               ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
                new_nproc));

      team->t.t_size_changed = 1;
#if KMP_NESTED_HOT_TEAMS
      if (__kmp_hot_teams_mode == 0) {
        // AC: saved number of threads should correspond to team's value in this
        // mode, can be bigger in mode 1, when hot team has threads in reserve
        KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
        hot_teams[level].hot_team_nth = new_nproc;
#endif // KMP_NESTED_HOT_TEAMS
        /* release the extra threads we don't need any more */
        for (f = new_nproc; f < team->t.t_nproc; f++) {
          KMP_DEBUG_ASSERT(team->t.t_threads[f]);
          if (__kmp_tasking_mode != tskm_immediate_exec) {
            // When decreasing team size, threads no longer in the team should
            // unref task team.
            team->t.t_threads[f]->th.th_task_team = NULL;
          }
          __kmp_free_thread(team->t.t_threads[f]);
          team->t.t_threads[f] = NULL;
        }
#if KMP_NESTED_HOT_TEAMS
      } // (__kmp_hot_teams_mode == 0)
      else {
        // When keeping extra threads in team, switch threads to wait on own
        // b_go flag
        for (f = new_nproc; f < team->t.t_nproc; ++f) {
          KMP_DEBUG_ASSERT(team->t.t_threads[f]);
          kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
          for (int b = 0; b < bs_last_barrier; ++b) {
            if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
              balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
            }
            KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
          }
        }
      }
#endif // KMP_NESTED_HOT_TEAMS
      team->t.t_nproc = new_nproc;
      // TODO???: team->t.t_max_active_levels = new_max_active_levels;
      if (team->t.t_sched.r_sched_type != new_icvs->sched.r_sched_type ||
          team->t.t_sched.chunk != new_icvs->sched.chunk)
        team->t.t_sched = new_icvs->sched;
      __kmp_reinitialize_team(team, new_icvs,
                              root->r.r_uber_thread->th.th_ident);

      /* update the remaining threads */
      for (f = 0; f < new_nproc; ++f) {
        team->t.t_threads[f]->th.th_team_nproc = new_nproc;
      }
      // restore the current task state of the master thread: should be the
      // implicit task
      KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
                    team->t.t_threads[0], team));

      __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);

#ifdef KMP_DEBUG
      for (f = 0; f < team->t.t_nproc; f++) {
        KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
                         team->t.t_threads[f]->th.th_team_nproc ==
                             team->t.t_nproc);
      }
#endif

#if OMP_40_ENABLED
      KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
#if KMP_AFFINITY_SUPPORTED
      __kmp_partition_places(team);
#endif
#endif
    } else { // team->t.t_nproc < new_nproc
      // Hot team is growing: may need reserved threads (nested hot teams),
      // larger arrays, and newly allocated workers.
#if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
      kmp_affin_mask_t *old_mask;
      if (KMP_AFFINITY_CAPABLE()) {
        KMP_CPU_ALLOC(old_mask);
      }
#endif

      KA_TRACE(20,
               ("__kmp_allocate_team: increasing hot team thread count to %d\n",
                new_nproc));

      team->t.t_size_changed = 1;

#if KMP_NESTED_HOT_TEAMS
      int avail_threads = hot_teams[level].hot_team_nth;
      if (new_nproc < avail_threads)
        avail_threads = new_nproc;
      kmp_info_t **other_threads = team->t.t_threads;
      for (f = team->t.t_nproc; f < avail_threads; ++f) {
        // Adjust barrier data of reserved threads (if any) of the team
        // Other data will be set in __kmp_initialize_info() below.
        int b;
        kmp_balign_t *balign = other_threads[f]->th.th_bar;
        for (b = 0; b < bs_last_barrier; ++b) {
          balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
          KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
#if USE_DEBUGGER
          balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
#endif
        }
      }
      if (hot_teams[level].hot_team_nth >= new_nproc) {
        // we have all needed threads in reserve, no need to allocate any
        // this only possible in mode 1, cannot have reserved threads in mode 0
        KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
        team->t.t_nproc = new_nproc; // just get reserved threads involved
      } else {
        // we may have some threads in reserve, but not enough
        team->t.t_nproc =
            hot_teams[level]
                .hot_team_nth; // get reserved threads involved if any
        hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
#endif // KMP_NESTED_HOT_TEAMS
        if (team->t.t_max_nproc < new_nproc) {
          /* reallocate larger arrays */
          __kmp_reallocate_team_arrays(team, new_nproc);
          __kmp_reinitialize_team(team, new_icvs, NULL);
        }

#if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
        /* Temporarily set full mask for master thread before creation of
           workers. The reason is that workers inherit the affinity from master,
           so if a lot of workers are created on the single core quickly, they
           don't get a chance to set their own affinity for a long time. */
        __kmp_set_thread_affinity_mask_full_tmp(old_mask);
#endif

        /* allocate new threads for the hot team */
        for (f = team->t.t_nproc; f < new_nproc; f++) {
          kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
          KMP_DEBUG_ASSERT(new_worker);
          team->t.t_threads[f] = new_worker;

          KA_TRACE(20,
                   ("__kmp_allocate_team: team %d init T#%d arrived: "
                    "join=%llu, plain=%llu\n",
                    team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
                    team->t.t_bar[bs_forkjoin_barrier].b_arrived,
                    team->t.t_bar[bs_plain_barrier].b_arrived));

          { // Initialize barrier data for new threads.
            int b;
            kmp_balign_t *balign = new_worker->th.th_bar;
            for (b = 0; b < bs_last_barrier; ++b) {
              balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
              KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
                               KMP_BARRIER_PARENT_FLAG);
#if USE_DEBUGGER
              balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
#endif
            }
          }
        }

#if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
        if (KMP_AFFINITY_CAPABLE()) {
          /* Restore initial master thread's affinity mask */
          __kmp_set_system_affinity(old_mask, TRUE);
          KMP_CPU_FREE(old_mask);
        }
#endif
#if KMP_NESTED_HOT_TEAMS
      } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
#endif // KMP_NESTED_HOT_TEAMS
      /* make sure everyone is synchronized */
      int old_nproc = team->t.t_nproc; // save old value and use to update only
      // new threads below
      __kmp_initialize_team(team, new_nproc, new_icvs,
                            root->r.r_uber_thread->th.th_ident);

      /* reinitialize the threads */
      KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
      for (f = 0; f < team->t.t_nproc; ++f)
        __kmp_initialize_info(team->t.t_threads[f], team, f,
                              __kmp_gtid_from_tid(f, team));
      if (level) { // set th_task_state for new threads in nested hot team
        // __kmp_initialize_info() no longer zeroes th_task_state, so we should
        // only need to set the th_task_state for the new threads. th_task_state
        // for master thread will not be accurate until after this in
        // __kmp_fork_call(), so we look to the master's memo_stack to get the
        // correct value.
        for (f = old_nproc; f < team->t.t_nproc; ++f)
          team->t.t_threads[f]->th.th_task_state =
              team->t.t_threads[0]->th.th_task_state_memo_stack[level];
      } else { // set th_task_state for new threads in non-nested hot team
        int old_state =
            team->t.t_threads[0]->th.th_task_state; // copy master's state
        for (f = old_nproc; f < team->t.t_nproc; ++f)
          team->t.t_threads[f]->th.th_task_state = old_state;
      }

#ifdef KMP_DEBUG
      for (f = 0; f < team->t.t_nproc; ++f) {
        KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
                         team->t.t_threads[f]->th.th_team_nproc ==
                             team->t.t_nproc);
      }
#endif

#if OMP_40_ENABLED
      KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
#if KMP_AFFINITY_SUPPORTED
      __kmp_partition_places(team);
#endif
#endif
    } // Check changes in number of threads

#if OMP_40_ENABLED
    // Propagate teams-construct info from the master to all workers so the
    // whole (possibly resized) team agrees on the teams context.
    kmp_info_t *master = team->t.t_threads[0];
    if (master->th.th_teams_microtask) {
      for (f = 1; f < new_nproc; ++f) {
        // propagate teams construct specific info to workers
        kmp_info_t *thr = team->t.t_threads[f];
        thr->th.th_teams_microtask = master->th.th_teams_microtask;
        thr->th.th_teams_level = master->th.th_teams_level;
        thr->th.th_teams_size = master->th.th_teams_size;
      }
    }
#endif /* OMP_40_ENABLED */
#if KMP_NESTED_HOT_TEAMS
    if (level) {
      // Sync barrier state for nested hot teams, not needed for outermost hot
      // team.
      for (f = 1; f < new_nproc; ++f) {
        kmp_info_t *thr = team->t.t_threads[f];
        int b;
        kmp_balign_t *balign = thr->th.th_bar;
        for (b = 0; b < bs_last_barrier; ++b) {
          balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
          KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
#if USE_DEBUGGER
          balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
#endif
        }
      }
    }
#endif // KMP_NESTED_HOT_TEAMS

    /* reallocate space for arguments if necessary */
    __kmp_alloc_argv_entries(argc, team, TRUE);
    KMP_CHECK_UPDATE(team->t.t_argc, argc);
    // The hot team re-uses the previous task team,
    // if untouched during the previous release->gather phase.

    KF_TRACE(10, (" hot_team = %p\n", team));

#if KMP_DEBUG
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
                    "task_team[1] = %p after reinit\n",
                    team->t.t_task_team[0], team->t.t_task_team[1]));
    }
#endif

#if OMPT_SUPPORT
    __ompt_team_assign_id(team, ompt_parallel_id);
#endif

    KMP_MB();

    return team;
  }

  /* next, let's try to take one from the team pool */
  KMP_MB();
  for (team = (kmp_team_t *)__kmp_team_pool; (team);) {
    /* TODO: consider resizing undersized teams instead of reaping them, now
       that we have a resizing mechanism */
    if (team->t.t_max_nproc >= max_nproc) {
      /* take this team from the team pool */
      __kmp_team_pool = team->t.t_next_pool;

      /* setup the team for fresh use */
      __kmp_initialize_team(team, new_nproc, new_icvs, NULL);

      KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
                    "task_team[1] %p to NULL\n",
                    &team->t.t_task_team[0], &team->t.t_task_team[1]));
      team->t.t_task_team[0] = NULL;
      team->t.t_task_team[1] = NULL;

      /* reallocate space for arguments if necessary */
      __kmp_alloc_argv_entries(argc, team, TRUE);
      KMP_CHECK_UPDATE(team->t.t_argc, argc);

      KA_TRACE(
          20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
               team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
      { // Initialize barrier data.
        int b;
        for (b = 0; b < bs_last_barrier; ++b) {
          team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
#if USE_DEBUGGER
          team->t.t_bar[b].b_master_arrived = 0;
          team->t.t_bar[b].b_team_arrived = 0;
#endif
        }
      }

#if OMP_40_ENABLED
      team->t.t_proc_bind = new_proc_bind;
#endif

      KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
                    team->t.t_id));

#if OMPT_SUPPORT
      __ompt_team_assign_id(team, ompt_parallel_id);
#endif

      KMP_MB();

      return team;
    }

/* reap team if it is too small, then loop back and check the next one */
// not sure if this is wise, but, will be redone during the hot-teams rewrite.
/* TODO: Use technique to find the right size hot-team, don't reap them */
    team = __kmp_reap_team(team);
    __kmp_team_pool = team;
  }

  /* nothing available in the pool, no matter, make a new team! */
  KMP_MB();
  team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));

  /* and set it up */
  team->t.t_max_nproc = max_nproc;
  /* NOTE well, for some reason allocating one big buffer and dividing it up
     seems to really hurt performance a lot on the P4, so, let's not use this */
  __kmp_allocate_team_arrays(team, max_nproc);

  KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);

  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
                "%p to NULL\n",
                &team->t.t_task_team[0], &team->t.t_task_team[1]));
  team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
  // memory, no need to duplicate
  team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
  // memory, no need to duplicate

  if (__kmp_storage_map) {
    __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
  }

  /* allocate space for arguments */
  __kmp_alloc_argv_entries(argc, team, FALSE);
  team->t.t_argc = argc;

  KA_TRACE(20,
           ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
            team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
  { // Initialize barrier data.
    int b;
    for (b = 0; b < bs_last_barrier; ++b) {
      team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
#if USE_DEBUGGER
      team->t.t_bar[b].b_master_arrived = 0;
      team->t.t_bar[b].b_team_arrived = 0;
#endif
    }
  }

#if OMP_40_ENABLED
  team->t.t_proc_bind = new_proc_bind;
#endif

#if OMPT_SUPPORT
  __ompt_team_assign_id(team, ompt_parallel_id);
  team->t.ompt_serialized_team_info = NULL;
#endif

  KMP_MB();

  KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
                team->t.t_id));

  return team;
}
5228 
5229 /* TODO implement hot-teams at all levels */
5230 /* TODO implement lazy thread release on demand (disband request) */
5231 
/* Free the team: return it to the team pool and release all the threads
   associated with it.  Hot teams (the root's hot team, or a nested hot team
   tracked via the master's hot-teams array) are kept alive: only t_pkfn and
   the copyin counter are reset, and the workers stay attached. */
void __kmp_free_team(kmp_root_t *root,
                     kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
  int f;
  KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
                team->t.t_id));

  /* verify state */
  KMP_DEBUG_ASSERT(root);
  KMP_DEBUG_ASSERT(team);
  KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
  KMP_DEBUG_ASSERT(team->t.t_threads);

  int use_hot_team = team == root->r.r_hot_team;
#if KMP_NESTED_HOT_TEAMS
  int level;
  kmp_hot_team_ptr_t *hot_teams;
  if (master) {
    // Recompute the nesting level this team belongs to; it mirrors the
    // level bookkeeping done in __kmp_allocate_team.
    level = team->t.t_active_level - 1;
    if (master->th.th_teams_microtask) { // in teams construct?
      if (master->th.th_teams_size.nteams > 1) {
        ++level; // level was not increased in teams construct for
        // team_of_masters
      }
      if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
          master->th.th_teams_level == team->t.t_level) {
        ++level; // level was not increased in teams construct for
        // team_of_workers before the parallel
      } // team->t.t_level will be increased inside parallel
    }
    hot_teams = master->th.th_hot_teams;
    if (level < __kmp_hot_teams_max_level) {
      KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
      use_hot_team = 1;
    }
  }
#endif // KMP_NESTED_HOT_TEAMS

  /* team is done working */
  TCW_SYNC_PTR(team->t.t_pkfn,
               NULL); // Important for Debugging Support Library.
  team->t.t_copyin_counter = 0; // init counter for possible reuse
  // Do not reset pointer to parent team to NULL for hot teams.

  /* if we are non-hot team, release our threads */
  if (!use_hot_team) {
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      // Wait for threads to reach reapable state
      for (f = 1; f < team->t.t_nproc; ++f) {
        KMP_DEBUG_ASSERT(team->t.t_threads[f]);
        kmp_info_t *th = team->t.t_threads[f];
        volatile kmp_uint32 *state = &th->th.th_reap_state;
        while (*state != KMP_SAFE_TO_REAP) {
#if KMP_OS_WINDOWS
          // On Windows a thread can be killed at any time, check this
          DWORD ecode;
          if (!__kmp_is_thread_alive(th, &ecode)) {
            *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
            break;
          }
#endif
          // first check if thread is sleeping
          kmp_flag_64 fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
          if (fl.is_sleeping())
            // wake it so it can progress to the reapable state
            fl.resume(__kmp_gtid_from_thread(th));
          KMP_CPU_PAUSE();
        }
      }

      // Delete task teams
      int tt_idx;
      for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
        kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
        if (task_team != NULL) {
          for (f = 0; f < team->t.t_nproc;
               ++f) { // Have all threads unref task teams
            team->t.t_threads[f]->th.th_task_team = NULL;
          }
          KA_TRACE(
              20,
              ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
               __kmp_get_gtid(), task_team, team->t.t_id));
#if KMP_NESTED_HOT_TEAMS
          __kmp_free_task_team(master, task_team);
#endif
          team->t.t_task_team[tt_idx] = NULL;
        }
      }
    }

    // Reset pointer to parent team only for non-hot teams.
    team->t.t_parent = NULL;
    team->t.t_level = 0;
    team->t.t_active_level = 0;

    /* free the worker threads */
    for (f = 1; f < team->t.t_nproc; ++f) {
      KMP_DEBUG_ASSERT(team->t.t_threads[f]);
      __kmp_free_thread(team->t.t_threads[f]);
      team->t.t_threads[f] = NULL;
    }

    /* put the team back in the team pool */
    /* TODO limit size of team pool, call reap_team if pool too large */
    team->t.t_next_pool = (kmp_team_t *)__kmp_team_pool;
    __kmp_team_pool = (volatile kmp_team_t *)team;
  }

  KMP_MB();
}
5343 
5344 /* reap the team.  destroy it, reclaim all its resources and free its memory */
5345 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5346   kmp_team_t *next_pool = team->t.t_next_pool;
5347 
5348   KMP_DEBUG_ASSERT(team);
5349   KMP_DEBUG_ASSERT(team->t.t_dispatch);
5350   KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5351   KMP_DEBUG_ASSERT(team->t.t_threads);
5352   KMP_DEBUG_ASSERT(team->t.t_argv);
5353 
5354   /* TODO clean the threads that are a part of this? */
5355 
5356   /* free stuff */
5357   __kmp_free_team_arrays(team);
5358   if (team->t.t_argv != &team->t.t_inline_argv[0])
5359     __kmp_free((void *)team->t.t_argv);
5360   __kmp_free(team);
5361 
5362   KMP_MB();
5363   return next_pool;
5364 }
5365 
5366 // Free the thread.  Don't reap it, just place it on the pool of available
5367 // threads.
5368 //
5369 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5370 // binding for the affinity mechanism to be useful.
5371 //
5372 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5373 // However, we want to avoid a potential performance problem by always
5374 // scanning through the list to find the correct point at which to insert
5375 // the thread (potential N**2 behavior).  To do this we keep track of the
5376 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5377 // With single-level parallelism, threads will always be added to the tail
5378 // of the list, kept track of by __kmp_thread_pool_insert_pt.  With nested
5379 // parallelism, all bets are off and we may need to scan through the entire
5380 // free list.
5381 //
5382 // This change also has a potentially large performance benefit, for some
5383 // applications.  Previously, as threads were freed from the hot team, they
5384 // would be placed back on the free list in inverse order.  If the hot team
// grew back to its original size, then the freed thread would be placed
5386 // back on the hot team in reverse order.  This could cause bad cache
5387 // locality problems on programs where the size of the hot team regularly
5388 // grew and shrunk.
5389 //
// Now, for single-level parallelism, the OMP tid is always == gtid.
// Return this_th to the free thread pool (__kmp_thread_pool), keeping the
// pool sorted by gtid; see the commentary above for why the sort order and
// the cached insert point (__kmp_thread_pool_insert_pt) matter.
void __kmp_free_thread(kmp_info_t *this_th) {
  int gtid;
  kmp_info_t **scan; // address of the link that will point at this_th

  KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
                __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));

  KMP_DEBUG_ASSERT(this_th);

  // When moving thread to pool, switch thread to wait on own b_go flag, and
  // uninitialized (NULL team).
  int b;
  kmp_balign_t *balign = this_th->th.th_bar;
  for (b = 0; b < bs_last_barrier; ++b) {
    if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
      balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
    balign[b].bb.team = NULL;
    balign[b].bb.leaf_kids = 0;
  }
  this_th->th.th_task_state = 0; // clear task state for reuse by a new team

  /* put thread back on the free pool */
  TCW_PTR(this_th->th.th_team, NULL);
  TCW_PTR(this_th->th.th_root, NULL);
  TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */

  // If the __kmp_thread_pool_insert_pt is already past the new insert
  // point, then we need to re-scan the entire list.
  gtid = this_th->th.th_info.ds.ds_gtid;
  if (__kmp_thread_pool_insert_pt != NULL) {
    KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
    if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
      __kmp_thread_pool_insert_pt = NULL;
    }
  }

  // Scan down the list to find the place to insert the thread.
  // scan is the address of a link in the list, possibly the address of
  // __kmp_thread_pool itself.
  //
  // In the absence of nested parallelism, the for loop will have 0 iterations.
  if (__kmp_thread_pool_insert_pt != NULL) {
    scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
  } else {
    scan = (kmp_info_t **)&__kmp_thread_pool;
  }
  for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
       scan = &((*scan)->th.th_next_pool))
    ;

  // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
  // to its address.
  TCW_PTR(this_th->th.th_next_pool, *scan);
  __kmp_thread_pool_insert_pt = *scan = this_th;
  // Sorted-order invariant: next pool entry (if any) has a larger gtid.
  KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
                   (this_th->th.th_info.ds.ds_gtid <
                    this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
  TCW_4(this_th->th.th_in_pool, TRUE);
  __kmp_thread_pool_nth++;

  TCW_4(__kmp_nth, __kmp_nth - 1);

#ifdef KMP_ADJUST_BLOCKTIME
  /* Adjust blocktime back to user setting or default if necessary */
  /* Middle initialization might never have occurred                */
  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
    KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
    if (__kmp_nth <= __kmp_avail_proc) {
      __kmp_zero_bt = FALSE;
    }
  }
#endif /* KMP_ADJUST_BLOCKTIME */

  KMP_MB();
}
5466 
5467 /* ------------------------------------------------------------------------ */
5468 
/* Outer loop executed by each worker thread: park at the fork barrier until
   the master releases this thread with work, invoke the team's microtask,
   wait at the join barrier, and repeat until library shutdown is signalled
   through __kmp_global.g.g_done.  Returns this_thr so the OS-level thread
   wrapper can identify the descriptor. */
void *__kmp_launch_thread(kmp_info_t *this_thr) {
  int gtid = this_thr->th.th_info.ds.ds_gtid;
  /*    void                 *stack_data;*/
  kmp_team_t *(*volatile pteam); // volatile view of this thread's team slot

  KMP_MB();
  KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));

  if (__kmp_env_consistency_check) {
    this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
  }

#if OMPT_SUPPORT
  // Announce thread start to the tool and record the idle frame.
  if (ompt_enabled) {
    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
    this_thr->th.ompt_thread_info.wait_id = 0;
    this_thr->th.ompt_thread_info.idle_frame = __builtin_frame_address(0);
    if (ompt_callbacks.ompt_callback(ompt_event_thread_begin)) {
      __ompt_thread_begin(ompt_thread_worker, gtid);
    }
  }
#endif

  /* This is the place where threads wait for work */
  while (!TCR_4(__kmp_global.g.g_done)) {
    KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
    KMP_MB();

    /* wait for work to do */
    KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));

#if OMPT_SUPPORT
    if (ompt_enabled) {
      this_thr->th.ompt_thread_info.state = ompt_state_idle;
    }
#endif

    /* No tid yet since not part of a team */
    __kmp_fork_barrier(gtid, KMP_GTID_DNE);

#if OMPT_SUPPORT
    if (ompt_enabled) {
      this_thr->th.ompt_thread_info.state = ompt_state_overhead;
    }
#endif

    // Re-read the team pointer through a volatile alias: the master wrote it
    // while this thread was waiting at the fork barrier.
    pteam = (kmp_team_t * (*))(&this_thr->th.th_team);

    /* have we been allocated? */
    if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
#if OMPT_SUPPORT
      ompt_task_info_t *task_info;
      ompt_parallel_id_t my_parallel_id;
      if (ompt_enabled) {
        task_info = __ompt_get_taskinfo(0);
        // Cache the parallel id now; *pteam may be freed by the master after
        // the join barrier below (see comment there).
        my_parallel_id = (*pteam)->t.ompt_team_info.parallel_id;
      }
#endif
      /* we were just woken up, so run our new task */
      if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
        int rc;
        KA_TRACE(20,
                 ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
                  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
                  (*pteam)->t.t_pkfn));

        updateHWFPControl(*pteam);

#if OMPT_SUPPORT
        if (ompt_enabled) {
          this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
          // Initialize OMPT task id for implicit task.
          int tid = __kmp_tid_from_gtid(gtid);
          task_info->task_id = __ompt_task_id_new(tid);
        }
#endif

        {
          KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
          KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
          rc = (*pteam)->t.t_invoke(gtid); // run the microtask
        }
        KMP_ASSERT(rc);

#if OMPT_SUPPORT
        if (ompt_enabled) {
          /* no frame set while outside task */
          task_info->frame.exit_runtime_frame = NULL;

          this_thr->th.ompt_thread_info.state = ompt_state_overhead;
        }
#endif
        KMP_MB();
        KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
                      gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
                      (*pteam)->t.t_pkfn));
      }
      /* join barrier after parallel region */
      __kmp_join_barrier(gtid);
#if OMPT_SUPPORT && OMPT_TRACE
      if (ompt_enabled) {
        if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
          // don't access *pteam here: it may have already been freed
          // by the master thread behind the barrier (possible race)
          ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
              my_parallel_id, task_info->task_id);
        }
        task_info->frame.exit_runtime_frame = NULL;
        task_info->task_id = 0;
      }
#endif
    }
  }
  TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);

#if OMPT_SUPPORT
  if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_thread_end)) {
    __ompt_thread_end(ompt_thread_worker, gtid);
  }
#endif

  this_thr->th.th_task_team = NULL;
  /* run the destructors for the threadprivate data for this thread */
  __kmp_common_destroy_gtid(gtid);

  KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
  KMP_MB();
  return this_thr;
}
5598 
5599 /* ------------------------------------------------------------------------ */
5600 
/* Thread-specific-data destructor: invoked by the threading layer when a
   thread that registered a gtid in TLS exits.  The TLS slot stores gtid+1
   (0 is reserved for "nothing stored"), so decode it and tear down this
   thread's runtime state via __kmp_internal_end_thread. */
void __kmp_internal_end_dest(void *specific_gtid) {
#if KMP_COMPILER_ICC
#pragma warning(push)
#pragma warning(disable : 810) // conversion from "void *" to "int" may lose
// significant bits
#endif
  // Make sure no significant bits are lost
  int gtid = (kmp_intptr_t)specific_gtid - 1;
#if KMP_COMPILER_ICC
#pragma warning(pop)
#endif

  KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
  /* NOTE: the gtid is stored as gitd+1 in the thread-local-storage
   * this is because 0 is reserved for the nothing-stored case */

  /* josh: One reason for setting the gtid specific data even when it is being
     destroyed by pthread is to allow gtid lookup through thread specific data
     (__kmp_gtid_get_specific).  Some of the code, especially stat code,
     that gets executed in the call to __kmp_internal_end_thread, actually
     gets the gtid through the thread specific data.  Setting it here seems
     rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread
     to run smoothly.
     todo: get rid of this after we remove the dependence on
     __kmp_gtid_get_specific  */
  if (gtid >= 0 && KMP_UBER_GTID(gtid))
    __kmp_gtid_set_specific(gtid);
#ifdef KMP_TDATA_GTID
  __kmp_gtid = gtid;
#endif
  __kmp_internal_end_thread(gtid);
}
5633 
5634 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5635 
5636 // 2009-09-08 (lev): It looks the destructor does not work. In simple test cases
5637 // destructors work perfectly, but in real libomp.so I have no evidence it is
5638 // ever called. However, -fini linker option in makefile.mk works fine.
5639 
// Shared-library destructor: runs when the dynamic library is unloaded.
// (See the note above: it may not actually fire for libomp.so.)
__attribute__((destructor)) void __kmp_internal_end_dtor(void) {
  __kmp_internal_end_atexit();
}

// Explicit finalizer entry point, wired up via the linker's -fini option.
void __kmp_internal_end_fini(void) { __kmp_internal_end_atexit(); }
5645 
5646 #endif
5647 
5648 /* [Windows] josh: when the atexit handler is called, there may still be more
5649    than one thread alive */
void __kmp_internal_end_atexit(void) {
  KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
  /* [Windows]
     josh: ideally, we want to completely shutdown the library in this atexit
     handler, but stat code that depends on thread specific data for gtid fails
     because that data becomes unavailable at some point during the shutdown, so
     we call __kmp_internal_end_thread instead. We should eventually remove the
     dependency on __kmp_get_specific_gtid in the stat code and use
     __kmp_internal_end_library to cleanly shutdown the library.

     // TODO: Can some of this comment about GVS be removed?
     I suspect that the offending stat code is executed when the calling thread
     tries to clean up a dead root thread's data structures, resulting in GVS
     code trying to close the GVS structures for that thread, but since the stat
     code uses __kmp_get_specific_gtid to get the gtid with the assumption that
     the calling thread is cleaning up itself instead of another thread, it get
     confused. This happens because allowing a thread to unregister and cleanup
     another thread is a recent modification for addressing an issue.
     Based on the current design (20050722), a thread may end up
     trying to unregister another thread only if thread death does not trigger
     the calling of __kmp_internal_end_thread.  For Linux* OS, there is the
     thread specific data destructor function to detect thread death. For
     Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
     is nothing.  Thus, the workaround is applicable only for Windows static
     stat library. */
  // -1 means "gtid unknown": the callee will look it up from TLS.
  __kmp_internal_end_library(-1);
#if KMP_OS_WINDOWS
  __kmp_close_console();
#endif
}
5680 
/* Join and free one thread descriptor.  For workers (is_root == 0), the OS
   thread is first released from the fork barrier and reaped; for roots only
   the data structures are torn down.  Decrements the global thread counts
   and frees all per-thread storage, including the serial team. */
static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
  // It is assumed __kmp_forkjoin_lock is acquired.

  int gtid;

  KMP_DEBUG_ASSERT(thread != NULL);

  gtid = thread->th.th_info.ds.ds_gtid;

  if (!is_root) {

    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
      /* Assume the threads are at the fork barrier here */
      KA_TRACE(
          20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
               gtid));
      /* Need release fence here to prevent seg faults for tree forkjoin barrier
       * (GEH) */
      ANNOTATE_HAPPENS_BEFORE(thread);
      kmp_flag_64 flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
      __kmp_release_64(&flag);
    }; // if

    // Terminate OS thread.
    __kmp_reap_worker(thread);

    // The thread was killed asynchronously.  If it was actively
    // spinning in the thread pool, decrement the global count.
    //
    // There is a small timing hole here - if the worker thread was just waking
    // up after sleeping in the pool, had reset it's th_active_in_pool flag but
    // not decremented the global counter __kmp_thread_pool_active_nth yet, then
    // the global counter might not get updated.
    //
    // Currently, this can only happen as the library is unloaded,
    // so there are no harmful side effects.
    if (thread->th.th_active_in_pool) {
      thread->th.th_active_in_pool = FALSE;
      KMP_TEST_THEN_DEC32((kmp_int32 *)&__kmp_thread_pool_active_nth);
      KMP_DEBUG_ASSERT(TCR_4(__kmp_thread_pool_active_nth) >= 0);
    }

    // Decrement # of [worker] threads in the pool.
    KMP_DEBUG_ASSERT(__kmp_thread_pool_nth > 0);
    --__kmp_thread_pool_nth;
  }; // if

  __kmp_free_implicit_task(thread);

// Free the fast memory for tasking
#if USE_FAST_MEMORY
  __kmp_free_fast_memory(thread);
#endif /* USE_FAST_MEMORY */

  __kmp_suspend_uninitialize_thread(thread);

  // Unpublish the descriptor before freeing anything it owns.
  KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
  TCW_SYNC_PTR(__kmp_threads[gtid], NULL);

  --__kmp_all_nth;
// __kmp_nth was decremented when thread is added to the pool.

#ifdef KMP_ADJUST_BLOCKTIME
  /* Adjust blocktime back to user setting or default if necessary */
  /* Middle initialization might never have occurred                */
  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
    KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
    if (__kmp_nth <= __kmp_avail_proc) {
      __kmp_zero_bt = FALSE;
    }
  }
#endif /* KMP_ADJUST_BLOCKTIME */

  /* free the memory being used */
  if (__kmp_env_consistency_check) {
    if (thread->th.th_cons) {
      __kmp_free_cons_stack(thread->th.th_cons);
      thread->th.th_cons = NULL;
    }; // if
  }

  if (thread->th.th_pri_common != NULL) {
    __kmp_free(thread->th.th_pri_common);
    thread->th.th_pri_common = NULL;
  }; // if

  if (thread->th.th_task_state_memo_stack != NULL) {
    __kmp_free(thread->th.th_task_state_memo_stack);
    thread->th.th_task_state_memo_stack = NULL;
  }

#if KMP_USE_BGET
  if (thread->th.th_local.bget_data != NULL) {
    __kmp_finalize_bget(thread);
  }; // if
#endif

#if KMP_AFFINITY_SUPPORTED
  if (thread->th.th_affin_mask != NULL) {
    KMP_CPU_FREE(thread->th.th_affin_mask);
    thread->th.th_affin_mask = NULL;
  }; // if
#endif /* KMP_AFFINITY_SUPPORTED */

  __kmp_reap_team(thread->th.th_serial_team);
  thread->th.th_serial_team = NULL;
  __kmp_free(thread);

  KMP_MB();

} // __kmp_reap_thread
5792 
/* Common shutdown worker: unregister the library, then — if no root is still
   active — reap the monitor thread (if configured), all pooled workers,
   pooled teams, and task teams, and finally run __kmp_cleanup().  Both
   callers hold __kmp_initz_lock and __kmp_forkjoin_lock around this call. */
static void __kmp_internal_end(void) {
  int i;

  /* First, unregister the library */
  __kmp_unregister_library();

#if KMP_OS_WINDOWS
  /* In Win static library, we can't tell when a root actually dies, so we
     reclaim the data structures for any root threads that have died but not
     unregistered themselves, in order to shut down cleanly.
     In Win dynamic library we also can't tell when a thread dies.  */
  __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
// dead roots
#endif

  // Scan for any root that is still active; i indexes the first one found
  // (or equals __kmp_threads_capacity if none).
  for (i = 0; i < __kmp_threads_capacity; i++)
    if (__kmp_root[i])
      if (__kmp_root[i]->r.r_active)
        break;
  KMP_MB(); /* Flush all pending memory write invalidates.  */
  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);

  if (i < __kmp_threads_capacity) {
#if KMP_USE_MONITOR
    // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
    KMP_MB(); /* Flush all pending memory write invalidates.  */

// Need to check that monitor was initialized before reaping it. If we are
// called form __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
// __kmp_monitor will appear to contain valid data, but it is only valid in the
// parent process, not the child.
    // New behavior (201008): instead of keying off of the flag
    // __kmp_init_parallel, the monitor thread creation is keyed off
    // of the new flag __kmp_init_monitor.
    __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
    if (TCR_4(__kmp_init_monitor)) {
      __kmp_reap_monitor(&__kmp_monitor);
      TCW_4(__kmp_init_monitor, 0);
    }
    __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
    KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
#endif // KMP_USE_MONITOR
  } else {
/* TODO move this to cleanup code */
#ifdef KMP_DEBUG
    /* make sure that everything has properly ended */
    for (i = 0; i < __kmp_threads_capacity; i++) {
      if (__kmp_root[i]) {
        //                    KMP_ASSERT( ! KMP_UBER_GTID( i ) );         // AC:
        //                    there can be uber threads alive here
        KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
      }
    }
#endif

    KMP_MB();

    // Reap the worker threads.
    // This is valid for now, but be careful if threads are reaped sooner.
    while (__kmp_thread_pool != NULL) { // Loop thru all the thread in the pool.
      // Get the next thread from the pool.
      kmp_info_t *thread = (kmp_info_t *)__kmp_thread_pool;
      __kmp_thread_pool = thread->th.th_next_pool;
      // Reap it.
      KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
      thread->th.th_next_pool = NULL;
      thread->th.th_in_pool = FALSE;
      __kmp_reap_thread(thread, 0);
    }; // while
    __kmp_thread_pool_insert_pt = NULL;

    // Reap teams.
    while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
      // Get the next team from the pool.
      kmp_team_t *team = (kmp_team_t *)__kmp_team_pool;
      __kmp_team_pool = team->t.t_next_pool;
      // Reap it.
      team->t.t_next_pool = NULL;
      __kmp_reap_team(team);
    }; // while

    __kmp_reap_task_teams();

    for (i = 0; i < __kmp_threads_capacity; ++i) {
      // TBD: Add some checking...
      // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
    }

    /* Make sure all threadprivate destructors get run by joining with all
       worker threads before resetting this flag */
    TCW_SYNC_4(__kmp_init_common, FALSE);

    KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
    KMP_MB();

#if KMP_USE_MONITOR
    // See note above: One of the possible fixes for CQ138434 / CQ140126
    //
    // FIXME: push both code fragments down and CSE them?
    // push them into __kmp_cleanup() ?
    __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
    if (TCR_4(__kmp_init_monitor)) {
      __kmp_reap_monitor(&__kmp_monitor);
      TCW_4(__kmp_init_monitor, 0);
    }
    __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
    KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
#endif
  } /* else !__kmp_global.t_active */
  TCW_4(__kmp_init_gtid, FALSE);
  KMP_MB(); /* Flush all pending memory write invalidates.  */

  __kmp_cleanup();
#if OMPT_SUPPORT
  ompt_fini();
#endif
}
5910 
/* Library-level shutdown entry point (atexit/destructor path).  gtid_req is
   the caller's gtid, or negative to have it looked up from thread-specific
   data.  Unregisters the calling root (if it is an inactive uber thread),
   then performs the full shutdown under __kmp_initz_lock and
   __kmp_forkjoin_lock. */
void __kmp_internal_end_library(int gtid_req) {
  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
  /* this shouldn't be a race condition because __kmp_internal_end() is the
     only place to clear __kmp_serial_init */
  /* we'll check this later too, after we get the lock */
  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
  // redundaant, because the next check will work in any case.
  if (__kmp_global.g.g_abort) {
    KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
    /* TODO abort? */
    return;
  }
  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
    KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
    return;
  }

  KMP_MB(); /* Flush all pending memory write invalidates.  */

  /* find out who we are and what we should do */
  {
    int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
    KA_TRACE(
        10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req));
    if (gtid == KMP_GTID_SHUTDOWN) {
      KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
                    "already shutdown\n"));
      return;
    } else if (gtid == KMP_GTID_MONITOR) {
      KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
                    "registered, or system shutdown\n"));
      return;
    } else if (gtid == KMP_GTID_DNE) {
      KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
                    "shutdown\n"));
      /* we don't know who we are, but we may still shutdown the library */
    } else if (KMP_UBER_GTID(gtid)) {
      /* unregister ourselves as an uber thread.  gtid is no longer valid */
      if (__kmp_root[gtid]->r.r_active) {
        // Root still inside a parallel region: flag abort and bail out.
        __kmp_global.g.g_abort = -1;
        TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
        KA_TRACE(10,
                 ("__kmp_internal_end_library: root still active, abort T#%d\n",
                  gtid));
        return;
      } else {
        KA_TRACE(
            10,
            ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
        __kmp_unregister_root_current_thread(gtid);
      }
    } else {
/* worker threads may call this function through the atexit handler, if they
 * call exit() */
/* For now, skip the usual subsequent processing and just dump the debug buffer.
   TODO: do a thorough shutdown instead */
#ifdef DUMP_DEBUG_ON_EXIT
      if (__kmp_debug_buf)
        __kmp_dump_debug_buffer();
#endif
      return;
    }
  }
  /* synchronize the termination process */
  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);

  /* have we already finished */
  if (__kmp_global.g.g_abort) {
    KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
    /* TODO abort? */
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
    return;
  }
  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
    return;
  }

  /* We need this lock to enforce mutex between this reading of
     __kmp_threads_capacity and the writing by __kmp_register_root.
     Alternatively, we can use a counter of roots that is atomically updated by
     __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
     __kmp_internal_end_*.  */
  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);

  /* now we can safely conduct the actual termination */
  __kmp_internal_end();

  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
  __kmp_release_bootstrap_lock(&__kmp_initz_lock);

  KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));

#ifdef DUMP_DEBUG_ON_EXIT
  if (__kmp_debug_buf)
    __kmp_dump_debug_buffer();
#endif

#if KMP_OS_WINDOWS
  __kmp_close_console();
#endif

  __kmp_fini_allocator();

} // __kmp_internal_end_library
6016 
/* Per-thread shutdown entry point (TLS-destructor/atexit path).  Like
   __kmp_internal_end_library, but worker threads simply detach, and the full
   shutdown is only attempted once no uber (root) thread remains.  When built
   as a dynamic library this function returns early and leaves the real
   shutdown to the library destructor. */
void __kmp_internal_end_thread(int gtid_req) {
  int i;

  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
  /* this shouldn't be a race condition because __kmp_internal_end() is the
   * only place to clear __kmp_serial_init */
  /* we'll check this later too, after we get the lock */
  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
  // redundant, because the next check will work in any case.
  if (__kmp_global.g.g_abort) {
    KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
    /* TODO abort? */
    return;
  }
  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
    KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
    return;
  }

  KMP_MB(); /* Flush all pending memory write invalidates.  */

  /* find out who we are and what we should do */
  {
    int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
    KA_TRACE(10,
             ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req));
    if (gtid == KMP_GTID_SHUTDOWN) {
      KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
                    "already shutdown\n"));
      return;
    } else if (gtid == KMP_GTID_MONITOR) {
      KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
                    "registered, or system shutdown\n"));
      return;
    } else if (gtid == KMP_GTID_DNE) {
      KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
                    "shutdown\n"));
      return;
      /* we don't know who we are */
    } else if (KMP_UBER_GTID(gtid)) {
      /* unregister ourselves as an uber thread.  gtid is no longer valid */
      if (__kmp_root[gtid]->r.r_active) {
        // Root still inside a parallel region: flag abort and bail out.
        __kmp_global.g.g_abort = -1;
        TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
        KA_TRACE(10,
                 ("__kmp_internal_end_thread: root still active, abort T#%d\n",
                  gtid));
        return;
      } else {
        KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
                      gtid));
        __kmp_unregister_root_current_thread(gtid);
      }
    } else {
      /* just a worker thread, let's leave */
      KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));

      if (gtid >= 0) {
        __kmp_threads[gtid]->th.th_task_team = NULL;
      }

      KA_TRACE(10,
               ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
                gtid));
      return;
    }
  }
#if defined KMP_DYNAMIC_LIB
  // AC: lets not shutdown the Linux* OS dynamic library at the exit of uber
  // thread, because we will better shutdown later in the library destructor.
  // The reason of this change is performance problem when non-openmp thread in
  // a loop forks and joins many openmp threads. We can save a lot of time
  // keeping worker threads alive until the program shutdown.
  // OM: Removed Linux* OS restriction to fix the crash on OS X* (DPD200239966)
  // and Windows(DPD200287443) that occurs when using critical sections from
  // foreign threads.
  KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
  return;
  // NOTE: everything below is dead code in dynamic-library builds.
#endif
  /* synchronize the termination process */
  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);

  /* have we already finished */
  if (__kmp_global.g.g_abort) {
    KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
    /* TODO abort? */
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
    return;
  }
  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
    return;
  }

  /* We need this lock to enforce mutex between this reading of
     __kmp_threads_capacity and the writing by __kmp_register_root.
     Alternatively, we can use a counter of roots that is atomically updated by
     __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
     __kmp_internal_end_*.  */

  /* should we finish the run-time?  are all siblings done? */
  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);

  for (i = 0; i < __kmp_threads_capacity; ++i) {
    if (KMP_UBER_GTID(i)) {
      // Another root is still registered: defer the real shutdown to it.
      KA_TRACE(
          10,
          ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
      __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
      __kmp_release_bootstrap_lock(&__kmp_initz_lock);
      return;
    };
  }

  /* now we can safely conduct the actual termination */

  __kmp_internal_end();

  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
  __kmp_release_bootstrap_lock(&__kmp_initz_lock);

  KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));

#ifdef DUMP_DEBUG_ON_EXIT
  if (__kmp_debug_buf)
    __kmp_dump_debug_buffer();
#endif
} // __kmp_internal_end_thread
6145 
6146 // -----------------------------------------------------------------------------
6147 // Library registration stuff.
6148 
// Random value used to indicate library initialization; its address and value
// are both encoded into the registration string so a later process image can
// check whether this copy of the library is still alive and mapped.
static long __kmp_registration_flag = 0;
// Registration string "<flag-address>-<flag-value>-<library-file>" saved in
// env var __KMP_REGISTERED_LIB_<pid> while this copy owns the registration.
static char *__kmp_registration_str = NULL;
6153 
// Build the name of the per-process registration environment variable,
// "__KMP_REGISTERED_LIB_<pid>".  Returns a freshly allocated string that the
// caller releases with KMP_INTERNAL_FREE.
static inline char *__kmp_reg_status_name() {
  /* On RHEL 3u5 if linked statically, getpid() returns different values in
     each thread. If registration and unregistration go in different threads
     (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env
     env var can not be found, because the name will contain different pid. */
  return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
} // __kmp_reg_status_name
6161 
/* Register this copy of the runtime by publishing
   "<flag-address>-<flag-value>-<library-file>" in __KMP_REGISTERED_LIB_<pid>.
   If the variable already holds another copy's string, decide whether that
   copy is alive (fatal DuplicateLibrary error unless KMP_DUPLICATE_LIB_OK is
   set) or dead (clear the variable and retry). */
void __kmp_register_library_startup(void) {

  char *name = __kmp_reg_status_name(); // Name of the environment variable.
  int done = 0;
  union {
    double dtime;
    long ltime;
  } time;
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
  __kmp_initialize_system_tick();
#endif
  __kmp_read_system_time(&time.dtime);
  // Mix the clock's low bits into a recognizable 0xCAFExxxx tag.
  __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
  __kmp_registration_str =
      __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
                       __kmp_registration_flag, KMP_LIBRARY_FILE);

  KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
                __kmp_registration_str));

  while (!done) {

    char *value = NULL; // Actual value of the environment variable.

    // Set environment variable, but do not overwrite if it is exist.
    __kmp_env_set(name, __kmp_registration_str, 0);
    // Check the variable is written.
    value = __kmp_env_get(name);
    if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {

      done = 1; // Ok, environment variable set successfully, exit the loop.

    } else {

      // Oops. Write failed. Another copy of OpenMP RTL is in memory.
      // Check whether it alive or dead.
      int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
      char *tail = value;
      char *flag_addr_str = NULL;
      char *flag_val_str = NULL;
      char const *file_name = NULL;
      // Parse "<addr>-<val>-<file>" back into its three components.
      __kmp_str_split(tail, '-', &flag_addr_str, &tail);
      __kmp_str_split(tail, '-', &flag_val_str, &tail);
      file_name = tail;
      if (tail != NULL) {
        long *flag_addr = 0;
        long flag_val = 0;
        KMP_SSCANF(flag_addr_str, "%p", &flag_addr);
        KMP_SSCANF(flag_val_str, "%lx", &flag_val);
        if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
          // First, check whether environment-encoded address is mapped into
          // addr space.
          // If so, dereference it to see if it still has the right value.
          if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
            neighbor = 1;
          } else {
            // If not, then we know the other copy of the library is no longer
            // running.
            neighbor = 2;
          }; // if
        }; // if
      }; // if
      switch (neighbor) {
      case 0: // Cannot parse environment variable -- neighbor status unknown.
        // Assume it is the incompatible format of future version of the
        // library. Assume the other library is alive.
        // WARN( ... ); // TODO: Issue a warning.
        file_name = "unknown library";
      // Attention! Falling to the next case. That's intentional.
      case 1: { // Neighbor is alive.
        // Check it is allowed.
        char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
        if (!__kmp_str_match_true(duplicate_ok)) {
          // That's not allowed. Issue fatal error.
          __kmp_msg(kmp_ms_fatal,
                    KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
                    KMP_HNT(DuplicateLibrary), __kmp_msg_null);
        }; // if
        KMP_INTERNAL_FREE(duplicate_ok);
        __kmp_duplicate_library_ok = 1;
        done = 1; // Exit the loop.
      } break;
      case 2: { // Neighbor is dead.
        // Clear the variable and try to register library again.
        __kmp_env_unset(name);
      } break;
      default: { KMP_DEBUG_ASSERT(0); } break;
      }; // switch

    }; // if
    KMP_INTERNAL_FREE((void *)value);

  }; // while
  KMP_INTERNAL_FREE((void *)name);

} // func __kmp_register_library_startup
6258 
6259 void __kmp_unregister_library(void) {
6260 
6261   char *name = __kmp_reg_status_name();
6262   char *value = __kmp_env_get(name);
6263 
6264   KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6265   KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6266   if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6267     // Ok, this is our variable. Delete it.
6268     __kmp_env_unset(name);
6269   }; // if
6270 
6271   KMP_INTERNAL_FREE(__kmp_registration_str);
6272   KMP_INTERNAL_FREE(value);
6273   KMP_INTERNAL_FREE(name);
6274 
6275   __kmp_registration_flag = 0;
6276   __kmp_registration_str = NULL;
6277 
6278 } // __kmp_unregister_library
6279 
6280 // End of Library registration stuff.
6281 // -----------------------------------------------------------------------------
6282 
6283 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
6284 
6285 static void __kmp_check_mic_type() {
6286   kmp_cpuid_t cpuid_state = {0};
6287   kmp_cpuid_t *cs_p = &cpuid_state;
6288   __kmp_x86_cpuid(1, 0, cs_p);
6289   // We don't support mic1 at the moment
6290   if ((cs_p->eax & 0xff0) == 0xB10) {
6291     __kmp_mic_type = mic2;
6292   } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6293     __kmp_mic_type = mic3;
6294   } else {
6295     __kmp_mic_type = non_mic;
6296   }
6297 }
6298 
6299 #endif /* KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS) */
6300 
// One-time serial initialization of the runtime: registers the library via an
// environment variable, initializes locks/allocator/configuration defaults,
// allocates the expandable __kmp_threads/__kmp_root arrays, registers the
// initial (uber) root thread, and installs atexit/signal handlers.
// Called with __kmp_initz_lock held (see __kmp_serial_initialize and
// __kmp_parallel_initialize).
static void __kmp_do_serial_initialize(void) {
  int i, gtid;
  int size;

  KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));

  // The runtime relies on these exact type widths.
  KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
  KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
  KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
  KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
  KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));

#if OMPT_SUPPORT
  ompt_pre_init();
#endif

  __kmp_validate_locks();

  /* Initialize internal memory allocator */
  __kmp_init_allocator();

  /* Register the library startup via an environment variable and check to see
     whether another copy of the library is already registered. */

  __kmp_register_library_startup();

  /* TODO reinitialization of library */
  if (TCR_4(__kmp_global.g.g_done)) {
    KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
  }

  __kmp_global.g.g_abort = 0;
  TCW_SYNC_4(__kmp_global.g.g_done, FALSE);

/* initialize the locks */
#if KMP_USE_ADAPTIVE_LOCKS
#if KMP_DEBUG_ADAPTIVE_LOCKS
  __kmp_init_speculative_stats();
#endif
#endif
#if KMP_STATS_ENABLED
  __kmp_stats_init();
#endif
  __kmp_init_lock(&__kmp_global_lock);
  __kmp_init_queuing_lock(&__kmp_dispatch_lock);
  __kmp_init_lock(&__kmp_debug_lock);
  // One atomic lock per operand size/type used by the atomics support.
  __kmp_init_atomic_lock(&__kmp_atomic_lock);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
  __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
  __kmp_init_bootstrap_lock(&__kmp_exit_lock);
#if KMP_USE_MONITOR
  __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
#endif
  __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);

  /* conduct initialization and initial setup of configuration */

  __kmp_runtime_initialize();

#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
  __kmp_check_mic_type();
#endif

// Some global variable initialization moved here from kmp_env_initialize()
#ifdef KMP_DEBUG
  kmp_diag = 0;
#endif
  __kmp_abort_delay = 0;

  // From __kmp_init_dflt_team_nth()
  /* assume the entire machine will be used */
  __kmp_dflt_team_nth_ub = __kmp_xproc;
  if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
    __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
  }
  if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
    __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
  }
  __kmp_max_nth = __kmp_sys_max_nth;

  // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
  // part
  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
#if KMP_USE_MONITOR
  __kmp_monitor_wakeups =
      KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
  __kmp_bt_intervals =
      KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
#endif
  // From "KMP_LIBRARY" part of __kmp_env_initialize()
  __kmp_library = library_throughput;
  // From KMP_SCHEDULE initialization
  __kmp_static = kmp_sch_static_balanced;
// AC: do not use analytical here, because it is non-monotonous
//__kmp_guided = kmp_sch_guided_iterative_chunked;
//__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
// need to repeat assignment
// Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
// bit control and barrier method control parts
#if KMP_FAST_REDUCTION_BARRIER
#define kmp_reduction_barrier_gather_bb ((int)1)
#define kmp_reduction_barrier_release_bb ((int)1)
#define kmp_reduction_barrier_gather_pat bp_hyper_bar
#define kmp_reduction_barrier_release_pat bp_hyper_bar
#endif // KMP_FAST_REDUCTION_BARRIER
  // Set defaults for all barrier types, then override the reduction barrier.
  for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
    __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
    __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
    __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
    __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
#if KMP_FAST_REDUCTION_BARRIER
    if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
      // lin_64 ): hyper,1
      __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
      __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
      __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
      __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
    }
#endif // KMP_FAST_REDUCTION_BARRIER
  }
#if KMP_FAST_REDUCTION_BARRIER
#undef kmp_reduction_barrier_release_pat
#undef kmp_reduction_barrier_gather_pat
#undef kmp_reduction_barrier_release_bb
#undef kmp_reduction_barrier_gather_bb
#endif // KMP_FAST_REDUCTION_BARRIER
#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
  if (__kmp_mic_type == mic2) { // KNC
    // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
    __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
    __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
        1; // forkjoin release
    __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
    __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
  }
#if KMP_FAST_REDUCTION_BARRIER
  if (__kmp_mic_type == mic2) { // KNC
    __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
    __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
  }
#endif
#endif

// From KMP_CHECKS initialization
#ifdef KMP_DEBUG
  __kmp_env_checks = TRUE; /* development versions have the extra checks */
#else
  __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
#endif

  // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
  __kmp_foreign_tp = TRUE;

  __kmp_global.g.g_dynamic = FALSE;
  __kmp_global.g.g_dynamic_mode = dynamic_default;

  // Parse the environment (the values above are defaults that were moved out
  // of __kmp_env_initialize()).
  __kmp_env_initialize(NULL);

// Print all messages in message catalog for testing purposes.
#ifdef KMP_DEBUG
  char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
  if (__kmp_str_match_true(val)) {
    kmp_str_buf_t buffer;
    __kmp_str_buf_init(&buffer);
    __kmp_i18n_dump_catalog(&buffer);
    __kmp_printf("%s", buffer.str);
    __kmp_str_buf_free(&buffer);
  }; // if
  __kmp_env_free(&val);
#endif

  __kmp_threads_capacity =
      __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
  // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
  __kmp_tp_capacity = __kmp_default_tp_capacity(
      __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);

  // If the library is shut down properly, both pools must be NULL. Just in
  // case, set them to NULL -- some memory may leak, but subsequent code will
  // work even if pools are not freed.
  KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
  KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
  KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
  __kmp_thread_pool = NULL;
  __kmp_thread_pool_insert_pt = NULL;
  __kmp_team_pool = NULL;

  /* Allocate all of the variable sized records */
  /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
   * expandable */
  /* Since allocation is cache-aligned, just add extra padding at the end */
  // __kmp_threads and __kmp_root share one allocation: __kmp_root begins
  // immediately after the __kmp_threads_capacity pointer slots.
  size =
      (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
      CACHE_LINE;
  __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
  __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
                               sizeof(kmp_info_t *) * __kmp_threads_capacity);

  /* init thread counts */
  KMP_DEBUG_ASSERT(__kmp_all_nth ==
                   0); // Asserts fail if the library is reinitializing and
  KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
  __kmp_all_nth = 0;
  __kmp_nth = 0;

  /* setup the uber master thread and hierarchy */
  gtid = __kmp_register_root(TRUE);
  KA_TRACE(10, ("__kmp_do_serial_initialize  T#%d\n", gtid));
  KMP_ASSERT(KMP_UBER_GTID(gtid));
  KMP_ASSERT(KMP_INITIAL_GTID(gtid));

  KMP_MB(); /* Flush all pending memory write invalidates.  */

  __kmp_common_initialize();

#if KMP_OS_UNIX
  /* invoke the child fork handler */
  __kmp_register_atfork();
#endif

#if !defined KMP_DYNAMIC_LIB
  {
    /* Invoke the exit handler when the program finishes, only for static
       library. For dynamic library, we already have _fini and DllMain. */
    int rc = atexit(__kmp_internal_end_atexit);
    if (rc != 0) {
      __kmp_msg(kmp_ms_fatal, KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
                __kmp_msg_null);
    }; // if
  }
#endif

#if KMP_HANDLE_SIGNALS
#if KMP_OS_UNIX
  /* NOTE: make sure that this is called before the user installs their own
     signal handlers so that the user handlers are called first. this way they
     can return false, not call our handler, avoid terminating the library, and
     continue execution where they left off. */
  __kmp_install_signals(FALSE);
#endif /* KMP_OS_UNIX */
#if KMP_OS_WINDOWS
  __kmp_install_signals(TRUE);
#endif /* KMP_OS_WINDOWS */
#endif

  /* we have finished the serial initialization */
  __kmp_init_counter++;

  __kmp_init_serial = TRUE;

  if (__kmp_settings) {
    __kmp_env_print();
  }

#if OMP_40_ENABLED
  if (__kmp_display_env || __kmp_display_env_verbose) {
    __kmp_env_print_2();
  }
#endif // OMP_40_ENABLED

#if OMPT_SUPPORT
  ompt_post_init();
#endif

  KMP_MB();

  KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
}
6580 
6581 void __kmp_serial_initialize(void) {
6582   if (__kmp_init_serial) {
6583     return;
6584   }
6585   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6586   if (__kmp_init_serial) {
6587     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6588     return;
6589   }
6590   __kmp_do_serial_initialize();
6591   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6592 }
6593 
// Middle initialization: runs after serial init. Initializes affinity,
// establishes __kmp_avail_proc and the default team size __kmp_dflt_team_nth,
// and propagates a changed default to already-registered root threads.
// Called with __kmp_initz_lock held (see __kmp_middle_initialize).
static void __kmp_do_middle_initialize(void) {
  int i, j;
  int prev_dflt_team_nth;

  if (!__kmp_init_serial) {
    // Lock is already held by our caller, so call the worker directly.
    __kmp_do_serial_initialize();
  }

  KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));

  // Save the previous value for the __kmp_dflt_team_nth so that
  // we can avoid some reinitialization if it hasn't changed.
  prev_dflt_team_nth = __kmp_dflt_team_nth;

#if KMP_AFFINITY_SUPPORTED
  // __kmp_affinity_initialize() will try to set __kmp_ncores to the
  // number of cores on the machine.
  __kmp_affinity_initialize();

  // Run through the __kmp_threads array and set the affinity mask
  // for each root thread that is currently registered with the RTL.
  for (i = 0; i < __kmp_threads_capacity; i++) {
    if (TCR_PTR(__kmp_threads[i]) != NULL) {
      __kmp_affinity_set_init_mask(i, TRUE);
    }
  }
#endif /* KMP_AFFINITY_SUPPORTED */

  KMP_ASSERT(__kmp_xproc > 0);
  if (__kmp_avail_proc == 0) {
    __kmp_avail_proc = __kmp_xproc;
  }

  // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
  // correct them now
  j = 0;
  while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
    __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
        __kmp_avail_proc;
    j++;
  }

  if (__kmp_dflt_team_nth == 0) {
#ifdef KMP_DFLT_NTH_CORES
    // Default #threads = #cores
    __kmp_dflt_team_nth = __kmp_ncores;
    KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
                  "__kmp_ncores (%d)\n",
                  __kmp_dflt_team_nth));
#else
    // Default #threads = #available OS procs
    __kmp_dflt_team_nth = __kmp_avail_proc;
    KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
                  "__kmp_avail_proc(%d)\n",
                  __kmp_dflt_team_nth));
#endif /* KMP_DFLT_NTH_CORES */
  }

  // Clamp the default team size to [KMP_MIN_NTH, __kmp_sys_max_nth].
  if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
    __kmp_dflt_team_nth = KMP_MIN_NTH;
  }
  if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
    __kmp_dflt_team_nth = __kmp_sys_max_nth;
  }

  // There's no harm in continuing if the following check fails,
  // but it indicates an error in the previous logic.
  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);

  if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
    // Run through the __kmp_threads array and set the num threads icv for each
    // root thread that is currently registered with the RTL (which has not
    // already explicitly set its nthreads-var with a call to
    // omp_set_num_threads()).
    for (i = 0; i < __kmp_threads_capacity; i++) {
      kmp_info_t *thread = __kmp_threads[i];
      if (thread == NULL)
        continue;
      if (thread->th.th_current_task->td_icvs.nproc != 0)
        continue;

      set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
    }
  }
  KA_TRACE(
      20,
      ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
       __kmp_dflt_team_nth));

#ifdef KMP_ADJUST_BLOCKTIME
  /* Adjust blocktime to zero if necessary  now that __kmp_avail_proc is set */
  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
    KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
    if (__kmp_nth > __kmp_avail_proc) {
      __kmp_zero_bt = TRUE;
    }
  }
#endif /* KMP_ADJUST_BLOCKTIME */

  /* we have finished middle initialization */
  TCW_SYNC_4(__kmp_init_middle, TRUE);

  KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
}
6698 
6699 void __kmp_middle_initialize(void) {
6700   if (__kmp_init_middle) {
6701     return;
6702   }
6703   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6704   if (__kmp_init_middle) {
6705     __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6706     return;
6707   }
6708   __kmp_do_middle_initialize();
6709   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6710 }
6711 
// Parallel initialization: completes the setup required before the first
// parallel region executes (FP control word capture on x86, signal handlers,
// suspend machinery, dynamic-mode default). Acquires __kmp_initz_lock itself
// and therefore calls __kmp_do_middle_initialize directly rather than the
// public wrapper (which would deadlock on the same lock).
void __kmp_parallel_initialize(void) {
  int gtid = __kmp_entry_gtid(); // this might be a new root

  /* synchronize parallel initialization (for sibling) */
  if (TCR_4(__kmp_init_parallel))
    return;
  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
  if (TCR_4(__kmp_init_parallel)) {
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
    return;
  }

  /* TODO reinitialization after we have already shut down */
  if (TCR_4(__kmp_global.g.g_done)) {
    KA_TRACE(
        10,
        ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
    __kmp_infinite_loop();
  }

  /* jc: The lock __kmp_initz_lock is already held, so calling
     __kmp_serial_initialize would cause a deadlock.  So we call
     __kmp_do_serial_initialize directly. */
  if (!__kmp_init_middle) {
    __kmp_do_middle_initialize();
  }

  /* begin initialization */
  KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
  KMP_ASSERT(KMP_UBER_GTID(gtid));

#if KMP_ARCH_X86 || KMP_ARCH_X86_64
  // Save the FP control regs.
  // Worker threads will set theirs to these values at thread startup.
  __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
  __kmp_store_mxcsr(&__kmp_init_mxcsr);
  __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

#if KMP_OS_UNIX
#if KMP_HANDLE_SIGNALS
  /*  must be after __kmp_serial_initialize  */
  __kmp_install_signals(TRUE);
#endif
#endif

  __kmp_suspend_initialize();

// Pick the default dynamic-adjustment mode if none was chosen yet.
#if defined(USE_LOAD_BALANCE)
  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
    __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
  }
#else
  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
    __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
  }
#endif

  if (__kmp_version) {
    __kmp_print_version_2();
  }

  /* we have finished parallel initialization */
  TCW_SYNC_4(__kmp_init_parallel, TRUE);

  KMP_MB();
  KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));

  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
}
6782 
6783 /* ------------------------------------------------------------------------ */
6784 
// Per-thread setup run just before a thread invokes a parallel region's
// microtask: resets the thread's construct counter and dispatch buffer
// indices, and pushes consistency-check state for the region.
// Note: tid is currently unused here.
void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
                                   kmp_team_t *team) {
  kmp_disp_t *dispatch;

  KMP_MB();

  /* none of the threads have encountered any constructs, yet. */
  this_thr->th.th_local.this_construct = 0;
#if KMP_CACHE_MANAGE
  KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
  dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
  KMP_DEBUG_ASSERT(dispatch);
  KMP_DEBUG_ASSERT(team->t.t_dispatch);
  // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
  // this_thr->th.th_info.ds.ds_tid ] );

  dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
#if OMP_45_ENABLED
  dispatch->th_doacross_buf_idx =
      0; /* reset the doacross dispatch buffer counter */
#endif
  if (__kmp_env_consistency_check)
    __kmp_push_parallel(gtid, team->t.t_ident);

  KMP_MB(); /* Flush all pending memory write invalidates.  */
}
6812 
6813 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
6814                                   kmp_team_t *team) {
6815   if (__kmp_env_consistency_check)
6816     __kmp_pop_parallel(gtid, team->t.t_ident);
6817 
6818   __kmp_finish_implicit_task(this_thr);
6819 }
6820 
// Invoke the current team's microtask (team->t.t_pkfn) on this thread,
// bracketed by the before/after bookkeeping hooks and by ITT, stats, and
// OMPT notifications. Returns the value produced by __kmp_invoke_microtask.
int __kmp_invoke_task_func(int gtid) {
  int rc;
  int tid = __kmp_tid_from_gtid(gtid);
  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *team = this_thr->th.th_team;

  __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
#if USE_ITT_BUILD
  if (__itt_stack_caller_create_ptr) {
    __kmp_itt_stack_callee_enter(
        (__itt_caller)
            team->t.t_stack_id); // inform ittnotify about entering user's code
  }
#endif /* USE_ITT_BUILD */
#if INCLUDE_SSC_MARKS
  SSC_MARK_INVOKING();
#endif

#if OMPT_SUPPORT
  void *dummy;
  void **exit_runtime_p;
  ompt_task_id_t my_task_id;
  ompt_parallel_id_t my_parallel_id;

  // When OMPT is active, expose the microtask's exit frame through the
  // implicit task's OMPT task info; otherwise point at a throwaway slot.
  if (ompt_enabled) {
    exit_runtime_p = &(team->t.t_implicit_task_taskdata[tid]
                           .ompt_task_info.frame.exit_runtime_frame);
  } else {
    exit_runtime_p = &dummy;
  }

#if OMPT_TRACE
  my_task_id = team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id;
  my_parallel_id = team->t.ompt_team_info.parallel_id;
  if (ompt_enabled &&
      ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
    ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(my_parallel_id,
                                                                 my_task_id);
  }
#endif
#endif

  {
    KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
    KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
    rc =
        __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
                               tid, (int)team->t.t_argc, (void **)team->t.t_argv
#if OMPT_SUPPORT
                               ,
                               exit_runtime_p
#endif
                               );
#if OMPT_SUPPORT
    *exit_runtime_p = NULL;
#endif
  }

#if USE_ITT_BUILD
  if (__itt_stack_caller_create_ptr) {
    __kmp_itt_stack_callee_leave(
        (__itt_caller)
            team->t.t_stack_id); // inform ittnotify about leaving user's code
  }
#endif /* USE_ITT_BUILD */
  __kmp_run_after_invoked_task(gtid, tid, this_thr, team);

  return rc;
}
6890 
6891 #if OMP_40_ENABLED
// Executed by every master thread in a teams construct: forks the league of
// teams (workers are not released — they wait in the fork barrier until the
// next parallel region) and immediately joins without a join barrier.
void __kmp_teams_master(int gtid) {
  // This routine is called by all master threads in teams construct
  kmp_info_t *thr = __kmp_threads[gtid];
  kmp_team_t *team = thr->th.th_team;
  ident_t *loc = team->t.t_ident;
  thr->th.th_set_nproc = thr->th.th_teams_size.nth;
  KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
  KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
  KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
                __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
// Launch league of teams now, but not let workers execute
// (they hang on fork barrier until next parallel)
#if INCLUDE_SSC_MARKS
  SSC_MARK_FORKING();
#endif
  __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
#if OMPT_SUPPORT
                  (void *)thr->th.th_teams_microtask, // "unwrapped" task
#endif
                  (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
                  VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
#if INCLUDE_SSC_MARKS
  SSC_MARK_JOINING();
#endif

  // AC: last parameter "1" eliminates join barrier which won't work because
  // worker threads are in a fork barrier waiting for more parallel regions
  __kmp_join_call(loc, gtid
#if OMPT_SUPPORT
                  ,
                  fork_context_intel
#endif
                  ,
                  1);
}
6927 
6928 int __kmp_invoke_teams_master(int gtid) {
6929   kmp_info_t *this_thr = __kmp_threads[gtid];
6930   kmp_team_t *team = this_thr->th.th_team;
6931 #if KMP_DEBUG
6932   if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
6933     KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
6934                      (void *)__kmp_teams_master);
6935 #endif
6936   __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
6937   __kmp_teams_master(gtid);
6938   __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
6939   return 1;
6940 }
6941 #endif /* OMP_40_ENABLED */
6942 
6943 /* this sets the requested number of threads for the next parallel region
6944    encountered by this team. since this should be enclosed in the forkjoin
6945    critical section it should avoid race conditions with assymmetrical nested
6946    parallelism */
6947 
6948 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
6949   kmp_info_t *thr = __kmp_threads[gtid];
6950 
6951   if (num_threads > 0)
6952     thr->th.th_set_nproc = num_threads;
6953 }
6954 
6955 #if OMP_40_ENABLED
6956 
6957 /* this sets the requested number of teams for the teams region and/or
6958    the number of threads for the next parallel region encountered  */
6959 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
6960                           int num_threads) {
6961   kmp_info_t *thr = __kmp_threads[gtid];
6962   KMP_DEBUG_ASSERT(num_teams >= 0);
6963   KMP_DEBUG_ASSERT(num_threads >= 0);
6964 
6965   if (num_teams == 0)
6966     num_teams = 1; // default number of teams is 1.
6967   if (num_teams > __kmp_max_nth) { // if too many teams requested?
6968     if (!__kmp_reserve_warn) {
6969       __kmp_reserve_warn = 1;
6970       __kmp_msg(kmp_ms_warning,
6971                 KMP_MSG(CantFormThrTeam, num_teams, __kmp_max_nth),
6972                 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
6973     }
6974     num_teams = __kmp_max_nth;
6975   }
6976   // Set number of teams (number of threads in the outer "parallel" of the
6977   // teams)
6978   thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
6979 
6980   // Remember the number of threads for inner parallel regions
6981   if (num_threads == 0) {
6982     if (!TCR_4(__kmp_init_middle))
6983       __kmp_middle_initialize(); // get __kmp_avail_proc calculated
6984     num_threads = __kmp_avail_proc / num_teams;
6985     if (num_teams * num_threads > __kmp_max_nth) {
6986       // adjust num_threads w/o warning as it is not user setting
6987       num_threads = __kmp_max_nth / num_teams;
6988     }
6989   } else {
6990     if (num_teams * num_threads > __kmp_max_nth) {
6991       int new_threads = __kmp_max_nth / num_teams;
6992       if (!__kmp_reserve_warn) { // user asked for too many threads
6993         __kmp_reserve_warn = 1; // that conflicts with OMP_THREAD_LIMIT
6994         __kmp_msg(kmp_ms_warning,
6995                   KMP_MSG(CantFormThrTeam, num_threads, new_threads),
6996                   KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
6997       }
6998       num_threads = new_threads;
6999     }
7000   }
7001   thr->th.th_teams_size.nth = num_threads;
7002 }
7003 
7004 // Set the proc_bind var to use in the following parallel region.
7005 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7006   kmp_info_t *thr = __kmp_threads[gtid];
7007   thr->th.th_set_proc_bind = proc_bind;
7008 }
7009 
7010 #endif /* OMP_40_ENABLED */
7011 
7012 /* Launch the worker threads into the microtask. */
7013 
// Master-side launch of a parallel region: resets the team's per-region
// construct/ordered state and dispatch buffer identifiers, then releases the
// worker threads through the fork barrier.
void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
  kmp_info_t *this_thr = __kmp_threads[gtid];

#ifdef KMP_DEBUG
  int f;
#endif /* KMP_DEBUG */

  KMP_DEBUG_ASSERT(team);
  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
  KMP_ASSERT(KMP_MASTER_GTID(gtid));
  KMP_MB(); /* Flush all pending memory write invalidates.  */

  team->t.t_construct = 0; /* no single directives seen yet */
  team->t.t_ordered.dt.t_value =
      0; /* thread 0 enters the ordered section first */

  /* Reset the identifiers on the dispatch buffer */
  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
  if (team->t.t_max_nproc > 1) {
    int i;
    for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
      team->t.t_disp_buffer[i].buffer_index = i;
#if OMP_45_ENABLED
      team->t.t_disp_buffer[i].doacross_buf_idx = i;
#endif
    }
  } else {
    // Serial-capable team: only the first buffer is used.
    team->t.t_disp_buffer[0].buffer_index = 0;
#if OMP_45_ENABLED
    team->t.t_disp_buffer[0].doacross_buf_idx = 0;
#endif
  }

  KMP_MB(); /* Flush all pending memory write invalidates.  */
  KMP_ASSERT(this_thr->th.th_team == team);

#ifdef KMP_DEBUG
  for (f = 0; f < team->t.t_nproc; f++) {
    KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
                     team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
  }
#endif /* KMP_DEBUG */

  /* release the worker threads so they may begin working */
  __kmp_fork_barrier(gtid, 0);
}
7060 
// Master-side join of a parallel region: waits in the join barrier for all
// team members. In debug builds, dumps runtime structures when the recorded
// team size is inconsistent before asserting.
void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
  kmp_info_t *this_thr = __kmp_threads[gtid];

  KMP_DEBUG_ASSERT(team);
  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
  KMP_ASSERT(KMP_MASTER_GTID(gtid));
  KMP_MB(); /* Flush all pending memory write invalidates.  */

/* Join barrier after fork */

#ifdef KMP_DEBUG
  if (__kmp_threads[gtid] &&
      __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
    __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
                 __kmp_threads[gtid]);
    __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
                 "team->t.t_nproc=%d\n",
                 gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
                 team->t.t_nproc);
    __kmp_print_structure();
  }
  KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
                   __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
#endif /* KMP_DEBUG */

  __kmp_join_barrier(gtid); /* wait for everyone */

  KMP_MB(); /* Flush all pending memory write invalidates.  */
  KMP_ASSERT(this_thr->th.th_team == team);
}
7091 
7092 /* ------------------------------------------------------------------------ */
7093 
7094 #ifdef USE_LOAD_BALANCE
7095 
7096 // Return the worker threads actively spinning in the hot team, if we
7097 // are at the outermost level of parallelism.  Otherwise, return 0.
7098 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7099   int i;
7100   int retval;
7101   kmp_team_t *hot_team;
7102 
7103   if (root->r.r_active) {
7104     return 0;
7105   }
7106   hot_team = root->r.r_hot_team;
7107   if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7108     return hot_team->t.t_nproc - 1; // Don't count master thread
7109   }
7110 
7111   // Skip the master thread - it is accounted for elsewhere.
7112   retval = 0;
7113   for (i = 1; i < hot_team->t.t_nproc; i++) {
7114     if (hot_team->t.t_threads[i]->th.th_active) {
7115       retval++;
7116     }
7117   }
7118   return retval;
7119 }
7120 
7121 // Perform an automatic adjustment to the number of
7122 // threads used by the next parallel region.
7123 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7124   int retval;
7125   int pool_active;
7126   int hot_team_active;
7127   int team_curr_active;
7128   int system_active;
7129 
7130   KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7131                 set_nproc));
7132   KMP_DEBUG_ASSERT(root);
7133   KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7134                        ->th.th_current_task->td_icvs.dynamic == TRUE);
7135   KMP_DEBUG_ASSERT(set_nproc > 1);
7136 
7137   if (set_nproc == 1) {
7138     KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7139     return 1;
7140   }
7141 
7142   // Threads that are active in the thread pool, active in the hot team for this
7143   // particular root (if we are at the outer par level), and the currently
7144   // executing thread (to become the master) are available to add to the new
7145   // team, but are currently contributing to the system load, and must be
7146   // accounted for.
7147   pool_active = TCR_4(__kmp_thread_pool_active_nth);
7148   hot_team_active = __kmp_active_hot_team_nproc(root);
7149   team_curr_active = pool_active + hot_team_active + 1;
7150 
7151   // Check the system load.
7152   system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7153   KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7154                 "hot team active = %d\n",
7155                 system_active, pool_active, hot_team_active));
7156 
7157   if (system_active < 0) {
7158     // There was an error reading the necessary info from /proc, so use the
7159     // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7160     // = dynamic_thread_limit, we shouldn't wind up getting back here.
7161     __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7162     KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7163 
7164     // Make this call behave like the thread limit algorithm.
7165     retval = __kmp_avail_proc - __kmp_nth +
7166              (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7167     if (retval > set_nproc) {
7168       retval = set_nproc;
7169     }
7170     if (retval < KMP_MIN_NTH) {
7171       retval = KMP_MIN_NTH;
7172     }
7173 
7174     KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7175                   retval));
7176     return retval;
7177   }
7178 
7179   // There is a slight delay in the load balance algorithm in detecting new
7180   // running procs. The real system load at this instant should be at least as
7181   // large as the #active omp thread that are available to add to the team.
7182   if (system_active < team_curr_active) {
7183     system_active = team_curr_active;
7184   }
7185   retval = __kmp_avail_proc - system_active + team_curr_active;
7186   if (retval > set_nproc) {
7187     retval = set_nproc;
7188   }
7189   if (retval < KMP_MIN_NTH) {
7190     retval = KMP_MIN_NTH;
7191   }
7192 
7193   KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7194   return retval;
7195 } // __kmp_load_balance_nproc()
7196 
7197 #endif /* USE_LOAD_BALANCE */
7198 
7199 /* ------------------------------------------------------------------------ */
7200 
/* NOTE: this is called with the __kmp_init_lock held */
// Tear down runtime state at library shutdown. The unwinding order mirrors
// the initialization stages: parallel -> middle -> serial, then the thread
// and root tables, lock tables, and the nested-nth / proc-bind arrays.
void __kmp_cleanup(void) {
  int f;

  KA_TRACE(10, ("__kmp_cleanup: enter\n"));

  // Stage 1: undo parallel-level initialization (signal handlers).
  if (TCR_4(__kmp_init_parallel)) {
#if KMP_HANDLE_SIGNALS
    __kmp_remove_signals();
#endif
    TCW_4(__kmp_init_parallel, FALSE);
  }

  // Stage 2: undo middle-level initialization (affinity/topology state).
  if (TCR_4(__kmp_init_middle)) {
#if KMP_AFFINITY_SUPPORTED
    __kmp_affinity_uninitialize();
#endif /* KMP_AFFINITY_SUPPORTED */
    __kmp_cleanup_hierarchy();
    TCW_4(__kmp_init_middle, FALSE);
  }

  KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));

  // Stage 3: undo serial-level initialization (OS-level runtime resources).
  if (__kmp_init_serial) {
    __kmp_runtime_destroy();
    __kmp_init_serial = FALSE;
  }

  // Free per-slot root structures; the slot pointers themselves live in the
  // combined __kmp_threads/__kmp_root allocation freed below.
  for (f = 0; f < __kmp_threads_capacity; f++) {
    if (__kmp_root[f] != NULL) {
      __kmp_free(__kmp_root[f]);
      __kmp_root[f] = NULL;
    }
  }
  __kmp_free(__kmp_threads);
  // __kmp_threads and __kmp_root were allocated at once, as single block, so
  // there is no need in freeing __kmp_root.
  __kmp_threads = NULL;
  __kmp_root = NULL;
  __kmp_threads_capacity = 0;

#if KMP_USE_DYNAMIC_LOCK
  __kmp_cleanup_indirect_user_locks();
#else
  __kmp_cleanup_user_locks();
#endif

#if KMP_AFFINITY_SUPPORTED
  KMP_INTERNAL_FREE((void *)__kmp_cpuinfo_file);
  __kmp_cpuinfo_file = NULL;
#endif /* KMP_AFFINITY_SUPPORTED */

#if KMP_USE_ADAPTIVE_LOCKS
#if KMP_DEBUG_ADAPTIVE_LOCKS
  __kmp_print_speculative_stats();
#endif
#endif
  // Release the OMP_NUM_THREADS nesting list and the OMP_PROC_BIND list.
  KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
  __kmp_nested_nth.nth = NULL;
  __kmp_nested_nth.size = 0;
  __kmp_nested_nth.used = 0;
  KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
  __kmp_nested_proc_bind.bind_types = NULL;
  __kmp_nested_proc_bind.size = 0;
  __kmp_nested_proc_bind.used = 0;

  __kmp_i18n_catclose();

#if KMP_STATS_ENABLED
  __kmp_stats_fini();
#endif

  KA_TRACE(10, ("__kmp_cleanup: exit\n"));
}
7275 
7276 /* ------------------------------------------------------------------------ */
7277 
7278 int __kmp_ignore_mppbeg(void) {
7279   char *env;
7280 
7281   if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7282     if (__kmp_str_match_false(env))
7283       return FALSE;
7284   }
7285   // By default __kmpc_begin() is no-op.
7286   return TRUE;
7287 }
7288 
7289 int __kmp_ignore_mppend(void) {
7290   char *env;
7291 
7292   if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7293     if (__kmp_str_match_false(env))
7294       return FALSE;
7295   }
7296   // By default __kmpc_end() is no-op.
7297   return TRUE;
7298 }
7299 
// Mark the calling uber thread's root as "begun", registering the thread
// with the runtime if necessary. Idempotent and safe under concurrency via
// double-checked locking on r_begin_lock.
void __kmp_internal_begin(void) {
  int gtid;
  kmp_root_t *root;

  /* this is a very important step as it will register new sibling threads
     and assign these new uber threads a new gtid */
  gtid = __kmp_entry_gtid();
  root = __kmp_threads[gtid]->th.th_root;
  KMP_ASSERT(KMP_UBER_GTID(gtid));

  // Fast path: root already begun, no lock needed.
  if (root->r.r_begin)
    return;
  __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
  // Re-check under the lock in case another thread beat us here.
  if (root->r.r_begin) {
    __kmp_release_lock(&root->r.r_begin_lock, gtid);
    return;
  }

  root->r.r_begin = TRUE;

  __kmp_release_lock(&root->r.r_begin_lock, gtid);
}
7322 
7323 /* ------------------------------------------------------------------------ */
7324 
7325 void __kmp_user_set_library(enum library_type arg) {
7326   int gtid;
7327   kmp_root_t *root;
7328   kmp_info_t *thread;
7329 
7330   /* first, make sure we are initialized so we can get our gtid */
7331 
7332   gtid = __kmp_entry_gtid();
7333   thread = __kmp_threads[gtid];
7334 
7335   root = thread->th.th_root;
7336 
7337   KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7338                 library_serial));
7339   if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7340                                   thread */
7341     KMP_WARNING(SetLibraryIncorrectCall);
7342     return;
7343   }
7344 
7345   switch (arg) {
7346   case library_serial:
7347     thread->th.th_set_nproc = 0;
7348     set__nproc(thread, 1);
7349     break;
7350   case library_turnaround:
7351     thread->th.th_set_nproc = 0;
7352     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7353                                            : __kmp_dflt_team_nth_ub);
7354     break;
7355   case library_throughput:
7356     thread->th.th_set_nproc = 0;
7357     set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7358                                            : __kmp_dflt_team_nth_ub);
7359     break;
7360   default:
7361     KMP_FATAL(UnknownLibraryType, arg);
7362   }
7363 
7364   __kmp_aux_set_library(arg);
7365 }
7366 
7367 void __kmp_aux_set_stacksize(size_t arg) {
7368   if (!__kmp_init_serial)
7369     __kmp_serial_initialize();
7370 
7371 #if KMP_OS_DARWIN
7372   if (arg & (0x1000 - 1)) {
7373     arg &= ~(0x1000 - 1);
7374     if (arg + 0x1000) /* check for overflow if we round up */
7375       arg += 0x1000;
7376   }
7377 #endif
7378   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7379 
7380   /* only change the default stacksize before the first parallel region */
7381   if (!TCR_4(__kmp_init_parallel)) {
7382     size_t value = arg; /* argument is in bytes */
7383 
7384     if (value < __kmp_sys_min_stksize)
7385       value = __kmp_sys_min_stksize;
7386     else if (value > KMP_MAX_STKSIZE)
7387       value = KMP_MAX_STKSIZE;
7388 
7389     __kmp_stksize = value;
7390 
7391     __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
7392   }
7393 
7394   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7395 }
7396 
7397 /* set the behaviour of the runtime library */
7398 /* TODO this can cause some odd behaviour with sibling parallelism... */
7399 void __kmp_aux_set_library(enum library_type arg) {
7400   __kmp_library = arg;
7401 
7402   switch (__kmp_library) {
7403   case library_serial: {
7404     KMP_INFORM(LibraryIsSerial);
7405     (void)__kmp_change_library(TRUE);
7406   } break;
7407   case library_turnaround:
7408     (void)__kmp_change_library(TRUE);
7409     break;
7410   case library_throughput:
7411     (void)__kmp_change_library(FALSE);
7412     break;
7413   default:
7414     KMP_FATAL(UnknownLibraryType, arg);
7415   }
7416 }
7417 
7418 /* ------------------------------------------------------------------------ */
7419 
// Set the blocktime (milliseconds a thread spins before sleeping) for the
// given thread's current team and its serial team. The value is clamped to
// [KMP_MIN_BLOCKTIME, KMP_MAX_BLOCKTIME]; when the monitor thread is in use
// the equivalent wakeup-interval count is also stored.
void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
  int blocktime = arg; /* argument is in milliseconds */
#if KMP_USE_MONITOR
  int bt_intervals;
#endif
  int bt_set;

  // Preserve current ICVs so they can be restored when the region ends.
  __kmp_save_internal_controls(thread);

  /* Normalize and set blocktime for the teams */
  if (blocktime < KMP_MIN_BLOCKTIME)
    blocktime = KMP_MIN_BLOCKTIME;
  else if (blocktime > KMP_MAX_BLOCKTIME)
    blocktime = KMP_MAX_BLOCKTIME;

  set__blocktime_team(thread->th.th_team, tid, blocktime);
  set__blocktime_team(thread->th.th_serial_team, 0, blocktime);

#if KMP_USE_MONITOR
  /* Calculate and set blocktime intervals for the teams */
  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);

  set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
  set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
#endif

  /* Set whether blocktime has been set to "TRUE" */
  bt_set = TRUE;

  set__bt_set_team(thread->th.th_team, tid, bt_set);
  set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
#if KMP_USE_MONITOR
  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
                "bt_intervals=%d, monitor_updates=%d\n",
                __kmp_gtid_from_tid(tid, thread->th.th_team),
                thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
                __kmp_monitor_wakeups));
#else
  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
                __kmp_gtid_from_tid(tid, thread->th.th_team),
                thread->th.th_team->t.t_id, tid, blocktime));
#endif
}
7463 
7464 void __kmp_aux_set_defaults(char const *str, int len) {
7465   if (!__kmp_init_serial) {
7466     __kmp_serial_initialize();
7467   };
7468   __kmp_env_initialize(str);
7469 
7470   if (__kmp_settings
7471 #if OMP_40_ENABLED
7472       || __kmp_display_env || __kmp_display_env_verbose
7473 #endif // OMP_40_ENABLED
7474       ) {
7475     __kmp_env_print();
7476   }
7477 } // __kmp_aux_set_defaults
7478 
7479 /* ------------------------------------------------------------------------ */
7480 /* internal fast reduction routines */
7481 
// Select the reduction implementation (critical section, atomic, or tree)
// for the current construct, based on what the compiler generated, the team
// size, architecture/OS tuning, and any KMP_FORCE_REDUCTION override.
PACKED_REDUCTION_METHOD_T
__kmp_determine_reduction_method(
    ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
    void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
    kmp_critical_name *lck) {

  // Default reduction method: critical construct ( lck != NULL, like in current
  // PAROPT )
  // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
  // can be selected by RTL
  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
  // can be selected by RTL
  // Finally, it's up to OpenMP RTL to make a decision on which method to select
  // among generated by PAROPT.

  PACKED_REDUCTION_METHOD_T retval;

  int team_size;

  KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
  KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )

// Helper predicates: did the compiler emit code for the atomic / tree paths?
#define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
  ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
#define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))

  retval = critical_reduce_block;

  // another choice of getting a team size (with 1 dynamic deference) is slower
  team_size = __kmp_get_team_num_threads(global_tid);
  if (team_size == 1) {

    // Serialized team: no synchronization needed.
    retval = empty_reduce_block;

  } else {

    int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
    int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;

#if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64

#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_WINDOWS ||       \
    KMP_OS_DARWIN

    // 64-bit tuning: atomics win for small teams, tree reduction for large.
    int teamsize_cutoff = 4;

#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
    // MIC (Xeon Phi) has many more hardware threads; raise the cutoff.
    if (__kmp_mic_type != non_mic) {
      teamsize_cutoff = 8;
    }
#endif
    if (tree_available) {
      if (team_size <= teamsize_cutoff) {
        if (atomic_available) {
          retval = atomic_reduce_block;
        }
      } else {
        retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
      }
    } else if (atomic_available) {
      retval = atomic_reduce_block;
    }
#else
#error "Unknown or unsupported OS"
#endif // KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_WINDOWS ||
// KMP_OS_DARWIN

#elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS

#if KMP_OS_LINUX || KMP_OS_WINDOWS

    // basic tuning

    if (atomic_available) {
      if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
        retval = atomic_reduce_block;
      }
    } // otherwise: use critical section

#elif KMP_OS_DARWIN

    if (atomic_available && (num_vars <= 3)) {
      retval = atomic_reduce_block;
    } else if (tree_available) {
      if ((reduce_size > (9 * sizeof(kmp_real64))) &&
          (reduce_size < (2000 * sizeof(kmp_real64)))) {
        retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
      }
    } // otherwise: use critical section

#else
#error "Unknown or unsupported OS"
#endif

#else
#error "Unknown or unsupported architecture"
#endif
  }

  // KMP_FORCE_REDUCTION

  // If the team is serialized (team_size == 1), ignore the forced reduction
  // method and stay with the unsynchronized method (empty_reduce_block)
  if (__kmp_force_reduction_method != reduction_method_not_defined &&
      team_size != 1) {

    PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;

    int atomic_available, tree_available;

    switch ((forced_retval = __kmp_force_reduction_method)) {
    case critical_reduce_block:
      KMP_ASSERT(lck); // lck should be != 0
      break;

    case atomic_reduce_block:
      // Forcing atomics is only honored if the compiler generated that path.
      atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
      if (!atomic_available) {
        KMP_WARNING(RedMethodNotSupported, "atomic");
        forced_retval = critical_reduce_block;
      }
      break;

    case tree_reduce_block:
      // Likewise, tree reduction requires compiler-generated reduce_func.
      tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
      if (!tree_available) {
        KMP_WARNING(RedMethodNotSupported, "tree");
        forced_retval = critical_reduce_block;
      } else {
#if KMP_FAST_REDUCTION_BARRIER
        forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
#endif
      }
      break;

    default:
      KMP_ASSERT(0); // "unsupported method specified"
    }

    retval = forced_retval;
  }

  KA_TRACE(10, ("reduction method selected=%08x\n", retval));

#undef FAST_REDUCTION_TREE_METHOD_GENERATED
#undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED

  return (retval);
}
7631 
7632 // this function is for testing set/get/determine reduce method
7633 kmp_int32 __kmp_get_reduce_method(void) {
7634   return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
7635 }
7636