1 /*
2  * kmp_runtime.c -- KPTS runtime support library
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 //                     The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "kmp.h"
17 #include "kmp_atomic.h"
18 #include "kmp_wrapper_getpid.h"
19 #include "kmp_environment.h"
20 #include "kmp_itt.h"
21 #include "kmp_str.h"
22 #include "kmp_settings.h"
23 #include "kmp_i18n.h"
24 #include "kmp_io.h"
25 #include "kmp_error.h"
26 #include "kmp_stats.h"
27 #include "kmp_wait_release.h"
28 #include "kmp_affinity.h"
29 
30 #if OMPT_SUPPORT
31 #include "ompt-specific.h"
32 #endif
33 
34 /* these are temporary issues to be dealt with */
35 #define KMP_USE_PRCTL 0
36 
37 #if KMP_OS_WINDOWS
38 #include <process.h>
39 #endif
40 
41 #include "tsan_annotations.h"
42 
43 #if defined(KMP_GOMP_COMPAT)
44 char const __kmp_version_alt_comp[] = KMP_VERSION_PREFIX "alternative compiler support: yes";
45 #endif /* defined(KMP_GOMP_COMPAT) */
46 
47 char const __kmp_version_omp_api[] = KMP_VERSION_PREFIX "API version: "
48 #if OMP_45_ENABLED
49     "4.5 (201511)";
50 #elif OMP_40_ENABLED
51     "4.0 (201307)";
52 #else
53     "3.1 (201107)";
54 #endif
55 
56 #ifdef KMP_DEBUG
57 char const __kmp_version_lock[] = KMP_VERSION_PREFIX "lock type: run time selectable";
58 #endif /* KMP_DEBUG */
59 
60 #define KMP_MIN( x, y ) ( (x) < (y) ? (x) : (y) )
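/* Note: as a textual macro, KMP_MIN may evaluate the selected argument twice, so avoid
   passing expressions with side effects. */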
61 
62 /* ------------------------------------------------------------------------ */
63 /* ------------------------------------------------------------------------ */
64 
65 kmp_info_t __kmp_monitor;
66 
67 /* ------------------------------------------------------------------------ */
68 /* ------------------------------------------------------------------------ */
69 
70 /* Forward declarations */
71 
72 void __kmp_cleanup( void );
73 
74 static void __kmp_initialize_info( kmp_info_t *, kmp_team_t *, int tid, int gtid );
75 static void __kmp_initialize_team( kmp_team_t * team, int new_nproc, kmp_internal_control_t * new_icvs, ident_t * loc );
76 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
77 static void __kmp_partition_places( kmp_team_t *team, int update_master_only=0 );
78 #endif
79 static void __kmp_do_serial_initialize( void );
80 void __kmp_fork_barrier( int gtid, int tid );
81 void __kmp_join_barrier( int gtid );
82 void __kmp_setup_icv_copy( kmp_team_t *team, int new_nproc, kmp_internal_control_t * new_icvs, ident_t *loc );
83 
84 #ifdef USE_LOAD_BALANCE
85 static int __kmp_load_balance_nproc( kmp_root_t * root, int set_nproc );
86 #endif
87 
88 static int __kmp_expand_threads(int nWish, int nNeed);
89 #if KMP_OS_WINDOWS
90 static int __kmp_unregister_root_other_thread( int gtid );
91 #endif
92 static void __kmp_unregister_library( void ); // called by __kmp_internal_end()
93 static void __kmp_reap_thread( kmp_info_t * thread, int is_root );
94 static kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
95 
96 /* ------------------------------------------------------------------------ */
97 /* ------------------------------------------------------------------------ */
98 
99 /* Calculate the identifier of the current thread */
100 /* fast (and somewhat portable) way to get unique */
101 /* identifier of executing thread.                */
102 /* returns KMP_GTID_DNE if we haven't been assigned a gtid   */
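/* Lookup order, cheapest first: (1) the thread-local __kmp_gtid (TDATA)     */
/* when __kmp_gtid_mode >= 3, (2) keyed TLS via __kmp_gtid_get_specific()    */
/* when __kmp_gtid_mode >= 2, (3) a stack-address scan over __kmp_threads[]. */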
103 
104 int
105 __kmp_get_global_thread_id( )
106 {
107     int i;
108     kmp_info_t   **other_threads;
109     size_t         stack_data;
110     char          *stack_addr;
111     size_t         stack_size;
112     char          *stack_base;
113 
114     KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
115                       __kmp_nth, __kmp_all_nth ));
116 
    /* JPH - To handle the case where __kmpc_end(0) is called immediately prior to a
             parallel region, this returns KMP_GTID_DNE to force serial_initialize by the
             caller.  Every call site must therefore either handle KMP_GTID_DNE or
             guarantee that __kmp_init_gtid is set for this to work.  */
121 
122     if ( !TCR_4(__kmp_init_gtid) ) return KMP_GTID_DNE;
123 
124 #ifdef KMP_TDATA_GTID
125     if ( TCR_4(__kmp_gtid_mode) >= 3) {
126         KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: using TDATA\n" ));
127         return __kmp_gtid;
128     }
129 #endif
130     if ( TCR_4(__kmp_gtid_mode) >= 2) {
131         KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: using keyed TLS\n" ));
132         return __kmp_gtid_get_specific();
133     }
134     KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: using internal alg.\n" ));
135 
136     stack_addr    = (char*) & stack_data;
137     other_threads = __kmp_threads;
138 
139     /*
140         ATT: The code below is a source of potential bugs due to unsynchronized access to
141         __kmp_threads array. For example:
142             1. Current thread loads other_threads[i] to thr and checks it, it is non-NULL.
143             2. Current thread is suspended by OS.
144             3. Another thread unregisters and finishes (debug versions of free() may fill memory
145                with something like 0xEF).
146             4. Current thread is resumed.
147             5. Current thread reads junk from *thr.
148         TODO: Fix it.
149         --ln
150     */
151 
152     for( i = 0 ; i < __kmp_threads_capacity ; i++ ) {
153 
154         kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
155         if( !thr ) continue;
156 
157         stack_size =  (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
158         stack_base =  (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
159 
160         /* stack grows down -- search through all of the active threads */
161 
162         if( stack_addr <= stack_base ) {
163             size_t stack_diff = stack_base - stack_addr;
164 
165             if( stack_diff <= stack_size ) {
166                 /* The only way we can be closer than the allocated */
167                 /* stack size is if we are running on this thread. */
168                 KMP_DEBUG_ASSERT( __kmp_gtid_get_specific() == i );
169                 return i;
170             }
171         }
172     }
173 
    /* fall back to __kmp_gtid_get_specific() to try to determine our gtid */
175     KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: internal alg. failed to find "
176                       "thread, using TLS\n" ));
177     i = __kmp_gtid_get_specific();
178 
179     /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */
180 
    /* if we haven't been assigned a gtid, return the error code */
182     if( i<0 ) return i;
183 
184     /* dynamically updated stack window for uber threads to avoid get_specific call */
185     if( ! TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow) ) {
186         KMP_FATAL( StackOverflow, i );
187     }
188 
189     stack_base = (char *) other_threads[i]->th.th_info.ds.ds_stackbase;
190     if( stack_addr > stack_base ) {
191         TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
192         TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
193           other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr - stack_base);
194     } else {
195         TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize, stack_base - stack_addr);
196     }
197 
198     /* Reprint stack bounds for ubermaster since they have been refined */
199     if ( __kmp_storage_map ) {
200         char *stack_end = (char *) other_threads[i]->th.th_info.ds.ds_stackbase;
201         char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
202         __kmp_print_storage_map_gtid( i, stack_beg, stack_end,
203                                       other_threads[i]->th.th_info.ds.ds_stacksize,
204                                       "th_%d stack (refinement)", i );
205     }
206     return i;
207 }
208 
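/* Like __kmp_get_global_thread_id(), but a calling thread that does not yet have a
   gtid is registered as a new root (serial-initializing the library first if
   necessary), so a valid gtid is always returned. */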
209 int
210 __kmp_get_global_thread_id_reg( )
211 {
212     int gtid;
213 
214     if ( !__kmp_init_serial ) {
215         gtid = KMP_GTID_DNE;
216     } else
217 #ifdef KMP_TDATA_GTID
218     if ( TCR_4(__kmp_gtid_mode) >= 3 ) {
219         KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id_reg: using TDATA\n" ));
220         gtid = __kmp_gtid;
221     } else
222 #endif
223     if ( TCR_4(__kmp_gtid_mode) >= 2 ) {
224         KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id_reg: using keyed TLS\n" ));
225         gtid = __kmp_gtid_get_specific();
226     } else {
227         KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id_reg: using internal alg.\n" ));
228         gtid = __kmp_get_global_thread_id();
229     }
230 
231     /* we must be a new uber master sibling thread */
232     if( gtid == KMP_GTID_DNE ) {
233         KA_TRACE( 10, ( "__kmp_get_global_thread_id_reg: Encountered new root thread. "
234                         "Registering a new gtid.\n" ));
235         __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
236         if( !__kmp_init_serial ) {
237             __kmp_do_serial_initialize();
238             gtid = __kmp_gtid_get_specific();
239         } else {
240             gtid = __kmp_register_root(FALSE);
241         }
242         __kmp_release_bootstrap_lock( &__kmp_initz_lock );
243         /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
244     }
245 
246     KMP_DEBUG_ASSERT( gtid >=0 );
247 
248     return gtid;
249 }
250 
251 /* caller must hold forkjoin_lock */
252 void
253 __kmp_check_stack_overlap( kmp_info_t *th )
254 {
255     int f;
256     char *stack_beg = NULL;
257     char *stack_end = NULL;
258     int gtid;
259 
260     KA_TRACE(10,("__kmp_check_stack_overlap: called\n"));
261     if ( __kmp_storage_map ) {
262         stack_end = (char *) th->th.th_info.ds.ds_stackbase;
263         stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
264 
265         gtid = __kmp_gtid_from_thread( th );
266 
267         if (gtid == KMP_GTID_MONITOR) {
268             __kmp_print_storage_map_gtid( gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
269                                      "th_%s stack (%s)", "mon",
270                                      ( th->th.th_info.ds.ds_stackgrow ) ? "initial" : "actual" );
271         } else {
272             __kmp_print_storage_map_gtid( gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
273                                      "th_%d stack (%s)", gtid,
274                                      ( th->th.th_info.ds.ds_stackgrow ) ? "initial" : "actual" );
275         }
276     }
277 
278     /* No point in checking ubermaster threads since they use refinement and cannot overlap */
279     gtid = __kmp_gtid_from_thread( th );
280     if ( __kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid))
281     {
282         KA_TRACE(10,("__kmp_check_stack_overlap: performing extensive checking\n"));
283         if ( stack_beg == NULL ) {
284             stack_end = (char *) th->th.th_info.ds.ds_stackbase;
285             stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
286         }
287 
288         for( f=0 ; f < __kmp_threads_capacity ; f++ ) {
289             kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
290 
291             if( f_th && f_th != th ) {
292                 char *other_stack_end = (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
293                 char *other_stack_beg = other_stack_end -
294                                         (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
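                /* Overlap means either end of this thread's stack falls strictly inside
                   the other thread's recorded stack region. */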
295                 if((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
296                    (stack_end > other_stack_beg && stack_end < other_stack_end)) {
297 
298                     /* Print the other stack values before the abort */
299                     if ( __kmp_storage_map )
300                         __kmp_print_storage_map_gtid( -1, other_stack_beg, other_stack_end,
301                             (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
302                             "th_%d stack (overlapped)",
303                                                  __kmp_gtid_from_thread( f_th ) );
304 
305                     __kmp_msg( kmp_ms_fatal, KMP_MSG( StackOverlap ), KMP_HNT( ChangeStackLimit ), __kmp_msg_null );
306                 }
307             }
308         }
309     }
310     KA_TRACE(10,("__kmp_check_stack_overlap: returning\n"));
311 }
312 
313 
314 /* ------------------------------------------------------------------------ */
315 
316 /* ------------------------------------------------------------------------ */
317 
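/* Park the calling thread in a yield loop.  `done` is a function-local static that no
   code ever sets, so the loop only exits if something external (e.g. a debugger) flips
   it in memory. */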
318 void
319 __kmp_infinite_loop( void )
320 {
321     static int done = FALSE;
322 
323     while (! done) {
324         KMP_YIELD( 1 );
325     }
326 }
327 
328 #define MAX_MESSAGE     512
329 
330 void
331 __kmp_print_storage_map_gtid( int gtid, void *p1, void *p2, size_t size, char const *format, ...) {
332     char buffer[MAX_MESSAGE];
333     va_list ap;
334 
335     va_start( ap, format);
336     KMP_SNPRINTF( buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1, p2, (unsigned long) size, format );
337     __kmp_acquire_bootstrap_lock( & __kmp_stdio_lock );
338     __kmp_vprintf( kmp_err, buffer, ap );
339 #if KMP_PRINT_DATA_PLACEMENT
340     int node;
341     if(gtid >= 0) {
342         if(p1 <= p2 && (char*)p2 - (char*)p1 == size) {
343             if( __kmp_storage_map_verbose ) {
344                 node = __kmp_get_host_node(p1);
345                 if(node < 0)  /* doesn't work, so don't try this next time */
346                     __kmp_storage_map_verbose = FALSE;
347                 else {
348                     char *last;
349                     int lastNode;
350                     int localProc = __kmp_get_cpu_from_gtid(gtid);
351 
352                     const int page_size = KMP_GET_PAGE_SIZE();
353 
354                     p1 = (void *)( (size_t)p1 & ~((size_t)page_size - 1) );
355                     p2 = (void *)( ((size_t) p2 - 1) & ~((size_t)page_size - 1) );
356                     if(localProc >= 0)
357                         __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid, localProc>>1);
358                     else
359                         __kmp_printf_no_lock("  GTID %d\n", gtid);
360 # if KMP_USE_PRCTL
361 /* The more elaborate format is disabled for now because of the prctl hanging bug. */
362                     do {
363                         last = p1;
364                         lastNode = node;
365                         /* This loop collates adjacent pages with the same host node. */
366                         do {
                            p1 = (char*)p1 + page_size;
368                         } while(p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
369                         __kmp_printf_no_lock("    %p-%p memNode %d\n", last,
370                                              (char*)p1 - 1, lastNode);
371                     } while(p1 <= p2);
372 # else
373                     __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
374                                          (char*)p1 + (page_size - 1), __kmp_get_host_node(p1));
375                     if(p1 < p2)  {
376                         __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
377                                              (char*)p2 + (page_size - 1), __kmp_get_host_node(p2));
378                     }
379 # endif
380                 }
381             }
382         } else
383             __kmp_printf_no_lock("  %s\n", KMP_I18N_STR( StorageMapWarning ) );
384     }
385 #endif /* KMP_PRINT_DATA_PLACEMENT */
    __kmp_release_bootstrap_lock( & __kmp_stdio_lock );
    va_end( ap );
}
388 
389 void
390 __kmp_warn( char const * format, ... )
391 {
392     char buffer[MAX_MESSAGE];
393     va_list ap;
394 
395     if ( __kmp_generate_warnings == kmp_warnings_off ) {
396         return;
397     }
398 
399     va_start( ap, format );
400 
401     KMP_SNPRINTF( buffer, sizeof(buffer) , "OMP warning: %s\n", format );
402     __kmp_acquire_bootstrap_lock( & __kmp_stdio_lock );
403     __kmp_vprintf( kmp_err, buffer, ap );
404     __kmp_release_bootstrap_lock( & __kmp_stdio_lock );
405 
406     va_end( ap );
407 }
408 
409 void
410 __kmp_abort_process()
411 {
412 
413     // Later threads may stall here, but that's ok because abort() will kill them.
414     __kmp_acquire_bootstrap_lock( & __kmp_exit_lock );
415 
416     if ( __kmp_debug_buf ) {
417         __kmp_dump_debug_buffer();
418     }; // if
419 
420     if ( KMP_OS_WINDOWS ) {
421         // Let other threads know of abnormal termination and prevent deadlock
422         // if abort happened during library initialization or shutdown
423         __kmp_global.g.g_abort = SIGABRT;
424 
        /*
            On Windows* OS, abort() causes a pop-up error box by default, which stalls
            nightly testing.  Unfortunately, we cannot reliably suppress pop-up error
            boxes.  _set_abort_behavior() works well, but this function is not available
            in VS7 (this is not a problem for the DLL, but it is a problem for the static
            OpenMP RTL).  SetErrorMode (and so the timelimit utility) does not help, at
            least in some versions of the MS C RTL.

            The following sequence seems to be the only way to simulate abort() and avoid
            the pop-up error box.
        */
435         raise( SIGABRT );
436         _exit( 3 );    // Just in case, if signal ignored, exit anyway.
437     } else {
438         abort();
439     }; // if
440 
441     __kmp_infinite_loop();
442     __kmp_release_bootstrap_lock( & __kmp_exit_lock );
443 
444 } // __kmp_abort_process
445 
446 void
447 __kmp_abort_thread( void )
448 {
449     // TODO: Eliminate g_abort global variable and this function.
450     // In case of abort just call abort(), it will kill all the threads.
451     __kmp_infinite_loop();
452 } // __kmp_abort_thread
453 
454 /* ------------------------------------------------------------------------ */
455 
456 /*
457  * Print out the storage map for the major kmp_info_t thread data structures
458  * that are allocated together.
459  */
460 
461 static void
462 __kmp_print_thread_storage_map( kmp_info_t *thr, int gtid )
463 {
464     __kmp_print_storage_map_gtid( gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d", gtid );
465 
466     __kmp_print_storage_map_gtid( gtid, &thr->th.th_info, &thr->th.th_team, sizeof(kmp_desc_t),
467                              "th_%d.th_info", gtid );
468 
469     __kmp_print_storage_map_gtid( gtid, &thr->th.th_local, &thr->th.th_pri_head, sizeof(kmp_local_t),
470                              "th_%d.th_local", gtid );
471 
472     __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
473                              sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid );
474 
475     __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[bs_plain_barrier],
476                              &thr->th.th_bar[bs_plain_barrier+1],
477                              sizeof(kmp_balign_t), "th_%d.th_bar[plain]", gtid);
478 
479     __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[bs_forkjoin_barrier],
480                              &thr->th.th_bar[bs_forkjoin_barrier+1],
481                              sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]", gtid);
482 
483     #if KMP_FAST_REDUCTION_BARRIER
484         __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[bs_reduction_barrier],
485                              &thr->th.th_bar[bs_reduction_barrier+1],
486                              sizeof(kmp_balign_t), "th_%d.th_bar[reduction]", gtid);
487     #endif // KMP_FAST_REDUCTION_BARRIER
488 }
489 
490 /*
491  * Print out the storage map for the major kmp_team_t team data structures
492  * that are allocated together.
493  */
494 
495 static void
496 __kmp_print_team_storage_map( const char *header, kmp_team_t *team, int team_id, int num_thr )
497 {
498     int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
499     __kmp_print_storage_map_gtid( -1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
500                              header, team_id );
501 
502     __kmp_print_storage_map_gtid( -1, &team->t.t_bar[0], &team->t.t_bar[bs_last_barrier],
503                              sizeof(kmp_balign_team_t) * bs_last_barrier, "%s_%d.t_bar", header, team_id );
504 
505 
506     __kmp_print_storage_map_gtid( -1, &team->t.t_bar[bs_plain_barrier], &team->t.t_bar[bs_plain_barrier+1],
507                              sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]", header, team_id );
508 
509     __kmp_print_storage_map_gtid( -1, &team->t.t_bar[bs_forkjoin_barrier], &team->t.t_bar[bs_forkjoin_barrier+1],
510                              sizeof(kmp_balign_team_t), "%s_%d.t_bar[forkjoin]", header, team_id );
511 
512     #if KMP_FAST_REDUCTION_BARRIER
513         __kmp_print_storage_map_gtid( -1, &team->t.t_bar[bs_reduction_barrier], &team->t.t_bar[bs_reduction_barrier+1],
514                              sizeof(kmp_balign_team_t), "%s_%d.t_bar[reduction]", header, team_id );
515     #endif // KMP_FAST_REDUCTION_BARRIER
516 
517     __kmp_print_storage_map_gtid( -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
518                              sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id );
519 
520     __kmp_print_storage_map_gtid( -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
521                              sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id );
522 
523     __kmp_print_storage_map_gtid( -1, &team->t.t_disp_buffer[0], &team->t.t_disp_buffer[num_disp_buff],
524                              sizeof(dispatch_shared_info_t) * num_disp_buff, "%s_%d.t_disp_buffer",
525                              header, team_id );
526 
527 
528     __kmp_print_storage_map_gtid( -1, &team->t.t_taskq, &team->t.t_copypriv_data,
529                              sizeof(kmp_taskq_t), "%s_%d.t_taskq", header, team_id );
530 }
531 
532 static void __kmp_init_allocator() {}
533 static void __kmp_fini_allocator() {}
534 
535 /* ------------------------------------------------------------------------ */
536 
537 #ifdef KMP_DYNAMIC_LIB
538 # if KMP_OS_WINDOWS
539 
540 static void
541 __kmp_reset_lock( kmp_bootstrap_lock_t* lck ) {
542     // TODO: Change to __kmp_break_bootstrap_lock().
543     __kmp_init_bootstrap_lock( lck ); // make the lock released
544 }
545 
546 static void
547 __kmp_reset_locks_on_process_detach( int gtid_req ) {
548     int i;
549     int thread_count;
550 
    // PROCESS_DETACH is expected to be called by a thread
    // that executes ProcessExit() or FreeLibrary().
    // The OS terminates all other threads (except the one calling ProcessExit or FreeLibrary),
    // so it might seem safe to access __kmp_threads[] without taking the forkjoin_lock.
    // In fact, however, some threads may still be alive here, although they are about to
    // be terminated.  The entries with ds_thread==0 are the most suspicious.
    // So it may not actually be safe to access __kmp_threads[].
558 
559     // TODO: does it make sense to check __kmp_roots[] ?
560 
561     // Let's check that there are no other alive threads registered with the OMP lib.
562     while( 1 ) {
563         thread_count = 0;
564         for( i = 0; i < __kmp_threads_capacity; ++i ) {
565             if( !__kmp_threads ) continue;
566             kmp_info_t* th = __kmp_threads[ i ];
567             if( th == NULL ) continue;
568             int gtid = th->th.th_info.ds.ds_gtid;
569             if( gtid == gtid_req ) continue;
570             if( gtid < 0 ) continue;
571             DWORD exit_val;
572             int alive = __kmp_is_thread_alive( th, &exit_val );
573             if( alive ) {
                ++thread_count;
575             }
576         }
577         if( thread_count == 0 ) break; // success
578     }
579 
    // Assume that we are now the only thread left.

    // It should now be reasonably safe to check and reset the locks.
583     // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
584     __kmp_reset_lock( &__kmp_forkjoin_lock );
585     #ifdef KMP_DEBUG
586     __kmp_reset_lock( &__kmp_stdio_lock );
587     #endif // KMP_DEBUG
588 }
589 
590 BOOL WINAPI
591 DllMain( HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved ) {
592     //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
593 
594     switch( fdwReason ) {
595 
596         case DLL_PROCESS_ATTACH:
597             KA_TRACE( 10, ("DllMain: PROCESS_ATTACH\n" ));
598 
599             return TRUE;
600 
601         case DLL_PROCESS_DETACH:
602             KA_TRACE( 10, ("DllMain: PROCESS_DETACH T#%d\n",
603                         __kmp_gtid_get_specific() ));
604 
605             if( lpReserved != NULL )
606             {
607                 // lpReserved is used for telling the difference:
608                 //  lpReserved == NULL when FreeLibrary() was called,
609                 //  lpReserved != NULL when the process terminates.
610                 // When FreeLibrary() is called, worker threads remain alive.
611                 // So they will release the forkjoin lock by themselves.
                // When the process terminates, worker threads disappear, triggering
                // the problem of an unreleased forkjoin lock as described below.
614 
615                 // A worker thread can take the forkjoin lock.
616                 // The problem comes up if that worker thread becomes dead
617                 // before it releases the forkjoin lock.
618                 // The forkjoin lock remains taken, while the thread
619                 // executing DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below
620                 // will try to take the forkjoin lock and will always fail,
621                 // so that the application will never finish [normally].
622                 // This scenario is possible if __kmpc_end() has not been executed.
                // This is not just a corner case; it arises in common situations:
                // - the main function was compiled by an alternative compiler;
                // - the main function was compiled by icl but without /Qopenmp (application with plugins);
                // - the application terminates by calling C exit(), Fortran CALL EXIT() or Fortran STOP;
                // - an alive foreign thread prevented __kmpc_end from doing cleanup.
628 
629                 // This is a hack to work around the problem.
630                 // TODO: !!! to figure out something better.
631                 __kmp_reset_locks_on_process_detach( __kmp_gtid_get_specific() );
632             }
633 
634             __kmp_internal_end_library( __kmp_gtid_get_specific() );
635 
636             return TRUE;
637 
638         case DLL_THREAD_ATTACH:
639             KA_TRACE( 10, ("DllMain: THREAD_ATTACH\n" ));
640 
641             /* if we wanted to register new siblings all the time here call
642              * __kmp_get_gtid(); */
643             return TRUE;
644 
645         case DLL_THREAD_DETACH:
646             KA_TRACE( 10, ("DllMain: THREAD_DETACH T#%d\n",
647                         __kmp_gtid_get_specific() ));
648 
649             __kmp_internal_end_thread( __kmp_gtid_get_specific() );
650             return TRUE;
651     }
652 
653     return TRUE;
654 }
655 
656 # endif /* KMP_OS_WINDOWS */
657 #endif /* KMP_DYNAMIC_LIB */
658 
659 
660 /* ------------------------------------------------------------------------ */
661 
662 /* Change the library type to "status" and return the old type */
663 /* called from within initialization routines where __kmp_initz_lock is held */
664 int
665 __kmp_change_library( int status )
666 {
667     int old_status;
668 
669     old_status = __kmp_yield_init & 1;  // check whether KMP_LIBRARY=throughput (even init count)
670 
671     if (status) {
672         __kmp_yield_init |= 1;  // throughput => turnaround (odd init count)
673     }
674     else {
675         __kmp_yield_init &= ~1; // turnaround => throughput (even init count)
676     }
677 
678     return old_status;  // return previous setting of whether KMP_LIBRARY=throughput
679 }
680 
681 /* ------------------------------------------------------------------------ */
682 /* ------------------------------------------------------------------------ */
683 
684 /* __kmp_parallel_deo --
685  * Wait until it's our turn.
686  */
687 void
688 __kmp_parallel_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
689 {
690     int gtid = *gtid_ref;
691 #ifdef BUILD_PARALLEL_ORDERED
692     kmp_team_t *team = __kmp_team_from_gtid( gtid );
693 #endif /* BUILD_PARALLEL_ORDERED */
694 
695     if( __kmp_env_consistency_check ) {
696         if( __kmp_threads[gtid]->th.th_root->r.r_active )
697 #if KMP_USE_DYNAMIC_LOCK
698             __kmp_push_sync( gtid, ct_ordered_in_parallel, loc_ref, NULL, 0 );
699 #else
700             __kmp_push_sync( gtid, ct_ordered_in_parallel, loc_ref, NULL );
701 #endif
702     }
703 #ifdef BUILD_PARALLEL_ORDERED
704     if( !team->t.t_serialized ) {
705         KMP_MB();
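        /* Spin (yielding) until the team's ordered ticket t_value reaches our tid;
           __kmp_parallel_dxo() advances it to the next tid when the owner leaves. */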
706         KMP_WAIT_YIELD(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid( gtid ), KMP_EQ, NULL);
707         KMP_MB();
708     }
709 #endif /* BUILD_PARALLEL_ORDERED */
710 }
711 
712 /* __kmp_parallel_dxo --
713  * Signal the next task.
714  */
715 
716 void
717 __kmp_parallel_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
718 {
719     int gtid = *gtid_ref;
720 #ifdef BUILD_PARALLEL_ORDERED
721     int tid =  __kmp_tid_from_gtid( gtid );
722     kmp_team_t *team = __kmp_team_from_gtid( gtid );
723 #endif /* BUILD_PARALLEL_ORDERED */
724 
725     if( __kmp_env_consistency_check ) {
726         if( __kmp_threads[gtid]->th.th_root->r.r_active )
727             __kmp_pop_sync( gtid, ct_ordered_in_parallel, loc_ref );
728     }
729 #ifdef BUILD_PARALLEL_ORDERED
730     if ( ! team->t.t_serialized ) {
731         KMP_MB();       /* Flush all pending memory write invalidates.  */
732 
733         /* use the tid of the next thread in this team */
        /* TODO: replace with a general release procedure */
735         team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc );
736 
737 #if OMPT_SUPPORT && OMPT_BLAME
738         if (ompt_enabled &&
739             ompt_callbacks.ompt_callback(ompt_event_release_ordered)) {
740             /* accept blame for "ordered" waiting */
741             kmp_info_t *this_thread = __kmp_threads[gtid];
742             ompt_callbacks.ompt_callback(ompt_event_release_ordered)(
743                 this_thread->th.ompt_thread_info.wait_id);
744         }
745 #endif
746 
747         KMP_MB();       /* Flush all pending memory write invalidates.  */
748     }
749 #endif /* BUILD_PARALLEL_ORDERED */
750 }
751 
752 /* ------------------------------------------------------------------------ */
753 /* ------------------------------------------------------------------------ */
754 
755 /* ------------------------------------------------------------------------ */
756 /* ------------------------------------------------------------------------ */
757 
758 /* The BARRIER for a SINGLE process section is always explicit   */
759 
760 int
761 __kmp_enter_single( int gtid, ident_t *id_ref, int push_ws )
762 {
763     int status;
764     kmp_info_t *th;
765     kmp_team_t *team;
766 
767     if( ! TCR_4(__kmp_init_parallel) )
768         __kmp_parallel_initialize();
769 
770     th   = __kmp_threads[ gtid ];
771     team = th->th.th_team;
772     status = 0;
773 
774     th->th.th_ident = id_ref;
775 
776     if ( team->t.t_serialized ) {
777         status = 1;
778     } else {
779         kmp_int32 old_this = th->th.th_local.this_construct;
780 
781         ++th->th.th_local.this_construct;
782         /* try to set team count to thread count--success means thread got the
783            single block
784         */
785         /* TODO: Should this be acquire or release? */
786         if (team->t.t_construct == old_this) {
787             status = KMP_COMPARE_AND_STORE_ACQ32(&team->t.t_construct, old_this,
788                                                  th->th.th_local.this_construct);
789         }
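        /* At this point only the thread whose compare-and-store advanced t_construct has
           status != 0 and owns the single region; the rest of the team sees status == 0. */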
790 #if USE_ITT_BUILD
791         if ( __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && KMP_MASTER_GTID(gtid) &&
792 #if OMP_40_ENABLED
793             th->th.th_teams_microtask == NULL &&
794 #endif
795             team->t.t_active_level == 1 )
796         {   // Only report metadata by master of active team at level 1
797             __kmp_itt_metadata_single( id_ref );
798         }
799 #endif /* USE_ITT_BUILD */
800     }
801 
802     if( __kmp_env_consistency_check ) {
803         if (status && push_ws) {
804             __kmp_push_workshare( gtid, ct_psingle, id_ref );
805         } else {
806             __kmp_check_workshare( gtid, ct_psingle, id_ref );
807         }
808     }
809 #if USE_ITT_BUILD
810     if ( status ) {
811         __kmp_itt_single_start( gtid );
812     }
813 #endif /* USE_ITT_BUILD */
814     return status;
815 }
816 
817 void
818 __kmp_exit_single( int gtid )
819 {
820 #if USE_ITT_BUILD
821     __kmp_itt_single_end( gtid );
822 #endif /* USE_ITT_BUILD */
823     if( __kmp_env_consistency_check )
824         __kmp_pop_workshare( gtid, ct_psingle, NULL );
825 }
826 
827 
/*
 * Determine whether we can go parallel or must use a serialized parallel region, and
 * how many threads we can use.
 * set_nthreads is the number of threads requested for the team.
 * Returns 1 if we should serialize or only use one thread,
 * otherwise the number of threads to use.
 * The forkjoin lock is held by the caller.
 */
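/*
 * The adjustment below proceeds in three steps: (1) if dyn-var is set, shrink the
 * request according to the active dynamic mode (load balance, thread limit, or random);
 * (2) clamp against KMP_ALL_THREADS / OMP_THREAD_LIMIT; (3) grow __kmp_threads[] if
 * needed, shrinking the request further if the expansion falls short.
 */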
836 static int
837 __kmp_reserve_threads( kmp_root_t *root, kmp_team_t *parent_team,
838    int master_tid, int set_nthreads
839 #if OMP_40_ENABLED
840   , int enter_teams
841 #endif /* OMP_40_ENABLED */
842 )
843 {
844     int capacity;
845     int new_nthreads;
846     KMP_DEBUG_ASSERT( __kmp_init_serial );
847     KMP_DEBUG_ASSERT( root && parent_team );
848 
849     //
850     // If dyn-var is set, dynamically adjust the number of desired threads,
851     // according to the method specified by dynamic_mode.
852     //
853     new_nthreads = set_nthreads;
854     if ( ! get__dynamic_2( parent_team, master_tid ) ) {
855         ;
856     }
857 #ifdef USE_LOAD_BALANCE
858     else if ( __kmp_global.g.g_dynamic_mode == dynamic_load_balance ) {
859         new_nthreads = __kmp_load_balance_nproc( root, set_nthreads );
860         if ( new_nthreads == 1 ) {
861             KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d load balance reduced reservation to 1 thread\n",
862               master_tid ));
863             return 1;
864         }
865         if ( new_nthreads < set_nthreads ) {
866             KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d load balance reduced reservation to %d threads\n",
867               master_tid, new_nthreads ));
868         }
869     }
870 #endif /* USE_LOAD_BALANCE */
871     else if ( __kmp_global.g.g_dynamic_mode == dynamic_thread_limit ) {
872         new_nthreads = __kmp_avail_proc - __kmp_nth + (root->r.r_active ? 1
873           : root->r.r_hot_team->t.t_nproc);
874         if ( new_nthreads <= 1 ) {
875             KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d thread limit reduced reservation to 1 thread\n",
876               master_tid ));
877             return 1;
878         }
879         if ( new_nthreads < set_nthreads ) {
880             KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d thread limit reduced reservation to %d threads\n",
881               master_tid, new_nthreads ));
882         }
883         else {
884             new_nthreads = set_nthreads;
885         }
886     }
887     else if ( __kmp_global.g.g_dynamic_mode == dynamic_random ) {
888         if ( set_nthreads > 2 ) {
889             new_nthreads = __kmp_get_random( parent_team->t.t_threads[master_tid] );
890             new_nthreads = ( new_nthreads % set_nthreads ) + 1;
891             if ( new_nthreads == 1 ) {
892                 KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d dynamic random reduced reservation to 1 thread\n",
893                   master_tid ));
894                 return 1;
895             }
896             if ( new_nthreads < set_nthreads ) {
897                 KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d dynamic random reduced reservation to %d threads\n",
898                   master_tid, new_nthreads ));
899             }
900         }
901     }
902     else {
903         KMP_ASSERT( 0 );
904     }
905 
906     //
907     // Respect KMP_ALL_THREADS, KMP_MAX_THREADS, OMP_THREAD_LIMIT.
908     //
909     if ( __kmp_nth + new_nthreads - ( root->r.r_active ? 1 :
910       root->r.r_hot_team->t.t_nproc ) > __kmp_max_nth ) {
911         int tl_nthreads = __kmp_max_nth - __kmp_nth + ( root->r.r_active ? 1 :
912           root->r.r_hot_team->t.t_nproc );
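        // tl_nthreads is the largest team we can form without exceeding __kmp_max_nth,
        // crediting back the threads the new team will reuse (just the master if the
        // root is active, otherwise the whole hot team).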
913         if ( tl_nthreads <= 0 ) {
914             tl_nthreads = 1;
915         }
916 
917         //
918         // If dyn-var is false, emit a 1-time warning.
919         //
920         if ( ! get__dynamic_2( parent_team, master_tid )
921           && ( ! __kmp_reserve_warn ) ) {
922             __kmp_reserve_warn = 1;
923             __kmp_msg(
924                 kmp_ms_warning,
925                 KMP_MSG( CantFormThrTeam, set_nthreads, tl_nthreads ),
926                 KMP_HNT( Unset_ALL_THREADS ),
927                 __kmp_msg_null
928             );
929         }
930         if ( tl_nthreads == 1 ) {
931             KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d KMP_ALL_THREADS reduced reservation to 1 thread\n",
932               master_tid ));
933             return 1;
934         }
935         KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d KMP_ALL_THREADS reduced reservation to %d threads\n",
936           master_tid, tl_nthreads ));
937         new_nthreads = tl_nthreads;
938     }
939 
940     //
941     // Check if the threads array is large enough, or needs expanding.
942     //
943     // See comment in __kmp_register_root() about the adjustment if
944     // __kmp_threads[0] == NULL.
945     //
946     capacity = __kmp_threads_capacity;
947     if ( TCR_PTR(__kmp_threads[0]) == NULL ) {
948         --capacity;
949     }
950     if ( __kmp_nth + new_nthreads - ( root->r.r_active ? 1 :
951       root->r.r_hot_team->t.t_nproc ) > capacity ) {
952         //
953         // Expand the threads array.
954         //
955         int slotsRequired = __kmp_nth + new_nthreads - ( root->r.r_active ? 1 :
956           root->r.r_hot_team->t.t_nproc ) - capacity;
957         int slotsAdded = __kmp_expand_threads(slotsRequired, slotsRequired);
958         if ( slotsAdded < slotsRequired ) {
959             //
960             // The threads array was not expanded enough.
961             //
962             new_nthreads -= ( slotsRequired - slotsAdded );
963             KMP_ASSERT( new_nthreads >= 1 );
964 
965             //
966             // If dyn-var is false, emit a 1-time warning.
967             //
968             if ( ! get__dynamic_2( parent_team, master_tid )
969               && ( ! __kmp_reserve_warn ) ) {
970                 __kmp_reserve_warn = 1;
971                 if ( __kmp_tp_cached ) {
972                     __kmp_msg(
973                         kmp_ms_warning,
974                         KMP_MSG( CantFormThrTeam, set_nthreads, new_nthreads ),
975                         KMP_HNT( Set_ALL_THREADPRIVATE, __kmp_tp_capacity ),
976                         KMP_HNT( PossibleSystemLimitOnThreads ),
977                         __kmp_msg_null
978                     );
979                 }
980                 else {
981                     __kmp_msg(
982                         kmp_ms_warning,
983                         KMP_MSG( CantFormThrTeam, set_nthreads, new_nthreads ),
984                         KMP_HNT( SystemLimitOnThreads ),
985                         __kmp_msg_null
986                     );
987                 }
988             }
989         }
990     }
991 
992     if ( new_nthreads == 1 ) {
993         KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d serializing team after reclaiming dead roots and rechecking; requested %d threads\n",
994                         __kmp_get_gtid(), set_nthreads ) );
995         return 1;
996     }
997 
998     KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d allocating %d threads; requested %d threads\n",
999                     __kmp_get_gtid(), new_nthreads, set_nthreads ));
1000     return new_nthreads;
1001 }
1002 
1003 /* ------------------------------------------------------------------------ */
1004 /* ------------------------------------------------------------------------ */
1005 
1006 /* allocate threads from the thread pool and assign them to the new team */
/* we are assured that there are enough threads available, because we
 * checked on that earlier within the forkjoin critical section */
1009 
1010 static void
1011 __kmp_fork_team_threads( kmp_root_t *root, kmp_team_t *team,
1012                          kmp_info_t *master_th, int master_gtid )
1013 {
1014     int         i;
1015     int use_hot_team;
1016 
1017     KA_TRACE( 10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc ) );
1018     KMP_DEBUG_ASSERT( master_gtid == __kmp_get_gtid() );
1019     KMP_MB();
1020 
1021     /* first, let's setup the master thread */
1022     master_th->th.th_info.ds.ds_tid  = 0;
1023     master_th->th.th_team            = team;
1024     master_th->th.th_team_nproc      = team->t.t_nproc;
1025     master_th->th.th_team_master     = master_th;
1026     master_th->th.th_team_serialized = FALSE;
1027     master_th->th.th_dispatch        = & team->t.t_dispatch[ 0 ];
1028 
1029     /* make sure we are not the optimized hot team */
1030 #if KMP_NESTED_HOT_TEAMS
1031     use_hot_team = 0;
1032     kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
1033     if( hot_teams ) {  // hot teams array is not allocated if KMP_HOT_TEAMS_MAX_LEVEL=0
1034         int level = team->t.t_active_level - 1;    // index in array of hot teams
1035         if( master_th->th.th_teams_microtask ) {    // are we inside the teams?
1036             if( master_th->th.th_teams_size.nteams > 1 ) {
1037                 ++level; // level was not increased in teams construct for team_of_masters
1038             }
1039             if( team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1040                 master_th->th.th_teams_level == team->t.t_level ) {
1041                 ++level; // level was not increased in teams construct for team_of_workers before the parallel
1042             }            // team->t.t_level will be increased inside parallel
1043         }
1044         if( level < __kmp_hot_teams_max_level ) {
1045             if( hot_teams[level].hot_team ) {
1046                 // hot team has already been allocated for given level
1047                 KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
1048                 use_hot_team = 1; // the team is ready to use
1049             } else {
1050                 use_hot_team = 0; // AC: threads are not allocated yet
1051                 hot_teams[level].hot_team = team; // remember new hot team
1052                 hot_teams[level].hot_team_nth = team->t.t_nproc;
1053             }
1054         } else {
1055             use_hot_team = 0;
1056         }
1057     }
1058 #else
1059     use_hot_team = team == root->r.r_hot_team;
1060 #endif
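    /* A hot team keeps its workers between parallel regions, so when use_hot_team is set
       the thread allocation and per-thread initialization below can be skipped entirely. */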
1061     if ( !use_hot_team ) {
1062 
1063         /* install the master thread */
1064         team->t.t_threads[ 0 ]    = master_th;
1065         __kmp_initialize_info( master_th, team, 0, master_gtid );
1066 
1067         /* now, install the worker threads */
1068         for ( i=1 ;  i < team->t.t_nproc ; i++ ) {
1069 
1070             /* fork or reallocate a new thread and install it in team */
1071             kmp_info_t *thr = __kmp_allocate_thread( root, team, i );
1072             team->t.t_threads[ i ] = thr;
1073             KMP_DEBUG_ASSERT( thr );
1074             KMP_DEBUG_ASSERT( thr->th.th_team == team );
1075             /* align team and thread arrived states */
1076             KA_TRACE( 20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived T#%d(%d:%d) join =%llu, plain=%llu\n",
1077                             __kmp_gtid_from_tid( 0, team ), team->t.t_id, 0,
1078                             __kmp_gtid_from_tid( i, team ), team->t.t_id, i,
1079                             team->t.t_bar[ bs_forkjoin_barrier ].b_arrived,
1080                             team->t.t_bar[ bs_plain_barrier ].b_arrived ) );
1081 #if OMP_40_ENABLED
1082             thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1083             thr->th.th_teams_level     = master_th->th.th_teams_level;
1084             thr->th.th_teams_size      = master_th->th.th_teams_size;
1085 #endif
1086             { // Initialize threads' barrier data.
1087                 int b;
1088                 kmp_balign_t * balign = team->t.t_threads[ i ]->th.th_bar;
1089                 for ( b = 0; b < bs_last_barrier; ++ b ) {
1090                     balign[ b ].bb.b_arrived        = team->t.t_bar[ b ].b_arrived;
1091                     KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1092 #if USE_DEBUGGER
1093                     balign[ b ].bb.b_worker_arrived = team->t.t_bar[ b ].b_team_arrived;
1094 #endif
1095                 }; // for b
1096             }
1097         }
1098 
1099 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1100         __kmp_partition_places( team );
1101 #endif
1102 
1103     }
1104 
1105     KMP_MB();
1106 }
1107 
1108 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1109 //
// Propagate any changes to the floating point control registers out to the team.
1111 // We try to avoid unnecessary writes to the relevant cache line in the team structure,
1112 // so we don't make changes unless they are needed.
1113 //
1114 inline static void
1115 propagateFPControl(kmp_team_t * team)
1116 {
1117     if ( __kmp_inherit_fp_control ) {
1118         kmp_int16 x87_fpu_control_word;
1119         kmp_uint32 mxcsr;
1120 
1121         // Get master values of FPU control flags (both X87 and vector)
1122         __kmp_store_x87_fpu_control_word( &x87_fpu_control_word );
1123         __kmp_store_mxcsr( &mxcsr );
1124         mxcsr &= KMP_X86_MXCSR_MASK;
1125 
1126         // There is no point looking at t_fp_control_saved here.
1127         // If it is TRUE, we still have to update the values if they are different from those we now have.
1128         // If it is FALSE we didn't save anything yet, but our objective is the same. We have to ensure
1129         // that the values in the team are the same as those we have.
1130         // So, this code achieves what we need whether or not t_fp_control_saved is true.
1131         // By checking whether the value needs updating we avoid unnecessary writes that would put the
1132         // cache-line into a written state, causing all threads in the team to have to read it again.
1133         KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1134         KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1135         // Although we don't use this value, other code in the runtime wants to know whether it should restore them.
1136         // So we must ensure it is correct.
1137         KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1138     }
1139     else {
1140         // Similarly here. Don't write to this cache-line in the team structure unless we have to.
1141         KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1142     }
1143 }
1144 
1145 // Do the opposite, setting the hardware registers to the updated values from the team.
1146 inline static void
1147 updateHWFPControl(kmp_team_t * team)
1148 {
1149     if ( __kmp_inherit_fp_control && team->t.t_fp_control_saved ) {
1150         //
        // Only reset the fp control regs if they have been changed in the team
        // during the parallel region that we are exiting.
1153         //
1154         kmp_int16 x87_fpu_control_word;
1155         kmp_uint32 mxcsr;
1156         __kmp_store_x87_fpu_control_word( &x87_fpu_control_word );
1157         __kmp_store_mxcsr( &mxcsr );
1158         mxcsr &= KMP_X86_MXCSR_MASK;
1159 
1160         if ( team->t.t_x87_fpu_control_word != x87_fpu_control_word ) {
1161             __kmp_clear_x87_fpu_status_word();
1162             __kmp_load_x87_fpu_control_word( &team->t.t_x87_fpu_control_word );
1163         }
1164 
1165         if ( team->t.t_mxcsr != mxcsr ) {
1166             __kmp_load_mxcsr( &team->t.t_mxcsr );
1167         }
1168     }
1169 }
1170 #else
1171 # define propagateFPControl(x) ((void)0)
1172 # define updateHWFPControl(x)  ((void)0)
1173 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1174 
1175 static void
1176 __kmp_alloc_argv_entries( int argc, kmp_team_t *team, int realloc ); // forward declaration
1177 
1178 /*
 * Run a parallel region that has been serialized, so it runs only in a team of the single master thread.
1180  */
1181 void
1182 __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid)
1183 {
1184     kmp_info_t *this_thr;
1185     kmp_team_t *serial_team;
1186 
1187     KC_TRACE( 10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid ) );
1188 
1189     /* Skip all this code for autopar serialized loops since it results in
1190        unacceptable overhead */
1191     if( loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR ) )
1192         return;
1193 
1194     if( ! TCR_4( __kmp_init_parallel ) )
1195         __kmp_parallel_initialize();
1196 
1197     this_thr     = __kmp_threads[ global_tid ];
1198     serial_team  = this_thr->th.th_serial_team;
1199 
1200     /* utilize the serialized team held by this thread */
1201     KMP_DEBUG_ASSERT( serial_team );
1202     KMP_MB();
1203 
1204     if ( __kmp_tasking_mode != tskm_immediate_exec ) {
1205         KMP_DEBUG_ASSERT(this_thr->th.th_task_team == this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1206         KMP_DEBUG_ASSERT( serial_team->t.t_task_team[this_thr->th.th_task_state] == NULL );
1207         KA_TRACE( 20, ( "__kmpc_serialized_parallel: T#%d pushing task_team %p / team %p, new task_team = NULL\n",
1208                         global_tid, this_thr->th.th_task_team, this_thr->th.th_team ) );
1209         this_thr->th.th_task_team = NULL;
1210     }
1211 
1212 #if OMP_40_ENABLED
1213     kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1214     if ( this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false ) {
1215         proc_bind = proc_bind_false;
1216     }
1217     else if ( proc_bind == proc_bind_default ) {
1218         //
1219         // No proc_bind clause was specified, so use the current value
1220         // of proc-bind-var for this parallel region.
1221         //
1222         proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1223     }
1224     //
1225     // Reset for next parallel region
1226     //
1227     this_thr->th.th_set_proc_bind = proc_bind_default;
1228 #endif /* OMP_40_ENABLED */
1229 
1230     if( this_thr->th.th_team != serial_team ) {
1231         // Nested level will be an index in the nested nthreads array
1232         int level = this_thr->th.th_team->t.t_level;
1233 
1234         if( serial_team->t.t_serialized ) {
            /* this serial team was already used
             * TODO: increase performance by making these locks more specific */
1237             kmp_team_t *new_team;
1238 
1239             __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
1240 
1241 #if OMPT_SUPPORT
1242             ompt_parallel_id_t ompt_parallel_id = __ompt_parallel_id_new(global_tid);
1243 #endif
1244 
1245             new_team = __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1246 #if OMPT_SUPPORT
1247                                            ompt_parallel_id,
1248 #endif
1249 #if OMP_40_ENABLED
1250                                            proc_bind,
1251 #endif
1252                                            & this_thr->th.th_current_task->td_icvs,
1253                                            0 USE_NESTED_HOT_ARG(NULL) );
1254             __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
1255             KMP_ASSERT( new_team );
1256 
1257             /* setup new serialized team and install it */
1258             new_team->t.t_threads[0] = this_thr;
1259             new_team->t.t_parent = this_thr->th.th_team;
1260             serial_team = new_team;
1261             this_thr->th.th_serial_team = serial_team;
1262 
1263             KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1264                             global_tid, serial_team ) );
1265 
1266 
1267             /* TODO the above breaks the requirement that if we run out of
1268              * resources, then we can still guarantee that serialized teams
1269              * are ok, since we may need to allocate a new one */
1270         } else {
1271             KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1272                             global_tid, serial_team ) );
1273         }
1274 
1275         /* we have to initialize this serial team */
1276         KMP_DEBUG_ASSERT( serial_team->t.t_threads );
1277         KMP_DEBUG_ASSERT( serial_team->t.t_threads[0] == this_thr );
1278         KMP_DEBUG_ASSERT( this_thr->th.th_team != serial_team );
1279         serial_team->t.t_ident         = loc;
1280         serial_team->t.t_serialized    = 1;
1281         serial_team->t.t_nproc         = 1;
1282         serial_team->t.t_parent        = this_thr->th.th_team;
1283         serial_team->t.t_sched         = this_thr->th.th_team->t.t_sched;
1284         this_thr->th.th_team           = serial_team;
1285         serial_team->t.t_master_tid    = this_thr->th.th_info.ds.ds_tid;
1286 
        KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d curtask=%p\n",
1288                         global_tid, this_thr->th.th_current_task ) );
1289         KMP_ASSERT( this_thr->th.th_current_task->td_flags.executing == 1 );
1290         this_thr->th.th_current_task->td_flags.executing = 0;
1291 
1292         __kmp_push_current_task_to_thread( this_thr, serial_team, 0 );
1293 
1294         /* TODO: GEH: do the ICVs work for nested serialized teams?  Don't we need an implicit task for
1295            each serialized task represented by team->t.t_serialized? */
1296         copy_icvs(
1297                   & this_thr->th.th_current_task->td_icvs,
1298                   & this_thr->th.th_current_task->td_parent->td_icvs );
1299 
1300         // Thread value exists in the nested nthreads array for the next nested level
1301         if ( __kmp_nested_nth.used && ( level + 1 < __kmp_nested_nth.used ) ) {
1302             this_thr->th.th_current_task->td_icvs.nproc = __kmp_nested_nth.nth[ level + 1 ];
1303         }
1304 
1305 #if OMP_40_ENABLED
1306         if ( __kmp_nested_proc_bind.used && ( level + 1 < __kmp_nested_proc_bind.used ) ) {
1307             this_thr->th.th_current_task->td_icvs.proc_bind
1308                 = __kmp_nested_proc_bind.bind_types[ level + 1 ];
1309         }
1310 #endif /* OMP_40_ENABLED */
1311 
1312 #if USE_DEBUGGER
1313         serial_team->t.t_pkfn = (microtask_t)( ~0 ); // For the debugger.
1314 #endif
1315         this_thr->th.th_info.ds.ds_tid = 0;
1316 
1317         /* set thread cache values */
1318         this_thr->th.th_team_nproc     = 1;
1319         this_thr->th.th_team_master    = this_thr;
1320         this_thr->th.th_team_serialized = 1;
1321 
1322         serial_team->t.t_level        = serial_team->t.t_parent->t.t_level + 1;
1323         serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1324 
1325         propagateFPControl (serial_team);
1326 
1327         /* check if we need to allocate dispatch buffers stack */
1328         KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1329         if ( !serial_team->t.t_dispatch->th_disp_buffer ) {
1330             serial_team->t.t_dispatch->th_disp_buffer = (dispatch_private_info_t *)
1331                 __kmp_allocate( sizeof( dispatch_private_info_t ) );
1332         }
1333         this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1334 
1335 #if OMPT_SUPPORT
1336         ompt_parallel_id_t ompt_parallel_id = __ompt_parallel_id_new(global_tid);
1337         __ompt_team_assign_id(serial_team, ompt_parallel_id);
1338 #endif
1339 
1340         KMP_MB();
1341 
1342     } else {
1343         /* this serialized team is already being used,
1344          * that's fine, just add another nested level */
1345         KMP_DEBUG_ASSERT( this_thr->th.th_team == serial_team );
1346         KMP_DEBUG_ASSERT( serial_team->t.t_threads );
1347         KMP_DEBUG_ASSERT( serial_team->t.t_threads[0] == this_thr );
1348         ++ serial_team->t.t_serialized;
1349         this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1350 
1351         // Nested level will be an index in the nested nthreads array
1352         int level = this_thr->th.th_team->t.t_level;
        // The nested nthreads array has a thread-count entry for the next nested level
1354         if ( __kmp_nested_nth.used && ( level + 1 < __kmp_nested_nth.used ) ) {
1355             this_thr->th.th_current_task->td_icvs.nproc = __kmp_nested_nth.nth[ level + 1 ];
1356         }
1357         serial_team->t.t_level++;
1358         KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d increasing nesting level of serial team %p to %d\n",
1359                         global_tid, serial_team, serial_team->t.t_level ) );
1360 
1361         /* allocate/push dispatch buffers stack */
1362         KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1363         {
1364             dispatch_private_info_t * disp_buffer = (dispatch_private_info_t *)
1365                 __kmp_allocate( sizeof( dispatch_private_info_t ) );
1366             disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1367             serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1368         }
1369         this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1370 
1371         KMP_MB();
1372     }
1373 
1374     if ( __kmp_env_consistency_check )
1375         __kmp_push_parallel( global_tid, NULL );
1376 
1377 }
1378 
1379 /* most of the work for a fork */
1380 /* return true if we really went parallel, false if serialized */
1381 int
1382 __kmp_fork_call(
1383     ident_t   * loc,
1384     int         gtid,
1385     enum fork_context_e  call_context, // Intel, GNU, ...
1386     kmp_int32   argc,
1387 #if OMPT_SUPPORT
1388     void       *unwrapped_task,
1389 #endif
1390     microtask_t microtask,
1391     launch_t    invoker,
1392 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1393 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1394     va_list   * ap
1395 #else
1396     va_list     ap
1397 #endif
1398     )
1399 {
1400     void          **argv;
1401     int             i;
1402     int             master_tid;
1403     int             master_this_cons;
1404     kmp_team_t     *team;
1405     kmp_team_t     *parent_team;
1406     kmp_info_t     *master_th;
1407     kmp_root_t     *root;
1408     int             nthreads;
1409     int             master_active;
1410     int             master_set_numthreads;
1411     int             level;
1412 #if OMP_40_ENABLED
1413     int             active_level;
1414     int             teams_level;
1415 #endif
1416 #if KMP_NESTED_HOT_TEAMS
1417     kmp_hot_team_ptr_t **p_hot_teams;
1418 #endif
1419     { // KMP_TIME_BLOCK
1420     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1421     KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1422 
1423     KA_TRACE( 20, ("__kmp_fork_call: enter T#%d\n", gtid ));
1424     if ( __kmp_stkpadding > 0 &&  __kmp_root[gtid] != NULL ) {
1425         /* Some systems prefer the stack for the root thread(s) to start with */
1426         /* some gap from the parent stack to prevent false sharing. */
1427         void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1428         /* These 2 lines below are so this does not get optimized out */
1429         if ( __kmp_stkpadding > KMP_MAX_STKPADDING )
1430             __kmp_stkpadding += (short)((kmp_int64)dummy);
1431     }
1432 
1433     /* initialize if needed */
1434     KMP_DEBUG_ASSERT( __kmp_init_serial ); // AC: potentially unsafe, not in sync with shutdown
1435     if( ! TCR_4(__kmp_init_parallel) )
1436         __kmp_parallel_initialize();
1437 
1438     /* setup current data */
1439     master_th     = __kmp_threads[ gtid ]; // AC: potentially unsafe, not in sync with shutdown
1440     parent_team   = master_th->th.th_team;
1441     master_tid    = master_th->th.th_info.ds.ds_tid;
1442     master_this_cons = master_th->th.th_local.this_construct;
1443     root          = master_th->th.th_root;
1444     master_active = root->r.r_active;
1445     master_set_numthreads = master_th->th.th_set_nproc;
1446 
1447 #if OMPT_SUPPORT
1448     ompt_parallel_id_t ompt_parallel_id;
1449     ompt_task_id_t ompt_task_id;
1450     ompt_frame_t *ompt_frame;
1451     ompt_task_id_t my_task_id;
1452     ompt_parallel_id_t my_parallel_id;
1453 
1454     if (ompt_enabled) {
1455         ompt_parallel_id = __ompt_parallel_id_new(gtid);
1456         ompt_task_id = __ompt_get_task_id_internal(0);
1457         ompt_frame = __ompt_get_task_frame_internal(0);
1458     }
1459 #endif
1460 
1461     // Nested level will be an index in the nested nthreads array
1462     level         = parent_team->t.t_level;
    active_level  = parent_team->t.t_active_level; // used to launch non-serialized teams even if nesting is not allowed
1464 #if OMP_40_ENABLED
    teams_level    = master_th->th.th_teams_level; // needed to check nesting inside a teams construct
1466 #endif
1467 #if KMP_NESTED_HOT_TEAMS
1468     p_hot_teams   = &master_th->th.th_hot_teams;
1469     if( *p_hot_teams == NULL && __kmp_hot_teams_max_level > 0 ) {
1470         *p_hot_teams = (kmp_hot_team_ptr_t*)__kmp_allocate(
1471                 sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1472         (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
        (*p_hot_teams)[0].hot_team_nth = 1; // either the actual value or not needed (when active_level > 0)
1474     }
1475 #endif
1476 
1477 #if OMPT_SUPPORT
1478     if (ompt_enabled &&
1479         ompt_callbacks.ompt_callback(ompt_event_parallel_begin)) {
1480         int team_size = master_set_numthreads;
1481 
1482         ompt_callbacks.ompt_callback(ompt_event_parallel_begin)(
1483             ompt_task_id, ompt_frame, ompt_parallel_id,
1484             team_size, unwrapped_task, OMPT_INVOKER(call_context));
1485     }
1486 #endif
1487 
1488     master_th->th.th_ident = loc;
1489 
1490 #if OMP_40_ENABLED
1491     if ( master_th->th.th_teams_microtask &&
1492          ap && microtask != (microtask_t)__kmp_teams_master && level == teams_level ) {
1493         // AC: This is start of parallel that is nested inside teams construct.
1494         //     The team is actual (hot), all workers are ready at the fork barrier.
1495         //     No lock needed to initialize the team a bit, then free workers.
1496         parent_team->t.t_ident = loc;
1497         __kmp_alloc_argv_entries( argc, parent_team, TRUE );
1498         parent_team->t.t_argc  = argc;
1499         argv = (void**)parent_team->t.t_argv;
1500         for( i=argc-1; i >= 0; --i )
1501 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1502 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1503             *argv++ = va_arg( *ap, void * );
1504 #else
1505             *argv++ = va_arg( ap, void * );
1506 #endif
        /* Increment our nested depth level, but do not increase the serialization count */
1508         if ( parent_team == master_th->th.th_serial_team ) {
1509             // AC: we are in serialized parallel
1510             __kmpc_serialized_parallel(loc, gtid);
1511             KMP_DEBUG_ASSERT( parent_team->t.t_serialized > 1 );
            parent_team->t.t_serialized--; // AC: need this so that enquiry functions
                                           //     work correctly; will restore at join time
1514 
1515 #if OMPT_SUPPORT
1516             void *dummy;
1517             void **exit_runtime_p;
1518 
1519             ompt_lw_taskteam_t lw_taskteam;
1520 
1521             if (ompt_enabled) {
1522                 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1523                     unwrapped_task, ompt_parallel_id);
1524                 lw_taskteam.ompt_task_info.task_id = __ompt_task_id_new(gtid);
1525                 exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_runtime_frame);
1526 
1527                 __ompt_lw_taskteam_link(&lw_taskteam, master_th);
1528 
1529 #if OMPT_TRACE
1530                 /* OMPT implicit task begin */
1531                 my_task_id = lw_taskteam.ompt_task_info.task_id;
1532                 my_parallel_id = parent_team->t.ompt_team_info.parallel_id;
1533                 if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
1534                     ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
1535                         my_parallel_id, my_task_id);
1536                 }
1537 #endif
1538 
1539                 /* OMPT state */
1540                 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1541             } else {
1542                 exit_runtime_p = &dummy;
1543             }
1544 #endif
1545 
1546             {
1547                 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1548                 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1549                 __kmp_invoke_microtask( microtask, gtid, 0, argc, parent_team->t.t_argv
1550 #if OMPT_SUPPORT
1551                                         , exit_runtime_p
1552 #endif
1553                                         );
1554             }
1555 
1556 #if OMPT_SUPPORT
1557             *exit_runtime_p = NULL;
1558             if (ompt_enabled) {
1559 #if OMPT_TRACE
1560                 lw_taskteam.ompt_task_info.frame.exit_runtime_frame = NULL;
1561 
1562                 if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
1563                     ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
1564                         ompt_parallel_id, ompt_task_id);
1565                 }
1566 
1567                 __ompt_lw_taskteam_unlink(master_th);
                // reset/clear the task id only after unlinking the task
1569                 lw_taskteam.ompt_task_info.task_id = ompt_task_id_none;
1570 #endif
1571 
1572                 if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
1573                     ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
1574                         ompt_parallel_id, ompt_task_id,
1575                         OMPT_INVOKER(call_context));
1576                 }
1577                 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1578             }
1579 #endif
1580             return TRUE;
1581         }
1582 
1583         parent_team->t.t_pkfn  = microtask;
1584 #if OMPT_SUPPORT
1585         parent_team->t.ompt_team_info.microtask = unwrapped_task;
1586 #endif
1587         parent_team->t.t_invoke = invoker;
1588         KMP_TEST_THEN_INC32( (kmp_int32*) &root->r.r_in_parallel );
1589         parent_team->t.t_active_level ++;
1590         parent_team->t.t_level ++;
1591 
1592         /* Change number of threads in the team if requested */
1593         if ( master_set_numthreads ) {   // The parallel has num_threads clause
1594             if ( master_set_numthreads < master_th->th.th_teams_size.nth ) {
                // AC: can only reduce the number of threads dynamically; cannot increase it
1596                 kmp_info_t **other_threads = parent_team->t.t_threads;
1597                 parent_team->t.t_nproc = master_set_numthreads;
1598                 for ( i = 0; i < master_set_numthreads; ++i ) {
1599                     other_threads[i]->th.th_team_nproc = master_set_numthreads;
1600                 }
1601                 // Keep extra threads hot in the team for possible next parallels
1602             }
1603             master_th->th.th_set_nproc = 0;
1604         }
1605 
#if USE_DEBUGGER
        if ( __kmp_debugging ) {    // Let debugger override number of threads.
            int nth = __kmp_omp_num_threads( loc );
            if ( nth > 0 ) {        // 0 means debugger does not want to change number of threads.
                master_set_numthreads = nth;
            } // if
        } // if
#endif
1614 
1615         KF_TRACE( 10, ( "__kmp_fork_call: before internal fork: root=%p, team=%p, master_th=%p, gtid=%d\n", root, parent_team, master_th, gtid ) );
1616         __kmp_internal_fork( loc, gtid, parent_team );
1617         KF_TRACE( 10, ( "__kmp_fork_call: after internal fork: root=%p, team=%p, master_th=%p, gtid=%d\n", root, parent_team, master_th, gtid ) );
1618 
1619         /* Invoke microtask for MASTER thread */
1620         KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n",
1621                     gtid, parent_team->t.t_id, parent_team->t.t_pkfn ) );
1622 
1623         {
1624             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1625             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1626             if (! parent_team->t.t_invoke( gtid )) {
1627                 KMP_ASSERT2( 0, "cannot invoke microtask for MASTER thread" );
1628             }
1629         }
1630         KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n",
1631             gtid, parent_team->t.t_id, parent_team->t.t_pkfn ) );
1632         KMP_MB();       /* Flush all pending memory write invalidates.  */
1633 
1634         KA_TRACE( 20, ("__kmp_fork_call: parallel exit T#%d\n", gtid ));
1635 
1636         return TRUE;
1637     } // Parallel closely nested in teams construct
1638 #endif /* OMP_40_ENABLED */
1639 
1640 #if KMP_DEBUG
1641     if ( __kmp_tasking_mode != tskm_immediate_exec ) {
1642         KMP_DEBUG_ASSERT(master_th->th.th_task_team == parent_team->t.t_task_team[master_th->th.th_task_state]);
1643     }
1644 #endif
1645 
1646     if ( parent_team->t.t_active_level >= master_th->th.th_current_task->td_icvs.max_active_levels ) {
1647         nthreads = 1;
1648     } else {
1649 #if OMP_40_ENABLED
1650         int enter_teams = ((ap==NULL && active_level==0)||(ap && teams_level>0 && teams_level==level));
1651 #endif
1652         nthreads = master_set_numthreads ?
1653             master_set_numthreads : get__nproc_2( parent_team, master_tid ); // TODO: get nproc directly from current task
1654 
        // Check if we need to take the forkjoin lock (no need for a serialized parallel region outside of a teams construct).
        // This code was moved here from __kmp_reserve_threads() to speed up nested serialized parallel regions.
1657         if (nthreads > 1) {
1658             if ( ( !get__nested(master_th) && (root->r.r_in_parallel
1659 #if OMP_40_ENABLED
1660                 && !enter_teams
1661 #endif /* OMP_40_ENABLED */
1662             ) ) || ( __kmp_library == library_serial ) ) {
1663                 KC_TRACE( 10, ( "__kmp_fork_call: T#%d serializing team; requested %d threads\n",
1664                                 gtid, nthreads ));
1665                 nthreads = 1;
1666             }
1667         }
1668         if ( nthreads > 1 ) {
1669             /* determine how many new threads we can use */
1670             __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
1671 
1672             nthreads = __kmp_reserve_threads(root, parent_team, master_tid, nthreads
1673 #if OMP_40_ENABLED
/* AC: If we execute teams from a parallel region (on the host), then the teams should be created,
   but each can have only 1 thread if nesting is disabled. If teams is called from a serial region,
   then the teams and their threads should be created regardless of the nesting setting. */
1677                                          , enter_teams
1678 #endif /* OMP_40_ENABLED */
1679                                          );
1680             if ( nthreads == 1 ) {
1681                 // Free lock for single thread execution here;
1682                 // for multi-thread execution it will be freed later
1683                 // after team of threads created and initialized
1684                 __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
1685             }
1686         }
1687     }
1688     KMP_DEBUG_ASSERT( nthreads > 0 );
1689 
1690     /* If we temporarily changed the set number of threads then restore it now */
1691     master_th->th.th_set_nproc = 0;
1692 
1693     /* create a serialized parallel region? */
1694     if ( nthreads == 1 ) {
1695         /* josh todo: hypothetical question: what do we do for OS X*? */
1696 #if KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1697         void *   args[ argc ];
1698 #else
1699         void * * args = (void**) KMP_ALLOCA( argc * sizeof( void * ) );
1700 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) */
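        // Note: on the Linux targets listed above a C99 variable-length array is
        // used for the argument copy; other targets fall back to KMP_ALLOCA so
        // the storage still lives on the stack for the duration of this call.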
1701 
1702         KA_TRACE( 20, ("__kmp_fork_call: T#%d serializing parallel region\n", gtid ));
1703 
1704         __kmpc_serialized_parallel(loc, gtid);
1705 
1706         if ( call_context == fork_context_intel ) {
1707             /* TODO this sucks, use the compiler itself to pass args! :) */
1708             master_th->th.th_serial_team->t.t_ident = loc;
1709 #if OMP_40_ENABLED
1710             if ( !ap ) {
1711                 // revert change made in __kmpc_serialized_parallel()
1712                 master_th->th.th_serial_team->t.t_level--;
1713                 // Get args from parent team for teams construct
1714 
1715 #if OMPT_SUPPORT
1716                 void *dummy;
1717                 void **exit_runtime_p;
1718 
1719                 ompt_lw_taskteam_t lw_taskteam;
1720 
1721                 if (ompt_enabled) {
1722                     __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1723                         unwrapped_task, ompt_parallel_id);
1724                     lw_taskteam.ompt_task_info.task_id = __ompt_task_id_new(gtid);
1725                     exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_runtime_frame);
1726 
1727                     __ompt_lw_taskteam_link(&lw_taskteam, master_th);
1728 
1729 #if OMPT_TRACE
1730                     my_task_id = lw_taskteam.ompt_task_info.task_id;
1731                     if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
1732                         ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
1733                             ompt_parallel_id, my_task_id);
1734                     }
1735 #endif
1736 
1737                     /* OMPT state */
1738                     master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1739                 } else {
1740                     exit_runtime_p = &dummy;
1741                 }
1742 #endif
1743 
1744                 {
1745                     KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1746                     KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1747                     __kmp_invoke_microtask( microtask, gtid, 0, argc, parent_team->t.t_argv
1748 #if OMPT_SUPPORT
1749                         , exit_runtime_p
1750 #endif
1751                     );
1752                 }
1753 
1754 #if OMPT_SUPPORT
1755                 *exit_runtime_p = NULL;
1756                 if (ompt_enabled) {
1757                     lw_taskteam.ompt_task_info.frame.exit_runtime_frame = NULL;
1758 
1759 #if OMPT_TRACE
1760                     if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
1761                         ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
1762                             ompt_parallel_id, ompt_task_id);
1763                     }
1764 #endif
1765 
1766                     __ompt_lw_taskteam_unlink(master_th);
                    // reset/clear the task id only after unlinking the task
1768                     lw_taskteam.ompt_task_info.task_id = ompt_task_id_none;
1769 
1770                     if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
1771                         ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
1772                             ompt_parallel_id, ompt_task_id,
1773                             OMPT_INVOKER(call_context));
1774                     }
1775                     master_th->th.ompt_thread_info.state = ompt_state_overhead;
1776                 }
1777 #endif
1778             } else if ( microtask == (microtask_t)__kmp_teams_master ) {
1779                 KMP_DEBUG_ASSERT( master_th->th.th_team == master_th->th.th_serial_team );
1780                 team = master_th->th.th_team;
1781                 //team->t.t_pkfn = microtask;
1782                 team->t.t_invoke = invoker;
1783                 __kmp_alloc_argv_entries( argc, team, TRUE );
1784                 team->t.t_argc = argc;
1785                 argv = (void**) team->t.t_argv;
1786                 if ( ap ) {
1787                     for( i=argc-1; i >= 0; --i )
1788 // TODO: revert workaround for Intel(R) 64 tracker #96
1789 # if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1790                         *argv++ = va_arg( *ap, void * );
1791 # else
1792                         *argv++ = va_arg( ap, void * );
1793 # endif
1794                 } else {
1795                     for( i=0; i < argc; ++i )
1796                         // Get args from parent team for teams construct
1797                         argv[i] = parent_team->t.t_argv[i];
1798                 }
1799                 // AC: revert change made in __kmpc_serialized_parallel()
1800                 //     because initial code in teams should have level=0
1801                 team->t.t_level--;
1802                 // AC: call special invoker for outer "parallel" of the teams construct
1803                 {
1804                     KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1805                     KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1806                     invoker(gtid);
1807                 }
1808             } else {
1809 #endif /* OMP_40_ENABLED */
1810                 argv = args;
1811                 for( i=argc-1; i >= 0; --i )
1812 // TODO: revert workaround for Intel(R) 64 tracker #96
1813 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1814                     *argv++ = va_arg( *ap, void * );
1815 #else
1816                     *argv++ = va_arg( ap, void * );
1817 #endif
1818                 KMP_MB();
1819 
1820 #if OMPT_SUPPORT
1821                 void *dummy;
1822                 void **exit_runtime_p;
1823 
1824                 ompt_lw_taskteam_t lw_taskteam;
1825 
1826                 if (ompt_enabled) {
1827                     __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1828                         unwrapped_task, ompt_parallel_id);
1829                     lw_taskteam.ompt_task_info.task_id = __ompt_task_id_new(gtid);
1830                     exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_runtime_frame);
1831 
1832                     __ompt_lw_taskteam_link(&lw_taskteam, master_th);
1833 
1834 #if OMPT_TRACE
1835                     /* OMPT implicit task begin */
1836                     my_task_id = lw_taskteam.ompt_task_info.task_id;
1837                     my_parallel_id = ompt_parallel_id;
1838                     if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
1839                         ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
1840                             my_parallel_id, my_task_id);
1841                     }
1842 #endif
1843 
1844                     /* OMPT state */
1845                     master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1846                 } else {
1847                     exit_runtime_p = &dummy;
1848                 }
1849 #endif
1850 
1851                 {
1852                     KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1853                     KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1854                     __kmp_invoke_microtask( microtask, gtid, 0, argc, args
1855 #if OMPT_SUPPORT
1856                         , exit_runtime_p
1857 #endif
1858                     );
1859                 }
1860 
1861 #if OMPT_SUPPORT
1862                 *exit_runtime_p = NULL;
1863                 if (ompt_enabled) {
1864 #if OMPT_TRACE
1865                     lw_taskteam.ompt_task_info.frame.exit_runtime_frame = NULL;
1866 
1867                     if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
1868                         ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
1869                             my_parallel_id, my_task_id);
1870                     }
1871 #endif
1872 
1873                     __ompt_lw_taskteam_unlink(master_th);
                    // reset/clear the task id only after unlinking the task
1875                     lw_taskteam.ompt_task_info.task_id = ompt_task_id_none;
1876 
1877                     if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
1878                         ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
1879                             ompt_parallel_id, ompt_task_id,
1880                             OMPT_INVOKER(call_context));
1881                     }
1882                     master_th->th.ompt_thread_info.state = ompt_state_overhead;
1883                 }
1884 #endif
1885 #if OMP_40_ENABLED
1886             }
1887 #endif /* OMP_40_ENABLED */
1888         }
1889         else if ( call_context == fork_context_gnu ) {
1890 #if OMPT_SUPPORT
1891             ompt_lw_taskteam_t *lwt = (ompt_lw_taskteam_t *)
1892                 __kmp_allocate(sizeof(ompt_lw_taskteam_t));
1893             __ompt_lw_taskteam_init(lwt, master_th, gtid,
1894                 unwrapped_task, ompt_parallel_id);
1895 
1896             lwt->ompt_task_info.task_id = __ompt_task_id_new(gtid);
1897             lwt->ompt_task_info.frame.exit_runtime_frame = NULL;
1898             __ompt_lw_taskteam_link(lwt, master_th);
1899 #endif
1900 
1901             // we were called from GNU native code
1902             KA_TRACE( 20, ("__kmp_fork_call: T#%d serial exit\n", gtid ));
1903             return FALSE;
1904         }
1905         else {
1906             KMP_ASSERT2( call_context < fork_context_last, "__kmp_fork_call: unknown fork_context parameter" );
1907         }
1908 
1909 
1910         KA_TRACE( 20, ("__kmp_fork_call: T#%d serial exit\n", gtid ));
1911         KMP_MB();
1912         return FALSE;
1913     }
1914 
1915     // GEH: only modify the executing flag in the case when not serialized
1916     //      serialized case is handled in kmpc_serialized_parallel
1917     KF_TRACE( 10, ( "__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, curtask=%p, curtask_max_aclevel=%d\n",
1918                   parent_team->t.t_active_level, master_th, master_th->th.th_current_task,
1919                   master_th->th.th_current_task->td_icvs.max_active_levels ) );
1920     // TODO: GEH - cannot do this assertion because root thread not set up as executing
1921     // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1922     master_th->th.th_current_task->td_flags.executing = 0;
1923 
1924 #if OMP_40_ENABLED
1925     if ( !master_th->th.th_teams_microtask || level > teams_level )
1926 #endif /* OMP_40_ENABLED */
1927     {
1928         /* Increment our nested depth level */
1929         KMP_TEST_THEN_INC32( (kmp_int32*) &root->r.r_in_parallel );
1930     }
1931 
1932     // See if we need to make a copy of the ICVs.
1933     int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1934     if ((level+1 < __kmp_nested_nth.used) && (__kmp_nested_nth.nth[level+1] != nthreads_icv)) {
1935         nthreads_icv = __kmp_nested_nth.nth[level+1];
1936     }
1937     else {
1938         nthreads_icv = 0;  // don't update
1939     }
1940 
1941 #if OMP_40_ENABLED
1942     // Figure out the proc_bind_policy for the new team.
1943     kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1944     kmp_proc_bind_t proc_bind_icv = proc_bind_default; // proc_bind_default means don't update
1945     if ( master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false ) {
1946         proc_bind = proc_bind_false;
1947     }
1948     else {
1949         if (proc_bind == proc_bind_default) {
1950             // No proc_bind clause specified; use current proc-bind-var for this parallel region
1951             proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1952         }
1953         /* else: The proc_bind policy was specified explicitly on parallel clause. This
1954            overrides proc-bind-var for this parallel region, but does not change proc-bind-var. */
1955         // Figure the value of proc-bind-var for the child threads.
1956         if ((level+1 < __kmp_nested_proc_bind.used)
1957             && (__kmp_nested_proc_bind.bind_types[level+1] != master_th->th.th_current_task->td_icvs.proc_bind)) {
1958             proc_bind_icv = __kmp_nested_proc_bind.bind_types[level+1];
1959         }
1960     }
1961 
1962     // Reset for next parallel region
1963     master_th->th.th_set_proc_bind = proc_bind_default;
1964 #endif /* OMP_40_ENABLED */
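    // Illustrative note: with a nested bind list such as OMP_PROC_BIND="spread,close",
    // __kmp_nested_proc_bind.bind_types[level+1] selects the policy the child
    // threads inherit, while an explicit proc_bind clause on this parallel
    // overrides proc-bind-var for the region without changing the stored list.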
1965 
1966     if ((nthreads_icv > 0)
1967 #if OMP_40_ENABLED
1968         || (proc_bind_icv != proc_bind_default)
1969 #endif /* OMP_40_ENABLED */
1970         ) {
1971         kmp_internal_control_t new_icvs;
1972         copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
1973         new_icvs.next = NULL;
1974         if (nthreads_icv > 0) {
1975             new_icvs.nproc = nthreads_icv;
1976         }
1977 
1978 #if OMP_40_ENABLED
1979         if (proc_bind_icv != proc_bind_default) {
1980             new_icvs.proc_bind = proc_bind_icv;
1981         }
1982 #endif /* OMP_40_ENABLED */
1983 
1984         /* allocate a new parallel team */
1985         KF_TRACE( 10, ( "__kmp_fork_call: before __kmp_allocate_team\n" ) );
1986         team = __kmp_allocate_team(root, nthreads, nthreads,
1987 #if OMPT_SUPPORT
1988                                    ompt_parallel_id,
1989 #endif
1990 #if OMP_40_ENABLED
1991                                    proc_bind,
1992 #endif
1993                                    &new_icvs, argc USE_NESTED_HOT_ARG(master_th) );
1994     } else {
1995         /* allocate a new parallel team */
1996         KF_TRACE( 10, ( "__kmp_fork_call: before __kmp_allocate_team\n" ) );
1997         team = __kmp_allocate_team(root, nthreads, nthreads,
1998 #if OMPT_SUPPORT
1999                                    ompt_parallel_id,
2000 #endif
2001 #if OMP_40_ENABLED
2002                                    proc_bind,
2003 #endif
2004                                    &master_th->th.th_current_task->td_icvs, argc
2005                                    USE_NESTED_HOT_ARG(master_th) );
2006     }
2007     KF_TRACE( 10, ( "__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team ) );
2008 
2009     /* setup the new team */
2010     KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2011     KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2012     KMP_CHECK_UPDATE(team->t.t_ident, loc);
2013     KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2014     KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2015 #if OMPT_SUPPORT
2016     KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.microtask, unwrapped_task);
2017 #endif
2018     KMP_CHECK_UPDATE(team->t.t_invoke, invoker);  /* TODO move this to root, maybe */
2019     // TODO: parent_team->t.t_level == INT_MAX ???
2020 #if OMP_40_ENABLED
2021     if ( !master_th->th.th_teams_microtask || level > teams_level ) {
2022 #endif /* OMP_40_ENABLED */
2023         int new_level = parent_team->t.t_level + 1;
2024         KMP_CHECK_UPDATE(team->t.t_level, new_level);
2025         new_level = parent_team->t.t_active_level + 1;
2026         KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2027 #if OMP_40_ENABLED
2028     } else {
2029         // AC: Do not increase parallel level at start of the teams construct
2030         int new_level = parent_team->t.t_level;
2031         KMP_CHECK_UPDATE(team->t.t_level, new_level);
2032         new_level = parent_team->t.t_active_level;
2033         KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2034     }
2035 #endif /* OMP_40_ENABLED */
2036     kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2037     if (team->t.t_sched.r_sched_type != new_sched.r_sched_type || team->t.t_sched.chunk != new_sched.chunk)
2038         team->t.t_sched = new_sched; // set master's schedule as new run-time schedule
2039 
2040 #if OMP_40_ENABLED
2041     KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2042 #endif
2043 
2044     // Update the floating point rounding in the team if required.
2045     propagateFPControl(team);
2046 
2047     if ( __kmp_tasking_mode != tskm_immediate_exec ) {
        // Set the master's task team to the team's task team. Unless this is a hot team, it should be NULL.
2049 #if 0
2050         // Patch out an assertion that trips while the runtime seems to operate correctly.
2051         // Avoiding the preconditions that cause the assertion to trip has been promised as a forthcoming patch.
2052         KMP_DEBUG_ASSERT(master_th->th.th_task_team == parent_team->t.t_task_team[master_th->th.th_task_state]);
2053 #endif
2054         KA_TRACE( 20, ( "__kmp_fork_call: Master T#%d pushing task_team %p / team %p, new task_team %p / team %p\n",
2055                       __kmp_gtid_from_thread( master_th ), master_th->th.th_task_team,
2056                       parent_team, team->t.t_task_team[master_th->th.th_task_state], team ) );
2057 
2058         if ( active_level || master_th->th.th_task_team ) {
            // Save a copy of the master's task_state for this nesting level
2060             KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2061             if (master_th->th.th_task_state_top >= master_th->th.th_task_state_stack_sz) { // increase size
2062                 kmp_uint32 new_size = 2*master_th->th.th_task_state_stack_sz;
2063                 kmp_uint8 *old_stack, *new_stack;
2064                 kmp_uint32 i;
2065                 new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2066                 for (i=0; i<master_th->th.th_task_state_stack_sz; ++i) {
2067                     new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2068                 }
2069                 for (i=master_th->th.th_task_state_stack_sz; i<new_size; ++i) { // zero-init rest of stack
2070                     new_stack[i] = 0;
2071                 }
2072                 old_stack = master_th->th.th_task_state_memo_stack;
2073                 master_th->th.th_task_state_memo_stack = new_stack;
2074                 master_th->th.th_task_state_stack_sz = new_size;
2075                 __kmp_free(old_stack);
2076             }
2077             // Store master's task_state on stack
2078             master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] = master_th->th.th_task_state;
2079             master_th->th.th_task_state_top++;
2080 #if KMP_NESTED_HOT_TEAMS
2081             if (team == master_th->th.th_hot_teams[active_level].hot_team) { // Restore master's nested state if nested hot team
2082                 master_th->th.th_task_state = master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top];
2083             }
2084             else {
2085 #endif
2086                 master_th->th.th_task_state = 0;
2087 #if KMP_NESTED_HOT_TEAMS
2088             }
2089 #endif
2090         }
2091 #if !KMP_NESTED_HOT_TEAMS
2092         KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) || (team == root->r.r_hot_team));
2093 #endif
2094     }
2095 
2096     KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2097                 gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id, team->t.t_nproc ));
2098     KMP_DEBUG_ASSERT( team != root->r.r_hot_team ||
2099                       ( team->t.t_master_tid == 0 &&
2100                         ( team->t.t_parent == root->r.r_root_team || team->t.t_parent->t.t_serialized ) ));
2101     KMP_MB();
2102 
2103     /* now, setup the arguments */
2104     argv = (void**)team->t.t_argv;
2105 #if OMP_40_ENABLED
2106     if ( ap ) {
2107 #endif /* OMP_40_ENABLED */
2108         for ( i=argc-1; i >= 0; --i ) {
2109 // TODO: revert workaround for Intel(R) 64 tracker #96
2110 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
2111             void *new_argv = va_arg(*ap, void *);
2112 #else
2113             void *new_argv = va_arg(ap, void *);
2114 #endif
2115             KMP_CHECK_UPDATE(*argv, new_argv);
2116             argv++;
2117         }
2118 #if OMP_40_ENABLED
2119     } else {
2120         for ( i=0; i < argc; ++i ) {
2121             // Get args from parent team for teams construct
2122             KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2123         }
2124     }
2125 #endif /* OMP_40_ENABLED */
2126 
2127     /* now actually fork the threads */
2128     KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2129     if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2130         root->r.r_active = TRUE;
2131 
2132     __kmp_fork_team_threads( root, team, master_th, gtid );
2133     __kmp_setup_icv_copy( team, nthreads, &master_th->th.th_current_task->td_icvs, loc );
2134 
2135 #if OMPT_SUPPORT
2136     master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2137 #endif
2138 
2139     __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
2140 
2141 #if USE_ITT_BUILD
2142     if ( team->t.t_active_level == 1 // only report frames at level 1
2143 # if OMP_40_ENABLED
2144         && !master_th->th.th_teams_microtask // not in teams construct
2145 # endif /* OMP_40_ENABLED */
2146     ) {
2147 #if USE_ITT_NOTIFY
2148         if ( ( __itt_frame_submit_v3_ptr || KMP_ITT_DEBUG ) &&
2149              ( __kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 1 ) )
2150         {
2151             kmp_uint64 tmp_time = 0;
2152             if ( __itt_get_timestamp_ptr )
2153                 tmp_time = __itt_get_timestamp();
2154             // Internal fork - report frame begin
2155             master_th->th.th_frame_time  = tmp_time;
2156             if ( __kmp_forkjoin_frames_mode == 3 )
2157                 team->t.t_region_time = tmp_time;
2158         } else // only one notification scheme (either "submit" or "forking/joined", not both)
2159 #endif /* USE_ITT_NOTIFY */
2160         if ( ( __itt_frame_begin_v3_ptr || KMP_ITT_DEBUG ) &&
2161              __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode )
2162         { // Mark start of "parallel" region for VTune.
2163             __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2164         }
2165     }
2166 #endif /* USE_ITT_BUILD */
2167 
2168     /* now go on and do the work */
2169     KMP_DEBUG_ASSERT( team == __kmp_threads[gtid]->th.th_team );
2170     KMP_MB();
2171     KF_TRACE(10, ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2172                   root, team, master_th, gtid));
2173 
2174 #if USE_ITT_BUILD
2175     if ( __itt_stack_caller_create_ptr ) {
2176         team->t.t_stack_id = __kmp_itt_stack_caller_create(); // create new stack stitching id before entering fork barrier
2177     }
2178 #endif /* USE_ITT_BUILD */
2179 
2180 #if OMP_40_ENABLED
2181     if ( ap )   // AC: skip __kmp_internal_fork at teams construct, let only master threads execute
2182 #endif /* OMP_40_ENABLED */
2183     {
2184         __kmp_internal_fork( loc, gtid, team );
2185         KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, master_th=%p, gtid=%d\n",
2186                       root, team, master_th, gtid));
2187     }
2188 
2189     if (call_context == fork_context_gnu) {
2190         KA_TRACE( 20, ("__kmp_fork_call: parallel exit T#%d\n", gtid ));
2191         return TRUE;
2192     }
2193 
2194     /* Invoke microtask for MASTER thread */
2195     KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n",
2196                 gtid, team->t.t_id, team->t.t_pkfn ) );
2197     }  // END of timer KMP_fork_call block
2198 
2199     {
2200         KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
2201         KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
2202         if (! team->t.t_invoke( gtid )) {
2203             KMP_ASSERT2( 0, "cannot invoke microtask for MASTER thread" );
2204         }
2205     }
2206     KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n",
2207         gtid, team->t.t_id, team->t.t_pkfn ) );
2208     KMP_MB();       /* Flush all pending memory write invalidates.  */
2209 
2210     KA_TRACE( 20, ("__kmp_fork_call: parallel exit T#%d\n", gtid ));
2211 
2212 #if OMPT_SUPPORT
2213     if (ompt_enabled) {
2214         master_th->th.ompt_thread_info.state = ompt_state_overhead;
2215     }
2216 #endif
2217 
2218     return TRUE;
2219 }
2220 
2221 #if OMPT_SUPPORT
2222 static inline void
2223 __kmp_join_restore_state(
2224     kmp_info_t *thread,
2225     kmp_team_t *team)
2226 {
2227     // restore state outside the region
2228     thread->th.ompt_thread_info.state = ((team->t.t_serialized) ?
2229         ompt_state_work_serial : ompt_state_work_parallel);
2230 }
2231 
2232 static inline void
2233 __kmp_join_ompt(
2234     kmp_info_t *thread,
2235     kmp_team_t *team,
2236     ompt_parallel_id_t parallel_id,
2237     fork_context_e fork_context)
2238 {
2239     ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
2240     if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
2241         ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
2242             parallel_id, task_info->task_id, OMPT_INVOKER(fork_context));
2243     }
2244 
2245     task_info->frame.reenter_runtime_frame = NULL;
2246     __kmp_join_restore_state(thread,team);
2247 }
2248 #endif
2249 
2250 void
2251 __kmp_join_call(ident_t *loc, int gtid
2252 #if OMPT_SUPPORT
2253                , enum fork_context_e fork_context
2254 #endif
2255 #if OMP_40_ENABLED
2256                , int exit_teams
2257 #endif /* OMP_40_ENABLED */
2258 )
2259 {
2260     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2261     kmp_team_t     *team;
2262     kmp_team_t     *parent_team;
2263     kmp_info_t     *master_th;
2264     kmp_root_t     *root;
2265     int             master_active;
2266     int             i;
2267 
2268     KA_TRACE( 20, ("__kmp_join_call: enter T#%d\n", gtid ));
2269 
2270     /* setup current data */
2271     master_th     = __kmp_threads[ gtid ];
2272     root          = master_th->th.th_root;
2273     team          = master_th->th.th_team;
2274     parent_team   = team->t.t_parent;
2275 
2276     master_th->th.th_ident = loc;
2277 
2278 #if OMPT_SUPPORT
2279     if (ompt_enabled) {
2280         master_th->th.ompt_thread_info.state = ompt_state_overhead;
2281     }
2282 #endif
2283 
2284 #if KMP_DEBUG
2285     if ( __kmp_tasking_mode != tskm_immediate_exec ) {
2286         KA_TRACE( 20, ( "__kmp_join_call: T#%d, old team = %p old task_team = %p, th_task_team = %p\n",
2287                          __kmp_gtid_from_thread( master_th ), team,
2288                          team->t.t_task_team[master_th->th.th_task_state], master_th->th.th_task_team) );
2289         KMP_DEBUG_ASSERT( master_th->th.th_task_team == team->t.t_task_team[master_th->th.th_task_state] );
2290     }
2291 #endif
2292 
2293     if( team->t.t_serialized ) {
2294 #if OMP_40_ENABLED
2295         if ( master_th->th.th_teams_microtask ) {
2296             // We are in teams construct
2297             int level = team->t.t_level;
2298             int tlevel = master_th->th.th_teams_level;
2299             if ( level == tlevel ) {
2300                 // AC: we haven't incremented it earlier at start of teams construct,
2301                 //     so do it here - at the end of teams construct
2302                 team->t.t_level++;
2303             } else if ( level == tlevel + 1 ) {
2304                 // AC: we are exiting parallel inside teams, need to increment serialization
2305                 //     in order to restore it in the next call to __kmpc_end_serialized_parallel
2306                 team->t.t_serialized++;
2307             }
2308         }
2309 #endif /* OMP_40_ENABLED */
2310         __kmpc_end_serialized_parallel( loc, gtid );
2311 
2312 #if OMPT_SUPPORT
2313         if (ompt_enabled) {
2314             __kmp_join_restore_state(master_th, parent_team);
2315         }
2316 #endif
2317 
2318         return;
2319     }
2320 
2321     master_active = team->t.t_master_active;
2322 
2323 #if OMP_40_ENABLED
2324     if (!exit_teams)
2325 #endif /* OMP_40_ENABLED */
2326     {
        // AC: No barrier for internal teams at exit from the teams construct,
        //     but there is a barrier for the external team (league).
2329         __kmp_internal_join( loc, gtid, team );
2330     }
2331 #if OMP_40_ENABLED
2332     else {
2333         master_th->th.th_task_state = 0; // AC: no tasking in teams (out of any parallel)
2334     }
2335 #endif /* OMP_40_ENABLED */
2336 
2337     KMP_MB();
2338 
2339 #if OMPT_SUPPORT
2340     ompt_parallel_id_t parallel_id = team->t.ompt_team_info.parallel_id;
2341 #endif
2342 
2343 #if USE_ITT_BUILD
2344     if ( __itt_stack_caller_create_ptr ) {
2345         __kmp_itt_stack_caller_destroy( (__itt_caller)team->t.t_stack_id ); // destroy the stack stitching id after join barrier
2346     }
2347 
2348     // Mark end of "parallel" region for VTune.
2349     if ( team->t.t_active_level == 1
2350 # if OMP_40_ENABLED
2351         && !master_th->th.th_teams_microtask /* not in teams construct */
2352 # endif /* OMP_40_ENABLED */
2353     ) {
2354         master_th->th.th_ident = loc;
2355         // only one notification scheme (either "submit" or "forking/joined", not both)
2356         if ( ( __itt_frame_submit_v3_ptr || KMP_ITT_DEBUG ) && __kmp_forkjoin_frames_mode == 3 )
2357             __kmp_itt_frame_submit( gtid, team->t.t_region_time, master_th->th.th_frame_time,
2358                                     0, loc, master_th->th.th_team_nproc, 1 );
2359         else if ( ( __itt_frame_end_v3_ptr || KMP_ITT_DEBUG ) &&
2360             ! __kmp_forkjoin_frames_mode && __kmp_forkjoin_frames )
2361             __kmp_itt_region_joined( gtid );
2362     } // active_level == 1
2363 #endif /* USE_ITT_BUILD */
2364 
2365 #if OMP_40_ENABLED
2366     if ( master_th->th.th_teams_microtask &&
2367          !exit_teams &&
2368          team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2369          team->t.t_level == master_th->th.th_teams_level + 1 ) {
        // AC: We need to leave the team structure intact at the end of a parallel
        //     region inside the teams construct, so that the same (hot) team is
        //     reused by the next parallel region; only adjust the nesting levels.
2373 
2374         /* Decrement our nested depth level */
2375         team->t.t_level --;
2376         team->t.t_active_level --;
2377         KMP_TEST_THEN_DEC32( (kmp_int32*) &root->r.r_in_parallel );
2378 
2379         /* Restore number of threads in the team if needed */
2380         if ( master_th->th.th_team_nproc < master_th->th.th_teams_size.nth ) {
2381             int old_num = master_th->th.th_team_nproc;
2382             int new_num = master_th->th.th_teams_size.nth;
2383             kmp_info_t **other_threads = team->t.t_threads;
2384             team->t.t_nproc = new_num;
2385             for ( i = 0; i < old_num; ++i ) {
2386                 other_threads[i]->th.th_team_nproc = new_num;
2387             }
2388             // Adjust states of non-used threads of the team
2389             for ( i = old_num; i < new_num; ++i ) {
2390                 // Re-initialize thread's barrier data.
2391                 int b;
2392                 kmp_balign_t * balign = other_threads[i]->th.th_bar;
2393                 for ( b = 0; b < bs_last_barrier; ++ b ) {
2394                     balign[ b ].bb.b_arrived        = team->t.t_bar[ b ].b_arrived;
2395                     KMP_DEBUG_ASSERT(balign[ b ].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2396 #if USE_DEBUGGER
2397                     balign[ b ].bb.b_worker_arrived = team->t.t_bar[ b ].b_team_arrived;
2398 #endif
2399                 }
2400                 if ( __kmp_tasking_mode != tskm_immediate_exec ) {
2401                     // Synchronize thread's task state
2402                     other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2403                 }
2404             }
2405         }
2406 
2407 #if OMPT_SUPPORT
2408         if (ompt_enabled) {
2409             __kmp_join_ompt(master_th, parent_team, parallel_id, fork_context);
2410         }
2411 #endif
2412 
2413         return;
2414     }
2415 #endif /* OMP_40_ENABLED */
2416 
2417     /* do cleanup and restore the parent team */
    master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2419     master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2420 
2421     master_th->th.th_dispatch =
2422                 & parent_team->t.t_dispatch[ team->t.t_master_tid ];
2423 
2424     /* jc: The following lock has instructions with REL and ACQ semantics,
2425        separating the parallel user code called in this parallel region
2426        from the serial user code called after this function returns.
2427     */
2428     __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
2429 
2430 #if OMP_40_ENABLED
2431     if ( !master_th->th.th_teams_microtask || team->t.t_level > master_th->th.th_teams_level )
2432 #endif /* OMP_40_ENABLED */
2433     {
2434         /* Decrement our nested depth level */
2435         KMP_TEST_THEN_DEC32( (kmp_int32*) &root->r.r_in_parallel );
2436     }
2437     KMP_DEBUG_ASSERT( root->r.r_in_parallel >= 0 );
2438 
2439 #if OMPT_SUPPORT && OMPT_TRACE
    if (ompt_enabled) {
2441         ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
2442         if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
2443              ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
2444                parallel_id, task_info->task_id);
2445         }
2446         task_info->frame.exit_runtime_frame = NULL;
2447         task_info->task_id = 0;
2448     }
2449 #endif
2450 
2451     KF_TRACE( 10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n",
2452                    0, master_th, team ) );
2453     __kmp_pop_current_task_from_thread( master_th );
2454 
2455 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
2456     //
2457     // Restore master thread's partition.
2458     //
2459     master_th->th.th_first_place = team->t.t_first_place;
2460     master_th->th.th_last_place = team->t.t_last_place;
2461 #endif /* OMP_40_ENABLED */
2462 
2463     updateHWFPControl (team);
2464 
2465     if ( root->r.r_active != master_active )
2466         root->r.r_active = master_active;
2467 
2468     __kmp_free_team( root, team USE_NESTED_HOT_ARG(master_th) ); // this will free worker threads
2469 
    /* This race was fun to find. Make sure the following assignments stay inside
     * the critical region; otherwise assertions may fail occasionally because the
     * old team may be reallocated and the hierarchy would appear inconsistent. It
     * is actually safe to run outside the lock and won't cause any bugs, but it
     * will trigger those assertion failures. It's only one dereference and
     * assignment, so we might as well keep it in the critical region. */
2476     master_th->th.th_team        =   parent_team;
2477     master_th->th.th_team_nproc  =   parent_team->t.t_nproc;
2478     master_th->th.th_team_master =   parent_team->t.t_threads[0];
2479     master_th->th.th_team_serialized = parent_team->t.t_serialized;
2480 
2481     /* restore serialized team, if need be */
2482     if( parent_team->t.t_serialized &&
2483         parent_team != master_th->th.th_serial_team &&
2484         parent_team != root->r.r_root_team ) {
2485             __kmp_free_team( root, master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL) );
2486             master_th->th.th_serial_team = parent_team;
2487     }
2488 
2489     if ( __kmp_tasking_mode != tskm_immediate_exec ) {
2490         if (master_th->th.th_task_state_top > 0) { // Restore task state from memo stack
2491             KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2492             // Remember master's state if we re-use this nested hot team
2493             master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] = master_th->th.th_task_state;
2494             --master_th->th.th_task_state_top; // pop
2495             // Now restore state at this level
2496             master_th->th.th_task_state = master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top];
2497         }
2498         // Copy the task team from the parent team to the master thread
2499         master_th->th.th_task_team = parent_team->t.t_task_team[master_th->th.th_task_state];
2500         KA_TRACE( 20, ( "__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2501                         __kmp_gtid_from_thread( master_th ), master_th->th.th_task_team, parent_team ) );
2502     }
2503 
    // TODO: GEH - cannot do this assertion because root thread not set up as executing
    // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
    master_th->th.th_current_task->td_flags.executing = 1;
2507 
2508     __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
2509 
2510 #if OMPT_SUPPORT
2511     if (ompt_enabled) {
2512         __kmp_join_ompt(master_th, parent_team, parallel_id, fork_context);
2513     }
2514 #endif
2515 
2516     KMP_MB();
2517     KA_TRACE( 20, ("__kmp_join_call: exit T#%d\n", gtid ));
2518 }
2519 
2520 /* ------------------------------------------------------------------------ */
2521 /* ------------------------------------------------------------------------ */
2522 
2523 /* Check whether we should push an internal control record onto the
2524    serial team stack.  If so, do it.  */
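/* For example (hypothetical user code), an omp_set_num_threads() call made
   inside a nested, serialized parallel region must only affect that nesting
   level; the record pushed here lets __kmpc_end_serialized_parallel restore
   the previous ICVs when the serialized region ends. */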
2525 void
2526 __kmp_save_internal_controls ( kmp_info_t * thread )
2527 {
2528 
2529     if ( thread->th.th_team != thread->th.th_serial_team ) {
2530         return;
2531     }
2532     if (thread->th.th_team->t.t_serialized > 1) {
2533         int push = 0;
2534 
2535         if (thread->th.th_team->t.t_control_stack_top == NULL) {
2536             push = 1;
2537         } else {
2538             if ( thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2539                  thread->th.th_team->t.t_serialized ) {
2540                 push = 1;
2541             }
2542         }
2543         if (push) {  /* push a record on the serial team's stack */
2544             kmp_internal_control_t * control = (kmp_internal_control_t *) __kmp_allocate(sizeof(kmp_internal_control_t));
2545 
2546             copy_icvs( control, & thread->th.th_current_task->td_icvs );
2547 
2548             control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2549 
2550             control->next = thread->th.th_team->t.t_control_stack_top;
2551             thread->th.th_team->t.t_control_stack_top = control;
2552         }
2553     }
2554 }
2555 
2556 /* Changes set_nproc */
2557 void
2558 __kmp_set_num_threads( int new_nth, int gtid )
2559 {
2560     kmp_info_t *thread;
2561     kmp_root_t *root;
2562 
2563     KF_TRACE( 10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth ));
2564     KMP_DEBUG_ASSERT( __kmp_init_serial );
2565 
2566     if (new_nth < 1)
2567         new_nth = 1;
2568     else if (new_nth > __kmp_max_nth)
2569         new_nth = __kmp_max_nth;
2570 
2571     KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2572     thread = __kmp_threads[gtid];
2573 
2574     __kmp_save_internal_controls( thread );
2575 
2576     set__nproc( thread, new_nth );
2577 
2578     //
2579     // If this omp_set_num_threads() call will cause the hot team size to be
2580     // reduced (in the absence of a num_threads clause), then reduce it now,
2581     // rather than waiting for the next parallel region.
2582     //
2583     root = thread->th.th_root;
2584     if ( __kmp_init_parallel && ( ! root->r.r_active )
2585       && ( root->r.r_hot_team->t.t_nproc > new_nth )
2586 #if KMP_NESTED_HOT_TEAMS
2587       && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2588 #endif
2589     ) {
2590         kmp_team_t *hot_team = root->r.r_hot_team;
2591         int f;
2592 
2593         __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
2594 
2595         // Release the extra threads we don't need any more.
2596         for ( f = new_nth;  f < hot_team->t.t_nproc; f++ ) {
2597             KMP_DEBUG_ASSERT( hot_team->t.t_threads[f] != NULL );
2598             if ( __kmp_tasking_mode != tskm_immediate_exec) {
                // When decreasing the team size, threads no longer in the team should unreference the task team.
2600                 hot_team->t.t_threads[f]->th.th_task_team = NULL;
2601             }
2602             __kmp_free_thread( hot_team->t.t_threads[f] );
2603             hot_team->t.t_threads[f] =  NULL;
2604         }
2605         hot_team->t.t_nproc = new_nth;
2606 #if KMP_NESTED_HOT_TEAMS
2607         if( thread->th.th_hot_teams ) {
2608             KMP_DEBUG_ASSERT( hot_team == thread->th.th_hot_teams[0].hot_team );
2609             thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2610         }
2611 #endif
2612 
2613         __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
2614 
2615         //
2616         // Update the t_nproc field in the threads that are still active.
2617         //
2618         for( f=0 ; f < new_nth; f++ ) {
2619             KMP_DEBUG_ASSERT( hot_team->t.t_threads[f] != NULL );
2620             hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2621         }
        // Special flag to indicate that the team size was changed via omp_set_num_threads()
2623         hot_team->t.t_size_changed = -1;
2624     }
2625 }
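
/*
   Usage sketch (user-level view; assumes the standard omp_set_num_threads() entry
   point forwards here with the caller's gtid):

       #include <omp.h>
       int main( void )
       {
           #pragma omp parallel        // hot team grows to the default team size
           { }
           omp_set_num_threads( 2 );   // reaches __kmp_set_num_threads(); since the
                                       // root is no longer active, the idle hot team
                                       // is trimmed to 2 threads now rather than at
                                       // the next parallel region
           #pragma omp parallel        // reuses the already-resized hot team
           { }
           return 0;
       }
*/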
2626 
2627 /* Changes max_active_levels */
2628 void
2629 __kmp_set_max_active_levels( int gtid, int max_active_levels )
2630 {
2631     kmp_info_t *thread;
2632 
2633     KF_TRACE( 10, ( "__kmp_set_max_active_levels: new max_active_levels for thread %d = (%d)\n", gtid, max_active_levels ) );
2634     KMP_DEBUG_ASSERT( __kmp_init_serial );
2635 
2636     // validate max_active_levels
2637     if( max_active_levels < 0 ) {
2638         KMP_WARNING( ActiveLevelsNegative, max_active_levels );
2639         // We ignore this call if the user has specified a negative value.
2640         // The current setting won't be changed. The last valid setting will be used.
2641         // A warning will be issued (if warnings are allowed as controlled by the KMP_WARNINGS env var).
2642         KF_TRACE( 10, ( "__kmp_set_max_active_levels: the call is ignored: new max_active_levels for thread %d = (%d)\n", gtid, max_active_levels ) );
2643         return;
2644     }
2645     if( max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT ) {
2646         // it's OK, the max_active_levels is within the valid range: [ 0; KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2647         // We allow a zero value. (implementation defined behavior)
2648     } else {
2649         KMP_WARNING( ActiveLevelsExceedLimit, max_active_levels, KMP_MAX_ACTIVE_LEVELS_LIMIT  );
2650         max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
        // The current upper limit is MAX_INT. (implementation defined behavior)
        // If the input exceeds the upper limit, we correct it to the upper limit. (implementation defined behavior)
        // In practice, the flow should never reach this point while the upper limit is MAX_INT.
2654     }
2655     KF_TRACE( 10, ( "__kmp_set_max_active_levels: after validation: new max_active_levels for thread %d = (%d)\n", gtid, max_active_levels ) );
2656 
2657     thread = __kmp_threads[ gtid ];
2658 
2659     __kmp_save_internal_controls( thread );
2660 
2661     set__max_active_levels( thread, max_active_levels );
2662 
2663 }
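
/*
   Behavior sketch (user-level view; assumes the standard omp_set_max_active_levels()
   entry point forwards here):

       omp_set_max_active_levels( -3 );   // ignored with a warning; the previous value is kept
       omp_set_max_active_levels( 0 );    // accepted; zero is allowed (implementation defined behavior)
       omp_set_max_active_levels( 4 );    // accepted and stored in the task ICVs, read back
                                          // by __kmp_get_max_active_levels() below
*/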
2664 
2665 /* Gets max_active_levels */
2666 int
2667 __kmp_get_max_active_levels( int gtid )
2668 {
2669     kmp_info_t *thread;
2670 
2671     KF_TRACE( 10, ( "__kmp_get_max_active_levels: thread %d\n", gtid ) );
2672     KMP_DEBUG_ASSERT( __kmp_init_serial );
2673 
2674     thread = __kmp_threads[ gtid ];
2675     KMP_DEBUG_ASSERT( thread->th.th_current_task );
2676     KF_TRACE( 10, ( "__kmp_get_max_active_levels: thread %d, curtask=%p, curtask_maxaclevel=%d\n",
2677         gtid, thread->th.th_current_task, thread->th.th_current_task->td_icvs.max_active_levels ) );
2678     return thread->th.th_current_task->td_icvs.max_active_levels;
2679 }
2680 
2681 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2682 void
2683 __kmp_set_schedule( int gtid, kmp_sched_t kind, int chunk )
2684 {
2685     kmp_info_t *thread;
2686 //    kmp_team_t *team;
2687 
2688     KF_TRACE( 10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n", gtid, (int)kind, chunk ));
2689     KMP_DEBUG_ASSERT( __kmp_init_serial );
2690 
2691     // Check if the kind parameter is valid, correct if needed.
2692     // Valid parameters should fit in one of two intervals - standard or extended:
2693     //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2694     // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
2695     if ( kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2696        ( kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std ) )
2697     {
2698         // TODO: Hint needs attention in case we change the default schedule.
2699         __kmp_msg(
2700             kmp_ms_warning,
2701             KMP_MSG( ScheduleKindOutOfRange, kind ),
2702             KMP_HNT( DefaultScheduleKindUsed, "static, no chunk" ),
2703             __kmp_msg_null
2704         );
2705         kind = kmp_sched_default;
2706         chunk = 0;         // ignore chunk value in case of bad kind
2707     }
2708 
2709     thread = __kmp_threads[ gtid ];
2710 
2711     __kmp_save_internal_controls( thread );
2712 
2713     if ( kind < kmp_sched_upper_std ) {
2714         if ( kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK ) {
            // distinguish static chunked vs. unchunked:
            // an invalid chunk value indicates the unchunked schedule (which is the default)
2717             thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2718         } else {
2719             thread->th.th_current_task->td_icvs.sched.r_sched_type = __kmp_sch_map[ kind - kmp_sched_lower - 1 ];
2720         }
2721     } else {
2722         //    __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - kmp_sched_lower - 2 ];
2723         thread->th.th_current_task->td_icvs.sched.r_sched_type =
2724             __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - kmp_sched_lower - 2 ];
2725     }
2726     if ( kind == kmp_sched_auto ) {
2727         // ignore parameter chunk for schedule auto
2728         thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2729     } else {
2730         thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2731     }
2732 }
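
/*
   Mapping sketch (user-level view; assumes the standard omp_set_schedule() entry
   point forwards the kind/chunk pair here unchanged):

       omp_set_schedule( omp_sched_dynamic, 4 );   // stored as a dynamic, chunked schedule
                                                   // with chunk == 4
       omp_set_schedule( omp_sched_static, 0 );    // chunk below KMP_DEFAULT_CHUNK selects the
                                                   // plain (unchunked) static schedule
       omp_set_schedule( (omp_sched_t)999, 7 );    // out-of-range kind: a warning is issued, the
                                                   // default schedule is used and the chunk is ignored
*/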
2733 
2734 /* Gets def_sched_var ICV values */
2735 void
2736 __kmp_get_schedule( int gtid, kmp_sched_t * kind, int * chunk )
2737 {
2738     kmp_info_t     *thread;
2739     enum sched_type th_type;
2740 
2741     KF_TRACE( 10, ("__kmp_get_schedule: thread %d\n", gtid ));
2742     KMP_DEBUG_ASSERT( __kmp_init_serial );
2743 
2744     thread = __kmp_threads[ gtid ];
2745 
2746     th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2747 
2748     switch ( th_type ) {
2749     case kmp_sch_static:
2750     case kmp_sch_static_greedy:
2751     case kmp_sch_static_balanced:
2752         *kind = kmp_sched_static;
        *chunk = 0;   // chunk was not set; report zero to signal that fact
2754         return;
2755     case kmp_sch_static_chunked:
2756         *kind = kmp_sched_static;
2757         break;
2758     case kmp_sch_dynamic_chunked:
2759         *kind = kmp_sched_dynamic;
2760         break;
2761     case kmp_sch_guided_chunked:
2762     case kmp_sch_guided_iterative_chunked:
2763     case kmp_sch_guided_analytical_chunked:
2764         *kind = kmp_sched_guided;
2765         break;
2766     case kmp_sch_auto:
2767         *kind = kmp_sched_auto;
2768         break;
2769     case kmp_sch_trapezoidal:
2770         *kind = kmp_sched_trapezoidal;
2771         break;
2772 #if KMP_STATIC_STEAL_ENABLED
2773     case kmp_sch_static_steal:
2774         *kind = kmp_sched_static_steal;
2775         break;
2776 #endif
2777     default:
2778         KMP_FATAL( UnknownSchedulingType, th_type );
2779     }
2780 
2781     *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2782 }
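
/*
   Read-back sketch (user-level view; assumes the standard omp_get_schedule() entry
   point lands here):

       omp_sched_t kind; int chunk;
       omp_set_schedule( omp_sched_guided, 8 );
       omp_get_schedule( &kind, &chunk );   // kind == omp_sched_guided, chunk == 8
       omp_set_schedule( omp_sched_static, 0 );
       omp_get_schedule( &kind, &chunk );   // kind == omp_sched_static, chunk == 0
                                            // (zero signals that no chunk was set)
*/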
2783 
2784 int
2785 __kmp_get_ancestor_thread_num( int gtid, int level ) {
2786 
2787     int ii, dd;
2788     kmp_team_t *team;
2789     kmp_info_t *thr;
2790 
2791     KF_TRACE( 10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level ));
2792     KMP_DEBUG_ASSERT( __kmp_init_serial );
2793 
2794     // validate level
2795     if( level == 0 ) return 0;
2796     if( level < 0 ) return -1;
2797     thr = __kmp_threads[ gtid ];
2798     team = thr->th.th_team;
2799     ii = team->t.t_level;
2800     if( level > ii ) return -1;
2801 
2802 #if OMP_40_ENABLED
2803     if( thr->th.th_teams_microtask ) {
        // AC: we are in a teams region where multiple nested teams have the same level
        int tlevel = thr->th.th_teams_level; // the level of the teams construct
        if( level <= tlevel ) { // otherwise the usual algorithm works (it will not touch the teams)
            KMP_DEBUG_ASSERT( ii >= tlevel );
            // AC: we need to pass by the teams league, so artificially increase ii
            if ( ii == tlevel ) {
                ii += 2; // three teams have the same level
            } else {
                ii ++;   // two teams have the same level
2813             }
2814         }
2815     }
2816 #endif
2817 
2818     if( ii == level ) return __kmp_tid_from_gtid( gtid );
2819 
2820     dd = team->t.t_serialized;
2821     level++;
2822     while( ii > level )
2823     {
2824         for( dd = team->t.t_serialized; ( dd > 0 ) && ( ii > level ); dd--, ii-- )
2825         {
2826         }
2827         if( ( team->t.t_serialized ) && ( !dd ) ) {
2828             team = team->t.t_parent;
2829             continue;
2830         }
2831         if( ii > level ) {
2832             team = team->t.t_parent;
2833             dd = team->t.t_serialized;
2834             ii--;
2835         }
2836     }
2837 
2838     return ( dd > 1 ) ? ( 0 ) : ( team->t.t_master_tid );
2839 }
2840 
2841 int
2842 __kmp_get_team_size( int gtid, int level ) {
2843 
2844     int ii, dd;
2845     kmp_team_t *team;
2846     kmp_info_t *thr;
2847 
2848     KF_TRACE( 10, ("__kmp_get_team_size: thread %d %d\n", gtid, level ));
2849     KMP_DEBUG_ASSERT( __kmp_init_serial );
2850 
2851     // validate level
2852     if( level == 0 ) return 1;
2853     if( level < 0 ) return -1;
2854     thr = __kmp_threads[ gtid ];
2855     team = thr->th.th_team;
2856     ii = team->t.t_level;
2857     if( level > ii ) return -1;
2858 
2859 #if OMP_40_ENABLED
2860     if( thr->th.th_teams_microtask ) {
        // AC: we are in a teams region where multiple nested teams have the same level
        int tlevel = thr->th.th_teams_level; // the level of the teams construct
        if( level <= tlevel ) { // otherwise the usual algorithm works (it will not touch the teams)
            KMP_DEBUG_ASSERT( ii >= tlevel );
            // AC: we need to pass by the teams league, so artificially increase ii
            if ( ii == tlevel ) {
                ii += 2; // three teams have the same level
            } else {
                ii ++;   // two teams have the same level
2870             }
2871         }
2872     }
2873 #endif
2874 
2875     while( ii > level )
2876     {
2877         for( dd = team->t.t_serialized; ( dd > 0 ) && ( ii > level ); dd--, ii-- )
2878         {
2879         }
2880         if( team->t.t_serialized && ( !dd ) ) {
2881             team = team->t.t_parent;
2882             continue;
2883         }
2884         if( ii > level ) {
2885             team = team->t.t_parent;
2886             ii--;
2887         }
2888     }
2889 
2890     return team->t.t_nproc;
2891 }
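
/*
   Usage sketch for the two queries above (user-level view; assumes nested parallelism
   is enabled so the inner region is active with its own team):

       #pragma omp parallel num_threads(4)
       {
           #pragma omp parallel num_threads(2)
           {
               int outer_tid  = omp_get_ancestor_thread_num( 1 );   // 0..3
               int inner_tid  = omp_get_ancestor_thread_num( 2 );   // 0..1
               int outer_size = omp_get_team_size( 1 );             // 4
               int inner_size = omp_get_team_size( 2 );             // 2
               // level 0 is the initial team: thread number 0, size 1;
               // negative or too-deep levels make both queries return -1
           }
       }
*/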
2892 
2893 kmp_r_sched_t
2894 __kmp_get_schedule_global() {
// This routine exists because the pairs (__kmp_sched, __kmp_chunk) and (__kmp_static, __kmp_guided)
// may be changed by kmp_set_defaults independently, so the up-to-date schedule is assembled here.
2897 
2898     kmp_r_sched_t r_sched;
2899 
2900     // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static, __kmp_guided
2901     // __kmp_sched should keep original value, so that user can set KMP_SCHEDULE multiple times,
2902     // and thus have different run-time schedules in different roots (even in OMP 2.5)
2903     if ( __kmp_sched == kmp_sch_static ) {
2904         r_sched.r_sched_type = __kmp_static; // replace STATIC with more detailed schedule (balanced or greedy)
2905     } else if ( __kmp_sched == kmp_sch_guided_chunked ) {
2906         r_sched.r_sched_type = __kmp_guided; // replace GUIDED with more detailed schedule (iterative or analytical)
2907     } else {
2908         r_sched.r_sched_type = __kmp_sched;  // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2909     }
2910 
    if ( __kmp_chunk < KMP_DEFAULT_CHUNK ) { // __kmp_chunk may be wrong here (if it was never set)
2912         r_sched.chunk = KMP_DEFAULT_CHUNK;
2913     } else {
2914         r_sched.chunk = __kmp_chunk;
2915     }
2916 
2917     return r_sched;
2918 }
2919 
2920 /* ------------------------------------------------------------------------ */
2921 /* ------------------------------------------------------------------------ */
2922 
2923 
2924 /*
 * Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
2926  * at least argc number of *t_argv entries for the requested team.
2927  */
2928 static void
2929 __kmp_alloc_argv_entries( int argc, kmp_team_t *team, int realloc )
2930 {
2931 
2932     KMP_DEBUG_ASSERT( team );
2933     if( !realloc || argc > team->t.t_max_argc ) {
2934 
2935         KA_TRACE( 100, ( "__kmp_alloc_argv_entries: team %d: needed entries=%d, current entries=%d\n",
2936                          team->t.t_id, argc, ( realloc ) ? team->t.t_max_argc : 0 ));
2937         /* if previously allocated heap space for args, free them */
2938         if ( realloc && team->t.t_argv != &team->t.t_inline_argv[0] )
2939             __kmp_free( (void *) team->t.t_argv );
2940 
2941         if ( argc <= KMP_INLINE_ARGV_ENTRIES ) {
2942             /* use unused space in the cache line for arguments */
2943             team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
2944             KA_TRACE( 100, ( "__kmp_alloc_argv_entries: team %d: inline allocate %d argv entries\n",
2945                              team->t.t_id, team->t.t_max_argc ));
2946             team->t.t_argv = &team->t.t_inline_argv[0];
2947             if ( __kmp_storage_map ) {
2948                 __kmp_print_storage_map_gtid( -1, &team->t.t_inline_argv[0],
2949                                          &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
2950                                          (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES),
2951                                          "team_%d.t_inline_argv",
2952                                          team->t.t_id );
2953             }
2954         } else {
2955             /* allocate space for arguments in the heap */
2956             team->t.t_max_argc = ( argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1 )) ?
2957                                      KMP_MIN_MALLOC_ARGV_ENTRIES : 2 * argc;
2958             KA_TRACE( 100, ( "__kmp_alloc_argv_entries: team %d: dynamic allocate %d argv entries\n",
2959                              team->t.t_id, team->t.t_max_argc ));
2960             team->t.t_argv     = (void**) __kmp_page_allocate( sizeof(void*) * team->t.t_max_argc );
2961             if ( __kmp_storage_map ) {
2962                 __kmp_print_storage_map_gtid( -1, &team->t.t_argv[0], &team->t.t_argv[team->t.t_max_argc],
2963                                          sizeof(void *) * team->t.t_max_argc, "team_%d.t_argv",
2964                                          team->t.t_id );
2965             }
2966         }
2967     }
2968 }
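
/*
   Sizing rule illustrated (the concrete constants are defined in kmp.h and depend on
   the build configuration; only the shape of the policy is shown here):

       argc <= KMP_INLINE_ARGV_ENTRIES
           -> t_argv points at t_inline_argv, capacity KMP_INLINE_ARGV_ENTRIES
       argc <= KMP_MIN_MALLOC_ARGV_ENTRIES / 2
           -> heap allocation of KMP_MIN_MALLOC_ARGV_ENTRIES entries
       otherwise
           -> heap allocation of 2 * argc entries, leaving headroom for later reuse
*/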
2969 
2970 static void
2971 __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth)
2972 {
2973     int i;
2974     int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
2975     team->t.t_threads = (kmp_info_t**) __kmp_allocate( sizeof(kmp_info_t*) * max_nth );
2976     team->t.t_disp_buffer = (dispatch_shared_info_t*)
2977         __kmp_allocate( sizeof(dispatch_shared_info_t) * num_disp_buff );
2978     team->t.t_dispatch = (kmp_disp_t*) __kmp_allocate( sizeof(kmp_disp_t) * max_nth );
2979     team->t.t_implicit_task_taskdata = (kmp_taskdata_t*) __kmp_allocate( sizeof(kmp_taskdata_t) * max_nth );
2980     team->t.t_max_nproc = max_nth;
2981 
2982     /* setup dispatch buffers */
2983     for(i = 0 ; i < num_disp_buff; ++i) {
2984         team->t.t_disp_buffer[i].buffer_index = i;
2985 #if OMP_45_ENABLED
2986         team->t.t_disp_buffer[i].doacross_buf_idx = i;
2987 #endif
2988     }
2989 }
2990 
2991 static void
2992 __kmp_free_team_arrays(kmp_team_t *team) {
2993     /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
2994     int i;
2995     for ( i = 0; i < team->t.t_max_nproc; ++ i ) {
2996         if ( team->t.t_dispatch[ i ].th_disp_buffer != NULL ) {
2997             __kmp_free( team->t.t_dispatch[ i ].th_disp_buffer );
2998             team->t.t_dispatch[ i ].th_disp_buffer = NULL;
2999         }; // if
3000     }; // for
3001     __kmp_free(team->t.t_threads);
3002     __kmp_free(team->t.t_disp_buffer);
3003     __kmp_free(team->t.t_dispatch);
3004     __kmp_free(team->t.t_implicit_task_taskdata);
3005     team->t.t_threads     = NULL;
3006     team->t.t_disp_buffer = NULL;
3007     team->t.t_dispatch    = NULL;
3008     team->t.t_implicit_task_taskdata = 0;
3009 }
3010 
3011 static void
3012 __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3013     kmp_info_t **oldThreads = team->t.t_threads;
3014 
3015     __kmp_free(team->t.t_disp_buffer);
3016     __kmp_free(team->t.t_dispatch);
3017     __kmp_free(team->t.t_implicit_task_taskdata);
3018     __kmp_allocate_team_arrays(team, max_nth);
3019 
3020     KMP_MEMCPY(team->t.t_threads, oldThreads, team->t.t_nproc * sizeof (kmp_info_t*));
3021 
3022     __kmp_free(oldThreads);
3023 }
3024 
3025 static kmp_internal_control_t
3026 __kmp_get_global_icvs( void ) {
3027 
3028     kmp_r_sched_t r_sched = __kmp_get_schedule_global(); // get current state of scheduling globals
3029 
3030 #if OMP_40_ENABLED
3031     KMP_DEBUG_ASSERT( __kmp_nested_proc_bind.used > 0 );
3032 #endif /* OMP_40_ENABLED */
3033 
3034     kmp_internal_control_t g_icvs = {
3035       0,                            //int serial_nesting_level; //corresponds to the value of the th_team_serialized field
3036       (kmp_int8)__kmp_dflt_nested,            //int nested;               //internal control for nested parallelism (per thread)
3037       (kmp_int8)__kmp_global.g.g_dynamic,                                 //internal control for dynamic adjustment of threads (per thread)
3038       (kmp_int8)__kmp_env_blocktime,          //int bt_set;               //internal control for whether blocktime is explicitly set
3039       __kmp_dflt_blocktime,         //int blocktime;            //internal control for blocktime
3040 #if KMP_USE_MONITOR
3041       __kmp_bt_intervals,           //int bt_intervals;         //internal control for blocktime intervals
3042 #endif
3043       __kmp_dflt_team_nth,          //int nproc;                //internal control for # of threads for next parallel region (per thread)
3044                                     // (use a max ub on value if __kmp_parallel_initialize not called yet)
3045       __kmp_dflt_max_active_levels, //int max_active_levels;    //internal control for max_active_levels
3046       r_sched,                      //kmp_r_sched_t sched;      //internal control for runtime schedule {sched,chunk} pair
3047 #if OMP_40_ENABLED
3048       __kmp_nested_proc_bind.bind_types[0],
3049       __kmp_default_device,
3050 #endif /* OMP_40_ENABLED */
3051       NULL                          //struct kmp_internal_control *next;
3052     };
3053 
3054     return g_icvs;
3055 }
3056 
3057 static kmp_internal_control_t
3058 __kmp_get_x_global_icvs( const kmp_team_t *team ) {
3059 
3060     kmp_internal_control_t gx_icvs;
    gx_icvs.serial_nesting_level = 0; // probably =team->t.t_serialized, as in __kmp_save_internal_controls
3062     copy_icvs( & gx_icvs, & team->t.t_threads[0]->th.th_current_task->td_icvs );
3063     gx_icvs.next = NULL;
3064 
3065     return gx_icvs;
3066 }
3067 
3068 static void
3069 __kmp_initialize_root( kmp_root_t *root )
3070 {
3071     int           f;
3072     kmp_team_t   *root_team;
3073     kmp_team_t   *hot_team;
3074     int           hot_team_max_nth;
3075     kmp_r_sched_t r_sched = __kmp_get_schedule_global(); // get current state of scheduling globals
3076     kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3077     KMP_DEBUG_ASSERT( root );
3078     KMP_ASSERT( ! root->r.r_begin );
3079 
3080     /* setup the root state structure */
3081     __kmp_init_lock( &root->r.r_begin_lock );
3082     root->r.r_begin        = FALSE;
3083     root->r.r_active       = FALSE;
3084     root->r.r_in_parallel  = 0;
3085     root->r.r_blocktime    = __kmp_dflt_blocktime;
3086     root->r.r_nested       = __kmp_dflt_nested;
3087 
3088     /* setup the root team for this task */
3089     /* allocate the root team structure */
3090     KF_TRACE( 10, ( "__kmp_initialize_root: before root_team\n" ) );
3091 
3092     root_team =
3093         __kmp_allocate_team(
3094             root,
3095             1,                                                         // new_nproc
3096             1,                                                         // max_nproc
3097 #if OMPT_SUPPORT
3098             0, // root parallel id
3099 #endif
3100 #if OMP_40_ENABLED
3101             __kmp_nested_proc_bind.bind_types[0],
3102 #endif
3103             &r_icvs,
3104             0                                                          // argc
3105             USE_NESTED_HOT_ARG(NULL)                                   // master thread is unknown
3106         );
3107 #if USE_DEBUGGER
3108     // Non-NULL value should be assigned to make the debugger display the root team.
3109     TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)( ~ 0 ));
3110 #endif
3111 
3112     KF_TRACE( 10, ( "__kmp_initialize_root: after root_team = %p\n", root_team ) );
3113 
3114     root->r.r_root_team = root_team;
3115     root_team->t.t_control_stack_top = NULL;
3116 
3117     /* initialize root team */
3118     root_team->t.t_threads[0] = NULL;
3119     root_team->t.t_nproc      = 1;
3120     root_team->t.t_serialized = 1;
3121     // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3122     root_team->t.t_sched.r_sched_type = r_sched.r_sched_type;
3123     root_team->t.t_sched.chunk        = r_sched.chunk;
3124     KA_TRACE( 20, ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3125                     root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
3126 
3127     /* setup the  hot team for this task */
3128     /* allocate the hot team structure */
3129     KF_TRACE( 10, ( "__kmp_initialize_root: before hot_team\n" ) );
3130 
3131     hot_team =
3132         __kmp_allocate_team(
3133             root,
3134             1,                                                         // new_nproc
3135             __kmp_dflt_team_nth_ub * 2,                                // max_nproc
3136 #if OMPT_SUPPORT
3137             0, // root parallel id
3138 #endif
3139 #if OMP_40_ENABLED
3140             __kmp_nested_proc_bind.bind_types[0],
3141 #endif
3142             &r_icvs,
3143             0                                                          // argc
3144             USE_NESTED_HOT_ARG(NULL)                                   // master thread is unknown
3145         );
3146     KF_TRACE( 10, ( "__kmp_initialize_root: after hot_team = %p\n", hot_team ) );
3147 
3148     root->r.r_hot_team = hot_team;
3149     root_team->t.t_control_stack_top = NULL;
3150 
3151     /* first-time initialization */
3152     hot_team->t.t_parent = root_team;
3153 
3154     /* initialize hot team */
3155     hot_team_max_nth = hot_team->t.t_max_nproc;
3156     for ( f = 0; f < hot_team_max_nth; ++ f ) {
3157         hot_team->t.t_threads[ f ] = NULL;
3158     }; // for
3159     hot_team->t.t_nproc = 1;
3160     // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3161     hot_team->t.t_sched.r_sched_type = r_sched.r_sched_type;
3162     hot_team->t.t_sched.chunk        = r_sched.chunk;
3163     hot_team->t.t_size_changed = 0;
3164 }
3165 
3166 #ifdef KMP_DEBUG
3167 
3168 
3169 typedef struct kmp_team_list_item {
3170     kmp_team_p const *           entry;
3171     struct kmp_team_list_item *  next;
3172 } kmp_team_list_item_t;
3173 typedef kmp_team_list_item_t * kmp_team_list_t;
3174 
3175 
3176 static void
3177 __kmp_print_structure_team_accum(    // Add team to list of teams.
3178     kmp_team_list_t     list,        // List of teams.
3179     kmp_team_p const *  team         // Team to add.
3180 ) {
3181 
3182     // List must terminate with item where both entry and next are NULL.
3183     // Team is added to the list only once.
3184     // List is sorted in ascending order by team id.
3185     // Team id is *not* a key.
3186 
3187     kmp_team_list_t l;
3188 
3189     KMP_DEBUG_ASSERT( list != NULL );
3190     if ( team == NULL ) {
3191         return;
3192     }; // if
3193 
3194     __kmp_print_structure_team_accum( list, team->t.t_parent );
3195     __kmp_print_structure_team_accum( list, team->t.t_next_pool );
3196 
3197     // Search list for the team.
3198     l = list;
3199     while ( l->next != NULL && l->entry != team ) {
3200         l = l->next;
3201     }; // while
3202     if ( l->next != NULL ) {
3203         return;  // Team has been added before, exit.
3204     }; // if
3205 
3206     // Team is not found. Search list again for insertion point.
3207     l = list;
3208     while ( l->next != NULL && l->entry->t.t_id <= team->t.t_id ) {
3209         l = l->next;
3210     }; // while
3211 
3212     // Insert team.
3213     {
3214         kmp_team_list_item_t * item =
3215             (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC( sizeof(  kmp_team_list_item_t ) );
3216         * item = * l;
3217         l->entry = team;
3218         l->next  = item;
3219     }
3220 
3221 }
3222 
3223 static void
3224 __kmp_print_structure_team(
3225     char const *       title,
3226     kmp_team_p const * team
3227 
3228 ) {
3229     __kmp_printf( "%s", title );
3230     if ( team != NULL ) {
3231         __kmp_printf( "%2x %p\n", team->t.t_id, team );
3232     } else {
3233         __kmp_printf( " - (nil)\n" );
3234     }; // if
3235 }
3236 
3237 static void
3238 __kmp_print_structure_thread(
3239     char const *       title,
3240     kmp_info_p const * thread
3241 
3242 ) {
3243     __kmp_printf( "%s", title );
3244     if ( thread != NULL ) {
3245         __kmp_printf( "%2d %p\n", thread->th.th_info.ds.ds_gtid, thread );
3246     } else {
3247         __kmp_printf( " - (nil)\n" );
3248     }; // if
3249 }
3250 
3251 void
3252 __kmp_print_structure(
3253     void
3254 ) {
3255 
3256     kmp_team_list_t list;
3257 
3258     // Initialize list of teams.
3259     list = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC( sizeof( kmp_team_list_item_t ) );
3260     list->entry = NULL;
3261     list->next  = NULL;
3262 
3263     __kmp_printf( "\n------------------------------\nGlobal Thread Table\n------------------------------\n" );
3264     {
3265         int gtid;
3266         for ( gtid = 0; gtid < __kmp_threads_capacity; ++ gtid ) {
3267             __kmp_printf( "%2d", gtid );
3268             if ( __kmp_threads != NULL ) {
3269                 __kmp_printf( " %p", __kmp_threads[ gtid ] );
3270             }; // if
3271             if ( __kmp_root != NULL ) {
3272                 __kmp_printf( " %p", __kmp_root[ gtid ] );
3273             }; // if
3274             __kmp_printf( "\n" );
3275         }; // for gtid
3276     }
3277 
3278     // Print out __kmp_threads array.
3279     __kmp_printf( "\n------------------------------\nThreads\n------------------------------\n" );
3280     if ( __kmp_threads != NULL ) {
3281         int gtid;
3282         for ( gtid = 0; gtid < __kmp_threads_capacity; ++ gtid ) {
3283             kmp_info_t const * thread = __kmp_threads[ gtid ];
3284             if ( thread != NULL ) {
3285                 __kmp_printf( "GTID %2d %p:\n", gtid, thread );
3286                 __kmp_printf(                 "    Our Root:        %p\n", thread->th.th_root );
3287                 __kmp_print_structure_team(   "    Our Team:     ",        thread->th.th_team );
3288                 __kmp_print_structure_team(   "    Serial Team:  ",        thread->th.th_serial_team );
3289                 __kmp_printf(                 "    Threads:      %2d\n",   thread->th.th_team_nproc );
3290                 __kmp_print_structure_thread( "    Master:       ",        thread->th.th_team_master );
3291                 __kmp_printf(                 "    Serialized?:  %2d\n",   thread->th.th_team_serialized );
3292                 __kmp_printf(                 "    Set NProc:    %2d\n",   thread->th.th_set_nproc );
3293 #if OMP_40_ENABLED
3294                 __kmp_printf(                 "    Set Proc Bind: %2d\n",  thread->th.th_set_proc_bind );
3295 #endif
3296                 __kmp_print_structure_thread( "    Next in pool: ",        thread->th.th_next_pool );
3297                 __kmp_printf( "\n" );
3298                 __kmp_print_structure_team_accum( list, thread->th.th_team );
3299                 __kmp_print_structure_team_accum( list, thread->th.th_serial_team );
3300             }; // if
3301         }; // for gtid
3302     } else {
3303         __kmp_printf( "Threads array is not allocated.\n" );
3304     }; // if
3305 
3306     // Print out __kmp_root array.
3307     __kmp_printf( "\n------------------------------\nUbers\n------------------------------\n" );
3308     if ( __kmp_root != NULL ) {
3309         int gtid;
3310         for ( gtid = 0; gtid < __kmp_threads_capacity; ++ gtid ) {
3311             kmp_root_t const * root = __kmp_root[ gtid ];
3312             if ( root != NULL ) {
3313                 __kmp_printf( "GTID %2d %p:\n", gtid, root );
3314                 __kmp_print_structure_team(   "    Root Team:    ",      root->r.r_root_team );
3315                 __kmp_print_structure_team(   "    Hot Team:     ",      root->r.r_hot_team );
3316                 __kmp_print_structure_thread( "    Uber Thread:  ",      root->r.r_uber_thread );
3317                 __kmp_printf(                 "    Active?:      %2d\n", root->r.r_active );
3318                 __kmp_printf(                 "    Nested?:      %2d\n", root->r.r_nested );
3319                 __kmp_printf(                 "    In Parallel:  %2d\n", root->r.r_in_parallel );
3320                 __kmp_printf( "\n" );
3321                 __kmp_print_structure_team_accum( list, root->r.r_root_team );
3322                 __kmp_print_structure_team_accum( list, root->r.r_hot_team );
3323             }; // if
3324         }; // for gtid
3325     } else {
3326         __kmp_printf( "Ubers array is not allocated.\n" );
3327     }; // if
3328 
3329     __kmp_printf( "\n------------------------------\nTeams\n------------------------------\n" );
3330     while ( list->next != NULL ) {
3331         kmp_team_p const * team = list->entry;
3332         int i;
3333         __kmp_printf( "Team %2x %p:\n", team->t.t_id, team );
3334         __kmp_print_structure_team( "    Parent Team:      ",      team->t.t_parent );
3335         __kmp_printf(               "    Master TID:       %2d\n", team->t.t_master_tid );
3336         __kmp_printf(               "    Max threads:      %2d\n", team->t.t_max_nproc );
3337         __kmp_printf(               "    Levels of serial: %2d\n", team->t.t_serialized );
3338         __kmp_printf(               "    Number threads:   %2d\n", team->t.t_nproc );
3339         for ( i = 0; i < team->t.t_nproc; ++ i ) {
3340             __kmp_printf(           "    Thread %2d:      ", i );
3341             __kmp_print_structure_thread( "", team->t.t_threads[ i ] );
3342         }; // for i
3343         __kmp_print_structure_team( "    Next in pool:     ",      team->t.t_next_pool );
3344         __kmp_printf( "\n" );
3345         list = list->next;
3346     }; // while
3347 
3348     // Print out __kmp_thread_pool and __kmp_team_pool.
3349     __kmp_printf( "\n------------------------------\nPools\n------------------------------\n" );
3350     __kmp_print_structure_thread(   "Thread pool:          ", (kmp_info_t *)__kmp_thread_pool );
3351     __kmp_print_structure_team(     "Team pool:            ", (kmp_team_t *)__kmp_team_pool );
3352     __kmp_printf( "\n" );
3353 
3354     // Free team list.
3355     while ( list != NULL ) {
3356         kmp_team_list_item_t * item = list;
3357         list = list->next;
3358         KMP_INTERNAL_FREE( item );
3359     }; // while
3360 
3361 }
3362 
3363 #endif
3364 
3365 
3366 //---------------------------------------------------------------------------
3367 //  Stuff for per-thread fast random number generator
3368 //  Table of primes
3369 
3370 static const unsigned __kmp_primes[] = {
3371   0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5,
3372   0xba5703f5, 0xb495a877, 0xe1626741, 0x79695e6b,
3373   0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3374   0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b,
3375   0xbe4d6fe9, 0x5f15e201, 0x99afc3fd, 0xf3f16801,
3376   0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3377   0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed,
3378   0x085a3d61, 0x46eb5ea7, 0x3d9910ed, 0x2e687b5b,
3379   0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3380   0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7,
3381   0x54581edb, 0xf2480f45, 0x0bb9288f, 0xef1affc7,
3382   0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3383   0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b,
3384   0xfc411073, 0xc3749363, 0xb892d829, 0x3549366b,
3385   0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3386   0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f
3387 };
3388 
3389 //---------------------------------------------------------------------------
3390 //  __kmp_get_random: Get a random number using a linear congruential method.
3391 
3392 unsigned short
3393 __kmp_get_random( kmp_info_t * thread )
3394 {
3395   unsigned x = thread->th.th_x;
3396   unsigned short r = x>>16;
3397 
3398   thread->th.th_x = x*thread->th.th_a+1;
3399 
3400   KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3401          thread->th.th_info.ds.ds_tid, r) );
3402 
3403   return r;
3404 }
3405 //--------------------------------------------------------
3406 // __kmp_init_random: Initialize a random number generator
3407 
3408 void
3409 __kmp_init_random( kmp_info_t * thread )
3410 {
3411   unsigned seed = thread->th.th_info.ds.ds_tid;
3412 
3413   thread->th.th_a = __kmp_primes[seed%(sizeof(__kmp_primes)/sizeof(__kmp_primes[0]))];
3414   thread->th.th_x = (seed+1)*thread->th.th_a+1;
3415   KA_TRACE(30, ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a) );
3416 }
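
/*
   Standalone sketch of the generator above (same recurrence, kept outside the
   runtime's thread structures; the per-thread multiplier normally comes from
   __kmp_primes[], here the first table entry is used for brevity):

       typedef struct { unsigned x, a; } sketch_rng_t;

       static void sketch_rng_init( sketch_rng_t *r, unsigned tid )
       {
           r->a = 0x9e3779b1u;                  // __kmp_primes[ tid % table_size ] in the runtime
           r->x = ( tid + 1 ) * r->a + 1;
       }

       static unsigned short sketch_rng_next( sketch_rng_t *r )
       {
           unsigned short out = (unsigned short)( r->x >> 16 );   // return the better-mixed high bits
           r->x = r->x * r->a + 1;                                // linear congruential step
           return out;
       }
*/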
3417 
3418 
3419 #if KMP_OS_WINDOWS
3420 /* reclaim array entries for root threads that are already dead, returns number reclaimed */
3421 static int
3422 __kmp_reclaim_dead_roots(void) {
3423     int i, r = 0;
3424 
3425     for(i = 0; i < __kmp_threads_capacity; ++i) {
3426         if( KMP_UBER_GTID( i ) &&
3427           !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3428           !__kmp_root[i]->r.r_active ) { // AC: reclaim only roots died in non-active state
3429             r += __kmp_unregister_root_other_thread(i);
3430         }
3431     }
3432     return r;
3433 }
3434 #endif
3435 
3436 /*
3437    This function attempts to create free entries in __kmp_threads and __kmp_root, and returns the number of
3438    free entries generated.
3439 
3440    For Windows* OS static library, the first mechanism used is to reclaim array entries for root threads that are
3441    already dead.
3442 
   On all platforms, expansion is attempted on the arrays __kmp_threads and __kmp_root, with the appropriate
   update to __kmp_threads_capacity.  Array capacity is increased by doubling, with clipping to
   __kmp_tp_capacity if a threadprivate cache array has been created.
3446    Synchronization with __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3447 
3448    After any dead root reclamation, if the clipping value allows array expansion to result in the generation
3449    of a total of nWish free slots, the function does that expansion.  If not, but the clipping value allows
3450    array expansion to result in the generation of a total of nNeed free slots, the function does that expansion.
3451    Otherwise, nothing is done beyond the possible initial root thread reclamation.  However, if nNeed is zero,
3452    a best-effort attempt is made to fulfil nWish as far as possible, i.e. the function will attempt to create
3453    as many free slots as possible up to nWish.
3454 
3455    If any argument is negative, the behavior is undefined.
3456 */
3457 static int
3458 __kmp_expand_threads(int nWish, int nNeed) {
3459     int added = 0;
3460     int old_tp_cached;
3461     int __kmp_actual_max_nth;
3462 
3463     if(nNeed > nWish) /* normalize the arguments */
3464         nWish = nNeed;
3465 #if KMP_OS_WINDOWS && !defined KMP_DYNAMIC_LIB
3466 /* only for Windows static library */
3467     /* reclaim array entries for root threads that are already dead */
3468     added = __kmp_reclaim_dead_roots();
3469 
3470     if(nNeed) {
3471         nNeed -= added;
3472         if(nNeed < 0)
3473             nNeed = 0;
3474     }
3475     if(nWish) {
3476         nWish -= added;
3477         if(nWish < 0)
3478             nWish = 0;
3479     }
3480 #endif
3481     if(nWish <= 0)
3482         return added;
3483 
3484     while(1) {
3485         int nTarget;
3486         int minimumRequiredCapacity;
3487         int newCapacity;
3488         kmp_info_t **newThreads;
3489         kmp_root_t **newRoot;
3490 
3491         //
3492         // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth.
3493         // If __kmp_max_nth is set to some value less than __kmp_sys_max_nth
3494         // by the user via OMP_THREAD_LIMIT, then __kmp_threads_capacity may
3495         // become > __kmp_max_nth in one of two ways:
3496         //
3497         // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
        //    may not be reused by another thread, so we may need to increase
3499         //    __kmp_threads_capacity to __kmp_max_threads + 1.
3500         //
3501         // 2) New foreign root(s) are encountered.  We always register new
3502         //    foreign roots.  This may cause a smaller # of threads to be
3503         //    allocated at subsequent parallel regions, but the worker threads
3504         //    hang around (and eventually go to sleep) and need slots in the
3505         //    __kmp_threads[] array.
3506         //
3507         // Anyway, that is the reason for moving the check to see if
        // __kmp_max_threads was exceeded into __kmp_reserve_threads()
3509         // instead of having it performed here. -BB
3510         //
3511         old_tp_cached = __kmp_tp_cached;
3512         __kmp_actual_max_nth = old_tp_cached ? __kmp_tp_capacity : __kmp_sys_max_nth;
3513         KMP_DEBUG_ASSERT(__kmp_actual_max_nth >= __kmp_threads_capacity);
3514 
3515         /* compute expansion headroom to check if we can expand and whether to aim for nWish or nNeed */
3516         nTarget = nWish;
3517         if(__kmp_actual_max_nth - __kmp_threads_capacity < nTarget) {
3518             /* can't fulfil nWish, so try nNeed */
3519             if(nNeed) {
3520                 nTarget = nNeed;
3521                 if(__kmp_actual_max_nth - __kmp_threads_capacity < nTarget) {
3522                     /* possible expansion too small -- give up */
3523                     break;
3524                 }
3525             } else {
3526                 /* best-effort */
3527                 nTarget = __kmp_actual_max_nth - __kmp_threads_capacity;
3528                 if(!nTarget) {
                    /* can't expand at all -- give up */
3530                     break;
3531                 }
3532             }
3533         }
3534         minimumRequiredCapacity = __kmp_threads_capacity + nTarget;
3535 
3536         newCapacity = __kmp_threads_capacity;
3537         do{
3538             newCapacity =
3539                 newCapacity <= (__kmp_actual_max_nth >> 1) ?
3540                 (newCapacity << 1) :
3541                 __kmp_actual_max_nth;
3542         } while(newCapacity < minimumRequiredCapacity);
3543         newThreads = (kmp_info_t**) __kmp_allocate((sizeof(kmp_info_t*) + sizeof(kmp_root_t*)) * newCapacity + CACHE_LINE);
3544         newRoot = (kmp_root_t**) ((char*)newThreads + sizeof(kmp_info_t*) * newCapacity );
3545         KMP_MEMCPY(newThreads, __kmp_threads, __kmp_threads_capacity * sizeof(kmp_info_t*));
3546         KMP_MEMCPY(newRoot, __kmp_root, __kmp_threads_capacity * sizeof(kmp_root_t*));
3547         memset(newThreads + __kmp_threads_capacity, 0,
3548                (newCapacity - __kmp_threads_capacity) * sizeof(kmp_info_t*));
3549         memset(newRoot + __kmp_threads_capacity, 0,
3550                (newCapacity - __kmp_threads_capacity) * sizeof(kmp_root_t*));
3551 
3552         if(!old_tp_cached && __kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3553             /* __kmp_tp_cached has changed, i.e. __kmpc_threadprivate_cached has allocated a threadprivate cache
3554                while we were allocating the expanded array, and our new capacity is larger than the threadprivate
3555                cache capacity, so we should deallocate the expanded arrays and try again.  This is the first check
3556                of a double-check pair.
3557             */
3558             __kmp_free(newThreads);
3559             continue; /* start over and try again */
3560         }
3561         __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3562         if(!old_tp_cached && __kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3563             /* Same check as above, but this time with the lock so we can be sure if we can succeed. */
3564             __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3565             __kmp_free(newThreads);
3566             continue; /* start over and try again */
3567         } else {
3568             /* success */
            // __kmp_free( __kmp_threads ); // ATT: It leads to crash. Needs to be investigated.
3570             //
3571             *(kmp_info_t**volatile*)&__kmp_threads = newThreads;
3572             *(kmp_root_t**volatile*)&__kmp_root = newRoot;
3573             added += newCapacity - __kmp_threads_capacity;
3574             *(volatile int*)&__kmp_threads_capacity = newCapacity;
3575             __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3576             break; /* succeeded, so we can exit the loop */
3577         }
3578     }
3579     return added;
3580 }
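
/*
   Growth policy illustrated with hypothetical numbers (the real bound comes from
   __kmp_tp_capacity or __kmp_sys_max_nth at run time):

       __kmp_threads_capacity == 32, nTarget == 40  ->  minimumRequiredCapacity == 72
       candidate capacities: 32 -> 64 -> 128        ->  newCapacity == 128, the first
                                                        doubling that reaches 72; each
                                                        step is capped at __kmp_actual_max_nth
*/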
3581 
3582 /* register the current thread as a root thread and obtain our gtid */
3583 /* we must have the __kmp_initz_lock held at this point */
/* Argument TRUE only if this is the thread that calls from __kmp_do_serial_initialize() */
3585 int
3586 __kmp_register_root( int initial_thread )
3587 {
3588     kmp_info_t *root_thread;
3589     kmp_root_t *root;
3590     int         gtid;
3591     int         capacity;
3592     __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
3593     KA_TRACE( 20, ("__kmp_register_root: entered\n"));
3594     KMP_MB();
3595 
3596 
3597     /*
3598         2007-03-02:
3599 
        If the initial thread did not invoke the OpenMP RTL yet, and this thread is not an initial one,
        the "__kmp_all_nth >= __kmp_threads_capacity" condition does not work as expected -- it may
        return false (meaning there is at least one empty slot in the __kmp_threads array), but it
        is possible that the only free slot is #0, which is reserved for the initial thread and so cannot
        be used for this one. The following code works around this bug.

        However, the right solution seems to be not reserving slot #0 for the initial thread, because:
            (1) there is no magic in slot #0, and
            (2) we cannot detect the initial thread reliably (the first thread which does serial
                initialization may not be a real initial thread).
3610     */
3611     capacity = __kmp_threads_capacity;
3612     if ( ! initial_thread && TCR_PTR(__kmp_threads[0]) == NULL ) {
3613         -- capacity;
3614     }; // if
3615 
3616     /* see if there are too many threads */
3617     if ( __kmp_all_nth >= capacity && !__kmp_expand_threads( 1, 1 ) ) {
3618         if ( __kmp_tp_cached ) {
3619             __kmp_msg(
3620                 kmp_ms_fatal,
3621                 KMP_MSG( CantRegisterNewThread ),
3622                 KMP_HNT( Set_ALL_THREADPRIVATE, __kmp_tp_capacity ),
3623                 KMP_HNT( PossibleSystemLimitOnThreads ),
3624                 __kmp_msg_null
3625             );
3626         }
3627         else {
3628             __kmp_msg(
3629                 kmp_ms_fatal,
3630                 KMP_MSG( CantRegisterNewThread ),
3631                 KMP_HNT( SystemLimitOnThreads ),
3632                 __kmp_msg_null
3633             );
3634         }
3635     }; // if
3636 
3637     /* find an available thread slot */
3638     /* Don't reassign the zero slot since we need that to only be used by initial
3639        thread */
3640     for( gtid=(initial_thread ? 0 : 1) ; TCR_PTR(__kmp_threads[gtid]) != NULL ; gtid++ )
3641         ;
3642     KA_TRACE( 1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid ));
3643     KMP_ASSERT( gtid < __kmp_threads_capacity );
3644 
3645     /* update global accounting */
3646     __kmp_all_nth ++;
3647     TCW_4(__kmp_nth, __kmp_nth + 1);
3648 
3649     //
3650     // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search)
3651     // for low numbers of procs, and method #2 (keyed API call) for higher
3652     // numbers of procs.
3653     //
3654     if ( __kmp_adjust_gtid_mode ) {
3655         if ( __kmp_all_nth >= __kmp_tls_gtid_min ) {
3656             if ( TCR_4(__kmp_gtid_mode) != 2) {
3657                 TCW_4(__kmp_gtid_mode, 2);
3658             }
3659         }
3660         else {
3661             if (TCR_4(__kmp_gtid_mode) != 1 ) {
3662                 TCW_4(__kmp_gtid_mode, 1);
3663             }
3664         }
3665     }
3666 
3667 #ifdef KMP_ADJUST_BLOCKTIME
3668     /* Adjust blocktime to zero if necessary            */
3669     /* Middle initialization might not have occurred yet */
3670     if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
3671         if ( __kmp_nth > __kmp_avail_proc ) {
3672             __kmp_zero_bt = TRUE;
3673         }
3674     }
3675 #endif /* KMP_ADJUST_BLOCKTIME */
3676 
3677     /* setup this new hierarchy */
3678     if( ! ( root = __kmp_root[gtid] )) {
3679         root = __kmp_root[gtid] = (kmp_root_t*) __kmp_allocate( sizeof(kmp_root_t) );
3680         KMP_DEBUG_ASSERT( ! root->r.r_root_team );
3681     }
3682 
3683 #if KMP_STATS_ENABLED
3684     // Initialize stats as soon as possible (right after gtid assignment).
3685     __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3686     KMP_START_EXPLICIT_TIMER(OMP_worker_thread_life);
3687     KMP_SET_THREAD_STATE(SERIAL_REGION);
3688     KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3689 #endif
3690     __kmp_initialize_root( root );
3691 
3692     /* setup new root thread structure */
3693     if( root->r.r_uber_thread ) {
3694         root_thread = root->r.r_uber_thread;
3695     } else {
3696         root_thread = (kmp_info_t*) __kmp_allocate( sizeof(kmp_info_t) );
3697         if ( __kmp_storage_map ) {
3698             __kmp_print_thread_storage_map( root_thread, gtid );
3699         }
3700         root_thread->th.th_info .ds.ds_gtid = gtid;
3701         root_thread->th.th_root =  root;
3702         if( __kmp_env_consistency_check ) {
3703             root_thread->th.th_cons = __kmp_allocate_cons_stack( gtid );
3704         }
3705         #if USE_FAST_MEMORY
3706             __kmp_initialize_fast_memory( root_thread );
3707         #endif /* USE_FAST_MEMORY */
3708 
3709         #if KMP_USE_BGET
3710             KMP_DEBUG_ASSERT( root_thread->th.th_local.bget_data == NULL );
3711             __kmp_initialize_bget( root_thread );
3712         #endif
3713         __kmp_init_random( root_thread );  // Initialize random number generator
3714     }
3715 
3716     /* setup the serial team held in reserve by the root thread */
3717     if( ! root_thread->th.th_serial_team ) {
3718         kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3719         KF_TRACE( 10, ( "__kmp_register_root: before serial_team\n" ) );
3720 
3721         root_thread->th.th_serial_team = __kmp_allocate_team( root, 1, 1,
3722 #if OMPT_SUPPORT
3723           0, // root parallel id
3724 #endif
3725 #if OMP_40_ENABLED
3726           proc_bind_default,
3727 #endif
3728           &r_icvs,
3729           0 USE_NESTED_HOT_ARG(NULL) );
3730     }
3731     KMP_ASSERT( root_thread->th.th_serial_team );
3732     KF_TRACE( 10, ( "__kmp_register_root: after serial_team = %p\n",
3733       root_thread->th.th_serial_team ) );
3734 
3735     /* drop root_thread into place */
3736     TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3737 
3738     root->r.r_root_team->t.t_threads[0] = root_thread;
3739     root->r.r_hot_team ->t.t_threads[0] = root_thread;
3740     root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3741     root_thread->th.th_serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for execution (it is unused for now).
3742     root->r.r_uber_thread = root_thread;
3743 
3744     /* initialize the thread, get it ready to go */
3745     __kmp_initialize_info( root_thread, root->r.r_root_team, 0, gtid );
3746     TCW_4(__kmp_init_gtid, TRUE);
3747 
3748     /* prepare the master thread for get_gtid() */
3749     __kmp_gtid_set_specific( gtid );
3750 
3751 #if USE_ITT_BUILD
3752     __kmp_itt_thread_name( gtid );
3753 #endif /* USE_ITT_BUILD */
3754 
3755     #ifdef KMP_TDATA_GTID
3756         __kmp_gtid = gtid;
3757     #endif
3758     __kmp_create_worker( gtid, root_thread, __kmp_stksize );
3759     KMP_DEBUG_ASSERT( __kmp_gtid_get_specific() == gtid );
3760 
3761     KA_TRACE( 20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, plain=%u\n",
3762                     gtid, __kmp_gtid_from_tid( 0, root->r.r_hot_team ),
3763                     root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3764                     KMP_INIT_BARRIER_STATE ) );
3765     { // Initialize barrier data.
3766         int b;
3767         for ( b = 0; b < bs_last_barrier; ++ b ) {
3768             root_thread->th.th_bar[ b ].bb.b_arrived        = KMP_INIT_BARRIER_STATE;
3769 #if USE_DEBUGGER
3770             root_thread->th.th_bar[ b ].bb.b_worker_arrived = 0;
3771 #endif
3772         }; // for
3773     }
3774     KMP_DEBUG_ASSERT( root->r.r_hot_team->t.t_bar[ bs_forkjoin_barrier ].b_arrived == KMP_INIT_BARRIER_STATE );
3775 
3776 #if KMP_AFFINITY_SUPPORTED
3777 # if OMP_40_ENABLED
3778     root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3779     root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3780     root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3781     root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3782 # endif
3783 
3784     if ( TCR_4(__kmp_init_middle) ) {
3785         __kmp_affinity_set_init_mask( gtid, TRUE );
3786     }
3787 #endif /* KMP_AFFINITY_SUPPORTED */
3788 
3789     __kmp_root_counter ++;
3790 
3791     KMP_MB();
3792     __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
3793 
3794     return gtid;
3795 }
3796 
3797 #if KMP_NESTED_HOT_TEAMS
3798 static int
3799 __kmp_free_hot_teams( kmp_root_t *root, kmp_info_t *thr, int level, const int max_level )
3800 {
3801     int i, n, nth;
3802     kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3803     if( !hot_teams || !hot_teams[level].hot_team ) {
3804         return 0;
3805     }
3806     KMP_DEBUG_ASSERT( level < max_level );
3807     kmp_team_t *team = hot_teams[level].hot_team;
3808     nth = hot_teams[level].hot_team_nth;
3809     n = nth - 1;                   // master is not freed
3810     if( level < max_level - 1 ) {
3811         for( i = 0; i < nth; ++i ) {
3812             kmp_info_t *th = team->t.t_threads[i];
3813             n += __kmp_free_hot_teams( root, th, level + 1, max_level );
3814             if( i > 0 && th->th.th_hot_teams ) {
3815                 __kmp_free( th->th.th_hot_teams );
3816                 th->th.th_hot_teams = NULL;
3817             }
3818         }
3819     }
3820     __kmp_free_team( root, team, NULL );
3821     return n;
3822 }
3823 #endif
3824 
/* Resets a root thread and clears its root and hot teams.
3826    Returns the number of __kmp_threads entries directly and indirectly freed.
3827 */
3828 static int
3829 __kmp_reset_root(int gtid, kmp_root_t *root)
3830 {
3831     kmp_team_t * root_team = root->r.r_root_team;
3832     kmp_team_t * hot_team  = root->r.r_hot_team;
3833     int          n         = hot_team->t.t_nproc;
3834     int i;
3835 
3836     KMP_DEBUG_ASSERT( ! root->r.r_active );
3837 
3838     root->r.r_root_team = NULL;
3839     root->r.r_hot_team  = NULL;
        // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team before the call
        // to __kmp_free_team().
3842     __kmp_free_team( root, root_team USE_NESTED_HOT_ARG(NULL) );
3843 #if KMP_NESTED_HOT_TEAMS
3844     if( __kmp_hot_teams_max_level > 0 ) {  // need to free nested hot teams and their threads if any
3845         for( i = 0; i < hot_team->t.t_nproc; ++i ) {
3846             kmp_info_t *th = hot_team->t.t_threads[i];
3847             if( __kmp_hot_teams_max_level > 1 ) {
3848                 n += __kmp_free_hot_teams( root, th, 1, __kmp_hot_teams_max_level );
3849             }
3850             if( th->th.th_hot_teams ) {
3851                 __kmp_free( th->th.th_hot_teams );
3852                 th->th.th_hot_teams = NULL;
3853             }
3854         }
3855     }
3856 #endif
3857     __kmp_free_team( root, hot_team USE_NESTED_HOT_ARG(NULL) );
3858 
3859     //
3860     // Before we can reap the thread, we need to make certain that all
3861     // other threads in the teams that had this root as ancestor have stopped trying to steal tasks.
3862     //
3863     if ( __kmp_tasking_mode != tskm_immediate_exec ) {
3864         __kmp_wait_to_unref_task_teams();
3865     }
3866 
3867     #if KMP_OS_WINDOWS
3868         /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3869         KA_TRACE( 10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC "\n",
3870             (LPVOID)&(root->r.r_uber_thread->th),
3871             root->r.r_uber_thread->th.th_info.ds.ds_thread ) );
3872         __kmp_free_handle( root->r.r_uber_thread->th.th_info.ds.ds_thread );
3873     #endif /* KMP_OS_WINDOWS */
3874 
3875 #if OMPT_SUPPORT
3876     if (ompt_enabled &&
3877         ompt_callbacks.ompt_callback(ompt_event_thread_end)) {
3878         int gtid = __kmp_get_gtid();
3879         __ompt_thread_end(ompt_thread_initial, gtid);
3880     }
3881 #endif
3882 
3883     TCW_4(__kmp_nth, __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3884     __kmp_reap_thread( root->r.r_uber_thread, 1 );
3885 
        // We cannot put the root thread into __kmp_thread_pool, so we have to reap it instead of freeing it.
3887     root->r.r_uber_thread = NULL;
3888     /* mark root as no longer in use */
3889     root->r.r_begin = FALSE;
3890 
3891     return n;
3892 }
3893 
3894 void
3895 __kmp_unregister_root_current_thread( int gtid )
3896 {
3897     KA_TRACE( 1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid ));
3898     /* this lock should be ok, since unregister_root_current_thread is never called during
     * an abort, only during a normal close.  furthermore, if you have the
3900      * forkjoin lock, you should never try to get the initz lock */
3901 
3902     __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
3903     if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
3904         KC_TRACE( 10, ("__kmp_unregister_root_current_thread: already finished, exiting T#%d\n", gtid ));
3905         __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
3906         return;
3907     }
3908     kmp_root_t *root = __kmp_root[gtid];
3909 
3910     KMP_DEBUG_ASSERT( __kmp_threads && __kmp_threads[gtid] );
3911     KMP_ASSERT( KMP_UBER_GTID( gtid ));
3912     KMP_ASSERT( root == __kmp_threads[gtid]->th.th_root );
3913     KMP_ASSERT( root->r.r_active == FALSE );
3914 
3915 
3916     KMP_MB();
3917 
3918 #if OMP_45_ENABLED
3919    kmp_info_t * thread = __kmp_threads[gtid];
3920    kmp_team_t * team = thread->th.th_team;
3921    kmp_task_team_t *   task_team = thread->th.th_task_team;
3922 
3923    // we need to wait for the proxy tasks before finishing the thread
3924    if ( task_team != NULL && task_team->tt.tt_found_proxy_tasks ) {
3925 #if OMPT_SUPPORT
3926         // the runtime is shutting down so we won't report any events
3927         thread->th.ompt_thread_info.state = ompt_state_undefined;
3928 #endif
3929         __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
3930    }
3931 #endif
3932 
3933     __kmp_reset_root(gtid, root);
3934 
3935     /* free up this thread slot */
3936     __kmp_gtid_set_specific( KMP_GTID_DNE );
3937 #ifdef KMP_TDATA_GTID
3938     __kmp_gtid = KMP_GTID_DNE;
3939 #endif
3940 
3941     KMP_MB();
3942     KC_TRACE( 10, ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid ));
3943 
3944     __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
3945 }
3946 
3947 #if KMP_OS_WINDOWS
3948 /* __kmp_forkjoin_lock must be already held
3949    Unregisters a root thread that is not the current thread.  Returns the number of
3950    __kmp_threads entries freed as a result.
3951  */
3952 static int
3953 __kmp_unregister_root_other_thread( int gtid )
3954 {
3955     kmp_root_t *root = __kmp_root[gtid];
3956     int r;
3957 
3958     KA_TRACE( 1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid ));
3959     KMP_DEBUG_ASSERT( __kmp_threads && __kmp_threads[gtid] );
3960     KMP_ASSERT( KMP_UBER_GTID( gtid ));
3961     KMP_ASSERT( root == __kmp_threads[gtid]->th.th_root );
3962     KMP_ASSERT( root->r.r_active == FALSE );
3963 
3964     r = __kmp_reset_root(gtid, root);
3965     KC_TRACE( 10, ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid ));
3966     return r;
3967 }
3968 #endif
3969 
3970 #if KMP_DEBUG
3971 void __kmp_task_info() {
3972 
3973     kmp_int32 gtid       = __kmp_entry_gtid();
3974     kmp_int32 tid        = __kmp_tid_from_gtid( gtid );
3975     kmp_info_t *this_thr = __kmp_threads[ gtid ];
3976     kmp_team_t *steam    = this_thr->th.th_serial_team;
3977     kmp_team_t *team     = this_thr->th.th_team;
3978 
3979     __kmp_printf( "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p curtask=%p ptask=%p\n",
3980         gtid, tid, this_thr, team, this_thr->th.th_current_task, team->t.t_implicit_task_taskdata[tid].td_parent );
3981 }
3982 #endif // KMP_DEBUG
3983 
3984 /* TODO optimize with one big memclr, take out what isn't needed,
3985  * split responsibility to workers as much as possible, and delay
3986  * initialization of features as much as possible  */
3987 static void
3988 __kmp_initialize_info( kmp_info_t *this_thr, kmp_team_t *team, int tid, int gtid )
3989 {
3990     /* this_thr->th.th_info.ds.ds_gtid is setup in kmp_allocate_thread/create_worker
3991      * this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
3992     kmp_info_t *master = team->t.t_threads[0];
3993     KMP_DEBUG_ASSERT( this_thr != NULL );
3994     KMP_DEBUG_ASSERT( this_thr->th.th_serial_team );
3995     KMP_DEBUG_ASSERT( team );
3996     KMP_DEBUG_ASSERT( team->t.t_threads  );
3997     KMP_DEBUG_ASSERT( team->t.t_dispatch );
3998     KMP_DEBUG_ASSERT( master );
3999     KMP_DEBUG_ASSERT( master->th.th_root );
4000 
4001     KMP_MB();
4002 
4003     TCW_SYNC_PTR(this_thr->th.th_team, team);
4004 
4005     this_thr->th.th_info.ds.ds_tid  = tid;
4006     this_thr->th.th_set_nproc       = 0;
4007 #if OMP_40_ENABLED
4008     this_thr->th.th_set_proc_bind   = proc_bind_default;
4009 # if KMP_AFFINITY_SUPPORTED
4010     this_thr->th.th_new_place       = this_thr->th.th_current_place;
4011 # endif
4012 #endif
4013     this_thr->th.th_root            = master->th.th_root;
4014 
4015     /* setup the thread's cache of the team structure */
4016     this_thr->th.th_team_nproc      = team->t.t_nproc;
4017     this_thr->th.th_team_master     = master;
4018     this_thr->th.th_team_serialized = team->t.t_serialized;
4019     TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4020 
4021     KMP_DEBUG_ASSERT( team->t.t_implicit_task_taskdata );
4022 
4023     KF_TRACE( 10, ( "__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4024                     tid, gtid, this_thr, this_thr->th.th_current_task ) );
4025 
4026     __kmp_init_implicit_task( this_thr->th.th_team_master->th.th_ident, this_thr, team, tid, TRUE );
4027 
4028     KF_TRACE( 10, ( "__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4029                     tid, gtid, this_thr, this_thr->th.th_current_task ) );
4030     // TODO: Initialize ICVs from parent; GEH - isn't that already done in __kmp_initialize_team()?
4031 
4032     /* TODO no worksharing in speculative threads */
4033     this_thr->th.th_dispatch      = &team->t.t_dispatch[ tid ];
4034 
4035     this_thr->th.th_local.this_construct = 0;
4036 
4037 #ifdef BUILD_TV
4038     this_thr->th.th_local.tv_data = 0;
4039 #endif
4040 
4041     if ( ! this_thr->th.th_pri_common ) {
4042         this_thr->th.th_pri_common = (struct common_table *) __kmp_allocate( sizeof(struct common_table) );
4043         if ( __kmp_storage_map ) {
4044             __kmp_print_storage_map_gtid(
4045                 gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4046                 sizeof( struct common_table ), "th_%d.th_pri_common\n", gtid
4047             );
4048         }; // if
4049         this_thr->th.th_pri_head = NULL;
4050     }; // if
4051 
4052     /* Initialize dynamic dispatch */
4053     {
4054         volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4055         /*
4056          * Use team max_nproc since this will never change for the team.
4057          */
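        // A serialized team (t_max_nproc == 1) needs only one dispatch buffer;
        // otherwise __kmp_dispatch_num_buffers buffers are allocated so that
        // consecutive dynamically scheduled loops can each use their own buffer.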
4058         size_t disp_size = sizeof( dispatch_private_info_t ) *
4059             ( team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers );
4060         KD_TRACE( 10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid, team->t.t_max_nproc ) );
4061         KMP_ASSERT( dispatch );
4062         KMP_DEBUG_ASSERT( team->t.t_dispatch );
4063         KMP_DEBUG_ASSERT( dispatch == &team->t.t_dispatch[ tid ] );
4064 
4065         dispatch->th_disp_index = 0;
4066 #if OMP_45_ENABLED
4067         dispatch->th_doacross_buf_idx = 0;
4068 #endif
4069         if( ! dispatch->th_disp_buffer )  {
4070             dispatch->th_disp_buffer = (dispatch_private_info_t *) __kmp_allocate( disp_size );
4071 
4072             if ( __kmp_storage_map ) {
4073                 __kmp_print_storage_map_gtid( gtid, &dispatch->th_disp_buffer[ 0 ],
4074                                          &dispatch->th_disp_buffer[ team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers ],
4075                                          disp_size, "th_%d.th_dispatch.th_disp_buffer "
4076                                          "(team_%d.t_dispatch[%d].th_disp_buffer)",
4077                                          gtid, team->t.t_id, gtid );
4078             }
4079         } else {
4080             memset( & dispatch->th_disp_buffer[0], '\0', disp_size );
4081         }
4082 
4083         dispatch->th_dispatch_pr_current = 0;
4084         dispatch->th_dispatch_sh_current = 0;
4085 
4086         dispatch->th_deo_fcn = 0;             /* ORDERED     */
4087         dispatch->th_dxo_fcn = 0;             /* END ORDERED */
4088     }
4089 
4090     this_thr->th.th_next_pool = NULL;
4091 
4092     if (!this_thr->th.th_task_state_memo_stack) {
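        // The memo stack preserves th_task_state across nested parallel regions;
        // start with room for 4 nesting levels.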
4093         size_t i;
4094         this_thr->th.th_task_state_memo_stack = (kmp_uint8 *) __kmp_allocate( 4*sizeof(kmp_uint8) );
4095         this_thr->th.th_task_state_top = 0;
4096         this_thr->th.th_task_state_stack_sz = 4;
4097         for (i=0; i<this_thr->th.th_task_state_stack_sz; ++i) // zero init the stack
4098             this_thr->th.th_task_state_memo_stack[i] = 0;
4099     }
4100 
4101     KMP_DEBUG_ASSERT( !this_thr->th.th_spin_here );
4102     KMP_DEBUG_ASSERT( this_thr->th.th_next_waiting == 0 );
4103 
4104     KMP_MB();
4105 }
4106 
4107 
4108 /* allocate a new thread for the requesting team.  this is only called from within a
4109  * forkjoin critical section.  we will first try to get an available thread from the
 * thread pool.  if none is available, we will fork a new one, assuming we are
 * able to create one.  this should be assured, as the caller should have
 * checked for available capacity first.
4113  */
4114 kmp_info_t *
4115 __kmp_allocate_thread( kmp_root_t *root, kmp_team_t *team, int new_tid )
4116 {
4117     kmp_team_t  *serial_team;
4118     kmp_info_t  *new_thr;
4119     int          new_gtid;
4120 
4121     KA_TRACE( 20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid() ));
4122     KMP_DEBUG_ASSERT( root && team );
4123 #if !KMP_NESTED_HOT_TEAMS
4124     KMP_DEBUG_ASSERT( KMP_MASTER_GTID( __kmp_get_gtid() ));
4125 #endif
4126     KMP_MB();
4127 
4128     /* first, try to get one from the thread pool */
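    /* the pool is a singly linked list chained through th.th_next_pool;
       take the thread at the head */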
4129     if ( __kmp_thread_pool ) {
4130 
4131         new_thr = (kmp_info_t*)__kmp_thread_pool;
4132         __kmp_thread_pool = (volatile kmp_info_t *) new_thr->th.th_next_pool;
4133         if ( new_thr == __kmp_thread_pool_insert_pt ) {
4134             __kmp_thread_pool_insert_pt = NULL;
4135         }
4136         TCW_4(new_thr->th.th_in_pool, FALSE);
4137         //
4138         // Don't touch th_active_in_pool or th_active.
4139         // The worker thread adjusts those flags as it sleeps/awakens.
4140         //
4141         __kmp_thread_pool_nth--;
4142 
4143         KA_TRACE( 20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4144                     __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid ));
4145         KMP_ASSERT(       ! new_thr->th.th_team );
4146         KMP_DEBUG_ASSERT( __kmp_nth < __kmp_threads_capacity );
4147         KMP_DEBUG_ASSERT( __kmp_thread_pool_nth >= 0 );
4148 
4149         /* setup the thread structure */
4150         __kmp_initialize_info( new_thr, team, new_tid, new_thr->th.th_info.ds.ds_gtid );
4151         KMP_DEBUG_ASSERT( new_thr->th.th_serial_team );
4152 
4153         TCW_4(__kmp_nth, __kmp_nth + 1);
4154 
4155         new_thr->th.th_task_state = 0;
4156         new_thr->th.th_task_state_top = 0;
4157         new_thr->th.th_task_state_stack_sz = 4;
4158 
4159 #ifdef KMP_ADJUST_BLOCKTIME
        /* Adjust blocktime back to zero if necessary */
4161         /* Middle initialization might not have occurred yet */
4162         if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
4163             if ( __kmp_nth > __kmp_avail_proc ) {
4164                 __kmp_zero_bt = TRUE;
4165             }
4166         }
4167 #endif /* KMP_ADJUST_BLOCKTIME */
4168 
4169 #if KMP_DEBUG
        // If the thread entered the pool via __kmp_free_thread, wait_flag should not be KMP_BARRIER_PARENT_FLAG.
4171         int b;
4172         kmp_balign_t * balign = new_thr->th.th_bar;
4173         for( b = 0; b < bs_last_barrier; ++ b )
4174             KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4175 #endif
4176 
4177         KF_TRACE( 10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4178                     __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid ));
4179 
4180         KMP_MB();
4181         return new_thr;
4182     }
4183 
4184 
    /* no, we'll fork a new one */
4186     KMP_ASSERT( __kmp_nth    == __kmp_all_nth );
4187     KMP_ASSERT( __kmp_all_nth < __kmp_threads_capacity );
4188 
4189 #if KMP_USE_MONITOR
4190     //
4191     // If this is the first worker thread the RTL is creating, then also
4192     // launch the monitor thread.  We try to do this as early as possible.
4193     //
4194     if ( ! TCR_4( __kmp_init_monitor ) ) {
4195         __kmp_acquire_bootstrap_lock( & __kmp_monitor_lock );
4196         if ( ! TCR_4( __kmp_init_monitor ) ) {
4197             KF_TRACE( 10, ( "before __kmp_create_monitor\n" ) );
4198             TCW_4( __kmp_init_monitor, 1 );
4199             __kmp_create_monitor( & __kmp_monitor );
4200             KF_TRACE( 10, ( "after __kmp_create_monitor\n" ) );
4201             #if KMP_OS_WINDOWS
4202                 // AC: wait until monitor has started. This is a fix for CQ232808.
                //     The reason is that if the library is loaded/unloaded in a loop with small (parallel)
                //     work in between, then there is a high probability that the monitor thread starts after
                //     the library shutdown.  At shutdown it is too late to cope with the problem, because
                //     when the master is in DllMain (process detach) the monitor has no chance to start
                //     (it is blocked), and the master has no means to inform the monitor that the library is gone,
                //     because all the memory which the monitor can access is going to be released/reset.
4209                 while ( TCR_4(__kmp_init_monitor) < 2 ) {
4210                     KMP_YIELD( TRUE );
4211                 }
4212                 KF_TRACE( 10, ( "after monitor thread has started\n" ) );
4213             #endif
4214         }
4215         __kmp_release_bootstrap_lock( & __kmp_monitor_lock );
4216     }
4217 #endif
4218 
4219     KMP_MB();
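    /* find the lowest free gtid slot for the new worker; slot 0 is reserved for
       the initial root thread, so the search starts at 1 */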
4220     for( new_gtid=1 ; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid ) {
4221         KMP_DEBUG_ASSERT( new_gtid < __kmp_threads_capacity );
4222     }
4223 
4224     /* allocate space for it. */
4225     new_thr = (kmp_info_t*) __kmp_allocate( sizeof(kmp_info_t) );
4226 
4227     TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4228 
4229     if ( __kmp_storage_map ) {
4230         __kmp_print_thread_storage_map( new_thr, new_gtid );
4231     }
4232 
4233     /* add the reserve serialized team, initialized from the team's master thread */
4234     {
4235     kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs( team );
4236     KF_TRACE( 10, ( "__kmp_allocate_thread: before th_serial/serial_team\n" ) );
4237 
4238     new_thr->th.th_serial_team = serial_team =
4239         (kmp_team_t*) __kmp_allocate_team( root, 1, 1,
4240 #if OMPT_SUPPORT
4241                                            0, // root parallel id
4242 #endif
4243 #if OMP_40_ENABLED
4244                                            proc_bind_default,
4245 #endif
4246                                            &r_icvs,
4247                                            0 USE_NESTED_HOT_ARG(NULL) );
4248     }
4249     KMP_ASSERT ( serial_team );
4250     serial_team->t.t_serialized = 0;   // AC: the team created in reserve, not for execution (it is unused for now).
4251     serial_team->t.t_threads[0] = new_thr;
4252     KF_TRACE( 10, ( "__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4253       new_thr ) );
4254 
4255     /* setup the thread structures */
4256     __kmp_initialize_info( new_thr, team, new_tid, new_gtid );
4257 
4258     #if USE_FAST_MEMORY
4259         __kmp_initialize_fast_memory( new_thr );
4260     #endif /* USE_FAST_MEMORY */
4261 
4262     #if KMP_USE_BGET
4263         KMP_DEBUG_ASSERT( new_thr->th.th_local.bget_data == NULL );
4264         __kmp_initialize_bget( new_thr );
4265     #endif
4266 
4267     __kmp_init_random( new_thr );  // Initialize random number generator
4268 
4269     /* Initialize these only once when thread is grabbed for a team allocation */
4270     KA_TRACE( 20, ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4271                     __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
4272 
4273     int b;
4274     kmp_balign_t * balign = new_thr->th.th_bar;
4275     for(b=0; b<bs_last_barrier; ++b) {
4276         balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4277         balign[b].bb.team = NULL;
4278         balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4279         balign[b].bb.use_oncore_barrier = 0;
4280     }
4281 
4282     new_thr->th.th_spin_here = FALSE;
4283     new_thr->th.th_next_waiting = 0;
4284 
4285 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4286     new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4287     new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4288     new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4289     new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4290 #endif
4291 
4292     TCW_4(new_thr->th.th_in_pool, FALSE);
4293     new_thr->th.th_active_in_pool = FALSE;
4294     TCW_4(new_thr->th.th_active, TRUE);
4295 
4296     /* adjust the global counters */
4297     __kmp_all_nth ++;
4298     __kmp_nth ++;
4299 
4300     //
4301     // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search)
4302     // for low numbers of procs, and method #2 (keyed API call) for higher
4303     // numbers of procs.
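    // The switch-over point is __kmp_tls_gtid_min total threads.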
4304     //
4305     if ( __kmp_adjust_gtid_mode ) {
4306         if ( __kmp_all_nth >= __kmp_tls_gtid_min ) {
4307             if ( TCR_4(__kmp_gtid_mode) != 2) {
4308                 TCW_4(__kmp_gtid_mode, 2);
4309             }
4310         }
4311         else {
4312             if (TCR_4(__kmp_gtid_mode) != 1 ) {
4313                 TCW_4(__kmp_gtid_mode, 1);
4314             }
4315         }
4316     }
4317 
4318 #ifdef KMP_ADJUST_BLOCKTIME
4319     /* Adjust blocktime back to zero if necessary       */
4320     /* Middle initialization might not have occurred yet */
4321     if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
4322         if ( __kmp_nth > __kmp_avail_proc ) {
4323             __kmp_zero_bt = TRUE;
4324         }
4325     }
4326 #endif /* KMP_ADJUST_BLOCKTIME */
4327 
4328     /* actually fork it and create the new worker thread */
4329     KF_TRACE( 10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr ));
4330     __kmp_create_worker( new_gtid, new_thr, __kmp_stksize );
4331     KF_TRACE( 10, ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr ));
4332 
4333     KA_TRACE( 20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(), new_gtid ));
4334     KMP_MB();
4335     return new_thr;
4336 }
4337 
4338 /*
4339  * reinitialize team for reuse.
4340  *
 * The hot team code calls this routine at every fork barrier, so EPCC barrier
 * tests are extremely sensitive to changes in it, esp. writes to the team
 * struct, which cause a cache invalidation in all threads.
4344  *
4345  * IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!!
4346  */
4347 static void
4348 __kmp_reinitialize_team( kmp_team_t *team, kmp_internal_control_t *new_icvs, ident_t *loc ) {
4349     KF_TRACE( 10, ( "__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4350                     team->t.t_threads[0], team ) );
4351     KMP_DEBUG_ASSERT( team && new_icvs);
4352     KMP_DEBUG_ASSERT( ( ! TCR_4(__kmp_init_parallel) ) || new_icvs->nproc );
4353     KMP_CHECK_UPDATE(team->t.t_ident, loc);
4354 
4355     KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4356 
4357     // Copy ICVs to the master thread's implicit taskdata
4358     __kmp_init_implicit_task( loc, team->t.t_threads[0], team, 0, FALSE );
4359     copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4360 
4361     KF_TRACE( 10, ( "__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4362                     team->t.t_threads[0], team ) );
4363 }
4364 
4365 
4366 /* initialize the team data structure
4367  * this assumes the t_threads and t_max_nproc are already set
4368  * also, we don't touch the arguments */
4369 static void
4370 __kmp_initialize_team(
4371     kmp_team_t * team,
4372     int          new_nproc,
4373     kmp_internal_control_t * new_icvs,
4374     ident_t *                loc
4375 ) {
4376     KF_TRACE( 10, ( "__kmp_initialize_team: enter: team=%p\n", team ) );
4377 
4378     /* verify */
4379     KMP_DEBUG_ASSERT( team );
4380     KMP_DEBUG_ASSERT( new_nproc <= team->t.t_max_nproc );
4381     KMP_DEBUG_ASSERT( team->t.t_threads );
4382     KMP_MB();
4383 
4384     team->t.t_master_tid  = 0;    /* not needed */
4385     /* team->t.t_master_bar;        not needed */
4386     team->t.t_serialized  = new_nproc > 1 ? 0 : 1;
4387     team->t.t_nproc       = new_nproc;
4388 
4389     /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
4390     team->t.t_next_pool   = NULL;
4391     /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess up hot team */
4392 
4393     TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4394     team->t.t_invoke      = NULL; /* not needed */
4395 
4396     // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
4397     team->t.t_sched       = new_icvs->sched;
4398 
4399 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4400     team->t.t_fp_control_saved = FALSE; /* not needed */
4401     team->t.t_x87_fpu_control_word = 0; /* not needed */
4402     team->t.t_mxcsr = 0;                /* not needed */
4403 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4404 
4405     team->t.t_construct   = 0;
4406     __kmp_init_lock( & team->t.t_single_lock );
4407 
4408     team->t.t_ordered .dt.t_value = 0;
4409     team->t.t_master_active = FALSE;
4410 
4411     memset( & team->t.t_taskq, '\0', sizeof( kmp_taskq_t ));
4412 
4413 #ifdef KMP_DEBUG
4414     team->t.t_copypriv_data = NULL;  /* not necessary, but nice for debugging */
4415 #endif
4416     team->t.t_copyin_counter = 0;    /* for barrier-free copyin implementation */
4417 
4418     team->t.t_control_stack_top = NULL;
4419 
4420     __kmp_reinitialize_team( team, new_icvs, loc );
4421 
4422     KMP_MB();
4423     KF_TRACE( 10, ( "__kmp_initialize_team: exit: team=%p\n", team ) );
4424 }
4425 
4426 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4427 /* Sets full mask for thread and returns old mask, no changes to structures. */
4428 static void
4429 __kmp_set_thread_affinity_mask_full_tmp( kmp_affin_mask_t *old_mask )
4430 {
4431     if ( KMP_AFFINITY_CAPABLE() ) {
4432         int status;
4433         if ( old_mask != NULL ) {
4434             status = __kmp_get_system_affinity( old_mask, TRUE );
4435             int error = errno;
4436             if ( status != 0 ) {
4437                 __kmp_msg(
4438                     kmp_ms_fatal,
4439                     KMP_MSG( ChangeThreadAffMaskError ),
4440                     KMP_ERR( error ),
4441                     __kmp_msg_null
4442                 );
4443             }
4444         }
4445         __kmp_set_system_affinity( __kmp_affin_fullMask, TRUE );
4446     }
4447 }
4448 #endif
4449 
4450 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4451 
4452 //
4453 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
// It calculates the worker + master thread's partition based upon the parent
// thread's partition, and binds each worker to a place in its partition.
4456 // The master thread's partition should already include its current binding.
4457 //
4458 static void
4459 __kmp_partition_places( kmp_team_t *team, int update_master_only )
4460 {
4461     //
    // Copy the master thread's place partition to the team struct
4463     //
4464     kmp_info_t *master_th = team->t.t_threads[0];
4465     KMP_DEBUG_ASSERT( master_th != NULL );
4466     kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4467     int first_place = master_th->th.th_first_place;
4468     int last_place = master_th->th.th_last_place;
4469     int masters_place = master_th->th.th_current_place;
4470     team->t.t_first_place = first_place;
4471     team->t.t_last_place = last_place;
4472 
4473     KA_TRACE( 20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) bound to place %d partition = [%d,%d]\n",
4474        proc_bind, __kmp_gtid_from_thread( team->t.t_threads[0] ), team->t.t_id,
4475        masters_place, first_place, last_place ) );
4476 
4477     switch ( proc_bind ) {
4478 
4479         case proc_bind_default:
4480         //
4481         // serial teams might have the proc_bind policy set to
4482         // proc_bind_default.  It doesn't matter, as we don't
4483         // rebind the master thread for any proc_bind policy.
4484         //
4485         KMP_DEBUG_ASSERT( team->t.t_nproc == 1 );
4486         break;
4487 
4488         case proc_bind_master:
4489         {
4490             int f;
4491             int n_th = team->t.t_nproc;
4492             for ( f = 1; f < n_th; f++ ) {
4493                 kmp_info_t *th = team->t.t_threads[f];
4494                 KMP_DEBUG_ASSERT( th != NULL );
4495                 th->th.th_first_place = first_place;
4496                 th->th.th_last_place = last_place;
4497                 th->th.th_new_place = masters_place;
4498 
4499                 KA_TRACE( 100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4500                   __kmp_gtid_from_thread( team->t.t_threads[f] ),
4501                   team->t.t_id, f, masters_place, first_place, last_place ) );
4502             }
4503         }
4504         break;
4505 
4506         case proc_bind_close:
4507         {
4508             int f;
4509             int n_th = team->t.t_nproc;
4510             int n_places;
4511             if ( first_place <= last_place ) {
4512                 n_places = last_place - first_place + 1;
4513             }
4514             else {
4515                 n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4516             }
4517             if ( n_th <= n_places ) {
4518                 int place = masters_place;
4519                 for ( f = 1; f < n_th; f++ ) {
4520                     kmp_info_t *th = team->t.t_threads[f];
4521                     KMP_DEBUG_ASSERT( th != NULL );
4522 
4523                     if ( place == last_place ) {
4524                         place = first_place;
4525                     }
4526                     else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4527                         place = 0;
4528                     }
4529                     else {
4530                         place++;
4531                     }
4532                     th->th.th_first_place = first_place;
4533                     th->th.th_last_place = last_place;
4534                     th->th.th_new_place = place;
4535 
4536                     KA_TRACE( 100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4537                        __kmp_gtid_from_thread( team->t.t_threads[f] ),
4538                        team->t.t_id, f, place, first_place, last_place ) );
4539                 }
4540             }
4541             else {
4542                 int S, rem, gap, s_count;
4543                 S = n_th / n_places;
4544                 s_count = 0;
4545                 rem = n_th - ( S * n_places );
4546                 gap = rem > 0 ? n_places/rem : n_places;
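                // More threads than places: each place gets S = n_th/n_places threads,
                // and rem = n_th%n_places of the places receive one extra thread,
                // roughly every 'gap' places.  E.g. n_th=7, n_places=3 gives S=2,
                // rem=1, gap=3: the master's place gets 3 threads and the other two
                // places get 2 each.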
4547                 int place = masters_place;
4548                 int gap_ct = gap;
4549                 for ( f = 0; f < n_th; f++ ) {
4550                     kmp_info_t *th = team->t.t_threads[f];
4551                     KMP_DEBUG_ASSERT( th != NULL );
4552 
4553                     th->th.th_first_place = first_place;
4554                     th->th.th_last_place = last_place;
4555                     th->th.th_new_place = place;
4556                     s_count++;
4557 
4558                     if ( (s_count == S) && rem && (gap_ct == gap) ) {
4559                         // do nothing, add an extra thread to place on next iteration
4560                     }
4561                     else if ( (s_count == S+1) && rem && (gap_ct == gap) ) {
4562                         // we added an extra thread to this place; move to next place
4563                         if ( place == last_place ) {
4564                             place = first_place;
4565                         }
4566                         else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4567                             place = 0;
4568                         }
4569                         else {
4570                             place++;
4571                         }
4572                         s_count = 0;
4573                         gap_ct = 1;
4574                         rem--;
4575                     }
4576                     else if (s_count == S) { // place full; don't add extra
4577                         if ( place == last_place ) {
4578                             place = first_place;
4579                         }
4580                         else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4581                             place = 0;
4582                         }
4583                         else {
4584                             place++;
4585                         }
4586                         gap_ct++;
4587                         s_count = 0;
4588                     }
4589 
4590                     KA_TRACE( 100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4591                       __kmp_gtid_from_thread( team->t.t_threads[f] ),
4592                       team->t.t_id, f, th->th.th_new_place, first_place,
4593                       last_place ) );
4594                 }
4595                 KMP_DEBUG_ASSERT( place == masters_place );
4596             }
4597         }
4598         break;
4599 
4600         case proc_bind_spread:
4601         {
4602             int f;
4603             int n_th = team->t.t_nproc;
4604             int n_places;
4605             int thidx;
4606             if ( first_place <= last_place ) {
4607                 n_places = last_place - first_place + 1;
4608             }
4609             else {
4610                 n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4611             }
4612             if ( n_th <= n_places ) {
4613                 int place = masters_place;
4614                 int S = n_places/n_th;
4615                 int s_count, rem, gap, gap_ct;
4616                 rem = n_places - n_th*S;
4617                 gap = rem ? n_th/rem : 1;
4618                 gap_ct = gap;
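                // Fewer threads than places: each thread gets a contiguous sub-partition
                // of roughly S = n_places/n_th places, and rem = n_places%n_th threads
                // receive one extra place.  E.g. n_places=8, n_th=3 gives S=2, rem=2,
                // gap=1: the three threads get sub-partitions of 3, 3 and 2 places.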
4619                 thidx = n_th;
4620                 if (update_master_only == 1)
4621                     thidx = 1;
4622                 for ( f = 0; f < thidx; f++ ) {
4623                     kmp_info_t *th = team->t.t_threads[f];
4624                     KMP_DEBUG_ASSERT( th != NULL );
4625 
4626                     th->th.th_first_place = place;
4627                     th->th.th_new_place = place;
4628                     s_count = 1;
4629                     while (s_count < S) {
4630                         if ( place == last_place ) {
4631                             place = first_place;
4632                         }
4633                         else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4634                             place = 0;
4635                         }
4636                         else {
4637                             place++;
4638                         }
4639                         s_count++;
4640                     }
4641                     if (rem && (gap_ct == gap)) {
4642                         if ( place == last_place ) {
4643                             place = first_place;
4644                         }
4645                         else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4646                             place = 0;
4647                         }
4648                         else {
4649                             place++;
4650                         }
4651                         rem--;
4652                         gap_ct = 0;
4653                     }
4654                     th->th.th_last_place = place;
4655                     gap_ct++;
4656 
4657                     if ( place == last_place ) {
4658                         place = first_place;
4659                     }
4660                     else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4661                         place = 0;
4662                     }
4663                     else {
4664                         place++;
4665                     }
4666 
4667                     KA_TRACE( 100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4668                       __kmp_gtid_from_thread( team->t.t_threads[f] ),
4669                       team->t.t_id, f, th->th.th_new_place,
4670                       th->th.th_first_place, th->th.th_last_place ) );
4671                 }
4672                 KMP_DEBUG_ASSERT( update_master_only || place == masters_place );
4673             }
4674             else {
4675                 int S, rem, gap, s_count;
4676                 S = n_th / n_places;
4677                 s_count = 0;
4678                 rem = n_th - ( S * n_places );
4679                 gap = rem > 0 ? n_places/rem : n_places;
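                // More threads than places: same distribution scheme as the
                // oversubscribed proc_bind_close case above, except that each thread's
                // partition is narrowed to the single place it is assigned to.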
4680                 int place = masters_place;
4681                 int gap_ct = gap;
4682                 thidx = n_th;
4683                 if (update_master_only == 1)
4684                     thidx = 1;
4685                 for ( f = 0; f < thidx; f++ ) {
4686                     kmp_info_t *th = team->t.t_threads[f];
4687                     KMP_DEBUG_ASSERT( th != NULL );
4688 
4689                     th->th.th_first_place = place;
4690                     th->th.th_last_place = place;
4691                     th->th.th_new_place = place;
4692                     s_count++;
4693 
4694                     if ( (s_count == S) && rem && (gap_ct == gap) ) {
4695                         // do nothing, add an extra thread to place on next iteration
4696                     }
4697                     else if ( (s_count == S+1) && rem && (gap_ct == gap) ) {
4698                         // we added an extra thread to this place; move on to next place
4699                         if ( place == last_place ) {
4700                             place = first_place;
4701                         }
4702                         else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4703                             place = 0;
4704                         }
4705                         else {
4706                             place++;
4707                         }
4708                         s_count = 0;
4709                         gap_ct = 1;
4710                         rem--;
4711                     }
4712                     else if (s_count == S) { // place is full; don't add extra thread
4713                         if ( place == last_place ) {
4714                             place = first_place;
4715                         }
4716                         else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4717                             place = 0;
4718                         }
4719                         else {
4720                             place++;
4721                         }
4722                         gap_ct++;
4723                         s_count = 0;
4724                     }
4725 
4726                     KA_TRACE( 100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4727                        __kmp_gtid_from_thread( team->t.t_threads[f] ),
4728                        team->t.t_id, f, th->th.th_new_place,
4729                        th->th.th_first_place, th->th.th_last_place) );
4730                 }
4731                 KMP_DEBUG_ASSERT( update_master_only || place == masters_place );
4732             }
4733         }
4734         break;
4735 
4736         default:
4737         break;
4738     }
4739 
4740     KA_TRACE( 20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id ) );
4741 }
4742 
4743 #endif /* OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED */
4744 
4745 /* allocate a new team data structure to use.  take one off of the free pool if available */
4746 kmp_team_t *
4747 __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc,
4748 #if OMPT_SUPPORT
4749     ompt_parallel_id_t ompt_parallel_id,
4750 #endif
4751 #if OMP_40_ENABLED
4752     kmp_proc_bind_t new_proc_bind,
4753 #endif
4754     kmp_internal_control_t *new_icvs,
4755     int argc USE_NESTED_HOT_ARG(kmp_info_t *master) )
4756 {
4757     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4758     int f;
4759     kmp_team_t *team;
4760     int use_hot_team = ! root->r.r_active;
4761     int level = 0;
4762 
4763     KA_TRACE( 20, ("__kmp_allocate_team: called\n"));
4764     KMP_DEBUG_ASSERT( new_nproc >=1 && argc >=0 );
4765     KMP_DEBUG_ASSERT( max_nproc >= new_nproc );
4766     KMP_MB();
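    // Three ways to satisfy the request: (1) reuse a hot team if one is available
    // for this level, (2) otherwise take a suitable team from the free pool,
    // (3) otherwise allocate a fresh team.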
4767 
4768 #if KMP_NESTED_HOT_TEAMS
4769     kmp_hot_team_ptr_t *hot_teams;
4770     if( master ) {
4771         team = master->th.th_team;
4772         level = team->t.t_active_level;
4773         if( master->th.th_teams_microtask ) {                         // in teams construct?
4774             if( master->th.th_teams_size.nteams > 1 && (             // #teams > 1
4775                 team->t.t_pkfn == (microtask_t)__kmp_teams_master || // inner fork of the teams
4776                 master->th.th_teams_level < team->t.t_level ) ) {    // or nested parallel inside the teams
                ++level; // don't increment if #teams==1 or for the outer fork of the teams; increment otherwise
4778             }
4779         }
4780         hot_teams = master->th.th_hot_teams;
4781         if( level < __kmp_hot_teams_max_level && hot_teams && hot_teams[level].hot_team )
4782         {   // hot team has already been allocated for given level
4783             use_hot_team = 1;
4784         } else {
4785             use_hot_team = 0;
4786         }
4787     }
4788 #endif
4789     // Optimization to use a "hot" team
4790     if( use_hot_team && new_nproc > 1 ) {
4791         KMP_DEBUG_ASSERT( new_nproc == max_nproc );
4792 #if KMP_NESTED_HOT_TEAMS
4793         team = hot_teams[level].hot_team;
4794 #else
4795         team =  root->r.r_hot_team;
4796 #endif
4797 #if KMP_DEBUG
4798         if ( __kmp_tasking_mode != tskm_immediate_exec ) {
4799             KA_TRACE( 20, ("__kmp_allocate_team: hot team task_team[0] = %p task_team[1] = %p before reinit\n",
4800                            team->t.t_task_team[0], team->t.t_task_team[1] ));
4801         }
4802 #endif
4803 
4804         // Has the number of threads changed?
4805         /* Let's assume the most common case is that the number of threads is unchanged, and
4806            put that case first. */
4807         if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
4808             KA_TRACE( 20, ("__kmp_allocate_team: reusing hot team\n" ));
4809             // This case can mean that omp_set_num_threads() was called and the hot team size
4810             // was already reduced, so we check the special flag
4811             if ( team->t.t_size_changed == -1 ) {
4812                 team->t.t_size_changed = 1;
4813             } else {
4814                 KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
4815             }
4816 
4817             // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4818             kmp_r_sched_t new_sched = new_icvs->sched;
4819             if (team->t.t_sched.r_sched_type != new_sched.r_sched_type ||
4820                 team->t.t_sched.chunk != new_sched.chunk)
4821                 team->t.t_sched = new_sched; // set master's schedule as new run-time schedule
4822 
4823             __kmp_reinitialize_team( team, new_icvs, root->r.r_uber_thread->th.th_ident );
4824 
4825             KF_TRACE( 10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n",
4826                            0, team->t.t_threads[0], team ) );
4827             __kmp_push_current_task_to_thread( team->t.t_threads[ 0 ], team, 0 );
4828 
4829 #if OMP_40_ENABLED
4830 # if KMP_AFFINITY_SUPPORTED
4831             if ( ( team->t.t_size_changed == 0 )
4832               && ( team->t.t_proc_bind == new_proc_bind ) ) {
4833                 if (new_proc_bind == proc_bind_spread) {
4834                     __kmp_partition_places(team, 1); // add flag to update only master for spread
4835                 }
4836                 KA_TRACE( 200, ("__kmp_allocate_team: reusing hot team #%d bindings: proc_bind = %d, partition = [%d,%d]\n",
4837                   team->t.t_id, new_proc_bind, team->t.t_first_place,
4838                   team->t.t_last_place ) );
4839             }
4840             else {
4841                 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4842                 __kmp_partition_places( team );
4843             }
4844 # else
4845             KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4846 # endif /* KMP_AFFINITY_SUPPORTED */
4847 #endif /* OMP_40_ENABLED */
4848         }
4849         else if( team->t.t_nproc > new_nproc ) {
4850             KA_TRACE( 20, ("__kmp_allocate_team: decreasing hot team thread count to %d\n", new_nproc ));
4851 
4852             team->t.t_size_changed = 1;
4853 #if KMP_NESTED_HOT_TEAMS
4854             if( __kmp_hot_teams_mode == 0 ) {
4855                 // AC: saved number of threads should correspond to team's value in this mode,
4856                 // can be bigger in mode 1, when hot team has some threads in reserve
4857                 KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
4858                 hot_teams[level].hot_team_nth = new_nproc;
4859 #endif // KMP_NESTED_HOT_TEAMS
4860                 /* release the extra threads we don't need any more */
4861                 for( f = new_nproc  ;  f < team->t.t_nproc  ;  f++ ) {
4862                     KMP_DEBUG_ASSERT( team->t.t_threads[ f ] );
4863                     if ( __kmp_tasking_mode != tskm_immediate_exec) {
4864                         // When decreasing team size, threads no longer in the team should unref task team.
4865                         team->t.t_threads[f]->th.th_task_team = NULL;
4866                     }
4867                     __kmp_free_thread( team->t.t_threads[ f ] );
4868                     team->t.t_threads[ f ] = NULL;
4869                 }
4870 #if KMP_NESTED_HOT_TEAMS
4871             } // (__kmp_hot_teams_mode == 0)
4872             else {
4873                 // When keeping extra threads in team, switch threads to wait on own b_go flag
4874                 for (f=new_nproc; f<team->t.t_nproc; ++f) {
4875                     KMP_DEBUG_ASSERT(team->t.t_threads[f]);
4876                     kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
4877                     for (int b=0; b<bs_last_barrier; ++b) {
4878                         if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
4879                             balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
4880                         }
4881                         KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
4882                     }
4883                 }
4884             }
4885 #endif // KMP_NESTED_HOT_TEAMS
4886             team->t.t_nproc =  new_nproc;
4887             // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4888             if (team->t.t_sched.r_sched_type != new_icvs->sched.r_sched_type ||
4889                 team->t.t_sched.chunk != new_icvs->sched.chunk)
4890                 team->t.t_sched = new_icvs->sched;
4891             __kmp_reinitialize_team( team, new_icvs, root->r.r_uber_thread->th.th_ident );
4892 
4893             /* update the remaining threads */
4894             for(f = 0; f < new_nproc; ++f) {
4895                 team->t.t_threads[f]->th.th_team_nproc = new_nproc;
4896             }
4897             // restore the current task state of the master thread: should be the implicit task
4898             KF_TRACE( 10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n",
4899                        0, team->t.t_threads[0], team ) );
4900 
4901             __kmp_push_current_task_to_thread( team->t.t_threads[ 0 ], team, 0 );
4902 
4903 #ifdef KMP_DEBUG
4904             for ( f = 0; f < team->t.t_nproc; f++ ) {
4905                 KMP_DEBUG_ASSERT( team->t.t_threads[f] &&
4906                     team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc );
4907             }
4908 #endif
4909 
4910 #if OMP_40_ENABLED
4911             KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4912 # if KMP_AFFINITY_SUPPORTED
4913             __kmp_partition_places( team );
4914 # endif
4915 #endif
4916         }
4917         else { // team->t.t_nproc < new_nproc
4918 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4919             kmp_affin_mask_t *old_mask;
4920             if ( KMP_AFFINITY_CAPABLE() ) {
4921                 KMP_CPU_ALLOC(old_mask);
4922             }
4923 #endif
4924 
4925             KA_TRACE( 20, ("__kmp_allocate_team: increasing hot team thread count to %d\n", new_nproc ));
4926 
4927             team->t.t_size_changed = 1;
4928 
4929 #if KMP_NESTED_HOT_TEAMS
4930             int avail_threads = hot_teams[level].hot_team_nth;
4931             if( new_nproc < avail_threads )
4932                 avail_threads = new_nproc;
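            // avail_threads is now min(threads already attached to the hot team,
            // requested team size).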
4933             kmp_info_t **other_threads = team->t.t_threads;
4934             for ( f = team->t.t_nproc; f < avail_threads; ++f ) {
4935                 // Adjust barrier data of reserved threads (if any) of the team
4936                 // Other data will be set in __kmp_initialize_info() below.
4937                 int b;
4938                 kmp_balign_t * balign = other_threads[f]->th.th_bar;
4939                 for ( b = 0; b < bs_last_barrier; ++ b ) {
4940                     balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
4941                     KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4942 #if USE_DEBUGGER
4943                     balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
4944 #endif
4945                 }
4946             }
4947             if( hot_teams[level].hot_team_nth >= new_nproc ) {
4948                 // we have all needed threads in reserve, no need to allocate any
                // this is only possible in mode 1; there cannot be reserved threads in mode 0
4950                 KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
4951                 team->t.t_nproc = new_nproc;                     // just get reserved threads involved
4952             } else {
4953                 // we may have some threads in reserve, but not enough
4954                 team->t.t_nproc = hot_teams[level].hot_team_nth; // get reserved threads involved if any
4955                 hot_teams[level].hot_team_nth = new_nproc;       // adjust hot team max size
4956 #endif // KMP_NESTED_HOT_TEAMS
4957             if(team->t.t_max_nproc < new_nproc) {
4958                 /* reallocate larger arrays */
4959                 __kmp_reallocate_team_arrays(team, new_nproc);
4960                 __kmp_reinitialize_team( team, new_icvs, NULL );
4961             }
4962 
4963 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
            /* Temporarily set the full mask for the master thread before the
               creation of the workers. The reason is that workers inherit
               the affinity from the master, so if a lot of workers are
               created on a single core quickly, they don't get
               a chance to set their own affinity for a long time.
            */
4970             __kmp_set_thread_affinity_mask_full_tmp( old_mask );
4971 #endif
4972 
4973             /* allocate new threads for the hot team */
4974             for( f = team->t.t_nproc  ;  f < new_nproc  ;  f++ ) {
4975                 kmp_info_t * new_worker = __kmp_allocate_thread( root, team, f );
4976                 KMP_DEBUG_ASSERT( new_worker );
4977                 team->t.t_threads[ f ] = new_worker;
4978 
                KA_TRACE( 20, ("__kmp_allocate_team: team %d init T#%d(%d:%d) arrived: join=%llu, plain=%llu\n",
4980                                 team->t.t_id, __kmp_gtid_from_tid( f, team ), team->t.t_id, f,
4981                                 team->t.t_bar[bs_forkjoin_barrier].b_arrived,
4982                                 team->t.t_bar[bs_plain_barrier].b_arrived ) );
4983 
4984                 { // Initialize barrier data for new threads.
4985                     int b;
4986                     kmp_balign_t * balign = new_worker->th.th_bar;
4987                     for( b = 0; b < bs_last_barrier; ++ b ) {
4988                         balign[ b ].bb.b_arrived        = team->t.t_bar[ b ].b_arrived;
4989                         KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4990 #if USE_DEBUGGER
4991                         balign[ b ].bb.b_worker_arrived = team->t.t_bar[ b ].b_team_arrived;
4992 #endif
4993                     }
4994                 }
4995             }
4996 
4997 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4998             if ( KMP_AFFINITY_CAPABLE() ) {
4999                 /* Restore initial master thread's affinity mask */
5000                 __kmp_set_system_affinity( old_mask, TRUE );
5001                 KMP_CPU_FREE(old_mask);
5002             }
5003 #endif
5004 #if KMP_NESTED_HOT_TEAMS
5005             } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5006 #endif // KMP_NESTED_HOT_TEAMS
            /* make sure everyone is synchronized */
5008             int old_nproc = team->t.t_nproc; // save old value and use to update only new threads below
5009             __kmp_initialize_team( team, new_nproc, new_icvs, root->r.r_uber_thread->th.th_ident );
5010 
5011             /* reinitialize the threads */
5012             KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5013             for (f=0;  f < team->t.t_nproc; ++f)
5014                 __kmp_initialize_info( team->t.t_threads[ f ], team, f, __kmp_gtid_from_tid( f, team ) );
5015             if (level) { // set th_task_state for new threads in nested hot team
5016                 // __kmp_initialize_info() no longer zeroes th_task_state, so we should only need to set the
5017                 // th_task_state for the new threads. th_task_state for master thread will not be accurate until
5018                 // after this in __kmp_fork_call(), so we look to the master's memo_stack to get the correct value.
5019                 for (f=old_nproc; f < team->t.t_nproc; ++f)
5020                     team->t.t_threads[f]->th.th_task_state = team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5021             }
5022             else { // set th_task_state for new threads in non-nested hot team
5023                 int old_state = team->t.t_threads[0]->th.th_task_state; // copy master's state
5024                 for (f=old_nproc; f < team->t.t_nproc; ++f)
5025                     team->t.t_threads[f]->th.th_task_state = old_state;
5026             }
5027 
5028 #ifdef KMP_DEBUG
5029             for ( f = 0; f < team->t.t_nproc; ++ f ) {
5030                 KMP_DEBUG_ASSERT( team->t.t_threads[f] &&
5031                     team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc );
5032             }
5033 #endif
5034 
5035 #if OMP_40_ENABLED
5036             KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5037 # if KMP_AFFINITY_SUPPORTED
5038             __kmp_partition_places( team );
5039 # endif
5040 #endif
5041         } // Check changes in number of threads
5042 
5043 #if OMP_40_ENABLED
5044         kmp_info_t *master = team->t.t_threads[0];
5045         if( master->th.th_teams_microtask ) {
5046             for( f = 1; f < new_nproc; ++f ) {
5047                 // propagate teams construct specific info to workers
5048                 kmp_info_t *thr = team->t.t_threads[f];
5049                 thr->th.th_teams_microtask = master->th.th_teams_microtask;
5050                 thr->th.th_teams_level     = master->th.th_teams_level;
5051                 thr->th.th_teams_size      = master->th.th_teams_size;
5052             }
5053         }
5054 #endif /* OMP_40_ENABLED */
5055 #if KMP_NESTED_HOT_TEAMS
5056         if( level ) {
5057             // Sync barrier state for nested hot teams, not needed for outermost hot team.
5058             for( f = 1; f < new_nproc; ++f ) {
5059                 kmp_info_t *thr = team->t.t_threads[f];
5060                 int b;
5061                 kmp_balign_t * balign = thr->th.th_bar;
5062                 for( b = 0; b < bs_last_barrier; ++ b ) {
5063                     balign[ b ].bb.b_arrived        = team->t.t_bar[ b ].b_arrived;
5064                     KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5065 #if USE_DEBUGGER
5066                     balign[ b ].bb.b_worker_arrived = team->t.t_bar[ b ].b_team_arrived;
5067 #endif
5068                 }
5069             }
5070         }
5071 #endif // KMP_NESTED_HOT_TEAMS
5072 
5073         /* reallocate space for arguments if necessary */
5074         __kmp_alloc_argv_entries( argc, team, TRUE );
5075         KMP_CHECK_UPDATE(team->t.t_argc, argc);
5076         //
5077         // The hot team re-uses the previous task team,
5078         // if untouched during the previous release->gather phase.
5079         //
5080 
5081         KF_TRACE( 10, ( " hot_team = %p\n", team ) );
5082 
5083 #if KMP_DEBUG
5084         if ( __kmp_tasking_mode != tskm_immediate_exec ) {
5085             KA_TRACE( 20, ("__kmp_allocate_team: hot team task_team[0] = %p task_team[1] = %p after reinit\n",
5086                            team->t.t_task_team[0], team->t.t_task_team[1] ));
5087         }
5088 #endif
5089 
5090 #if OMPT_SUPPORT
5091         __ompt_team_assign_id(team, ompt_parallel_id);
5092 #endif
5093 
5094         KMP_MB();
5095 
5096         return team;
5097     }
5098 
5099     /* next, let's try to take one from the team pool */
5100     KMP_MB();
5101     for( team = (kmp_team_t*) __kmp_team_pool ; (team) ; )
5102     {
5103         /* TODO: consider resizing undersized teams instead of reaping them, now that we have a resizing mechanism */
5104         if ( team->t.t_max_nproc >= max_nproc ) {
5105             /* take this team from the team pool */
5106             __kmp_team_pool = team->t.t_next_pool;
5107 
5108             /* setup the team for fresh use */
5109             __kmp_initialize_team( team, new_nproc, new_icvs, NULL );
5110 
5111             KA_TRACE( 20, ( "__kmp_allocate_team: setting task_team[0] %p and task_team[1] %p to NULL\n",
5112                             &team->t.t_task_team[0], &team->t.t_task_team[1]) );
5113             team->t.t_task_team[0] = NULL;
5114             team->t.t_task_team[1] = NULL;
5115 
5116             /* reallocate space for arguments if necessary */
5117             __kmp_alloc_argv_entries( argc, team, TRUE );
5118             KMP_CHECK_UPDATE(team->t.t_argc, argc);
5119 
5120             KA_TRACE( 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5121                             team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
5122             { // Initialize barrier data.
5123                 int b;
5124                 for ( b = 0; b < bs_last_barrier; ++ b) {
5125                     team->t.t_bar[ b ].b_arrived        = KMP_INIT_BARRIER_STATE;
5126 #if USE_DEBUGGER
5127                     team->t.t_bar[ b ].b_master_arrived = 0;
5128                     team->t.t_bar[ b ].b_team_arrived   = 0;
5129 #endif
5130                 }
5131             }
5132 
5133 #if OMP_40_ENABLED
5134             team->t.t_proc_bind = new_proc_bind;
5135 #endif
5136 
5137             KA_TRACE( 20, ("__kmp_allocate_team: using team from pool %d.\n", team->t.t_id ));
5138 
5139 #if OMPT_SUPPORT
5140             __ompt_team_assign_id(team, ompt_parallel_id);
5141 #endif
5142 
5143             KMP_MB();
5144 
5145             return team;
5146         }
5147 
5148         /* reap team if it is too small, then loop back and check the next one */
        /* not sure if this is wise, but it will be redone during the hot-teams rewrite. */
5150         /* TODO: Use technique to find the right size hot-team, don't reap them */
5151         team =  __kmp_reap_team( team );
5152         __kmp_team_pool = team;
5153     }
5154 
5155     /* nothing available in the pool, no matter, make a new team! */
5156     KMP_MB();
5157     team = (kmp_team_t*) __kmp_allocate( sizeof( kmp_team_t ) );
5158 
5159     /* and set it up */
5160     team->t.t_max_nproc   = max_nproc;
5161     /* NOTE well, for some reason allocating one big buffer and dividing it
5162      * up seems to really hurt performance a lot on the P4, so, let's not use
5163      * this... */
5164     __kmp_allocate_team_arrays( team, max_nproc );
5165 
5166     KA_TRACE( 20, ( "__kmp_allocate_team: making a new team\n" ) );
5167     __kmp_initialize_team( team, new_nproc, new_icvs, NULL );
5168 
5169     KA_TRACE( 20, ( "__kmp_allocate_team: setting task_team[0] %p and task_team[1] %p to NULL\n",
5170                     &team->t.t_task_team[0], &team->t.t_task_team[1] ) );
5171     team->t.t_task_team[0] = NULL;    // to be removed, as __kmp_allocate zeroes memory, no need to duplicate
5172     team->t.t_task_team[1] = NULL;    // to be removed, as __kmp_allocate zeroes memory, no need to duplicate
5173 
5174     if ( __kmp_storage_map ) {
5175         __kmp_print_team_storage_map( "team", team, team->t.t_id, new_nproc );
5176     }
5177 
5178     /* allocate space for arguments */
5179     __kmp_alloc_argv_entries( argc, team, FALSE );
5180     team->t.t_argc        = argc;
5181 
5182     KA_TRACE( 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5183                     team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
5184     { // Initialize barrier data.
5185         int b;
5186         for ( b = 0; b < bs_last_barrier; ++ b ) {
5187             team->t.t_bar[ b ].b_arrived        = KMP_INIT_BARRIER_STATE;
5188 #if USE_DEBUGGER
5189             team->t.t_bar[ b ].b_master_arrived = 0;
5190             team->t.t_bar[ b ].b_team_arrived   = 0;
5191 #endif
5192         }
5193     }
5194 
5195 #if OMP_40_ENABLED
5196     team->t.t_proc_bind = new_proc_bind;
5197 #endif
5198 
5199 #if OMPT_SUPPORT
5200     __ompt_team_assign_id(team, ompt_parallel_id);
5201     team->t.ompt_serialized_team_info = NULL;
5202 #endif
5203 
5204     KMP_MB();
5205 
5206     KA_TRACE( 20, ("__kmp_allocate_team: done creating a new team %d.\n", team->t.t_id ));
5207 
5208     return team;
5209 }
5210 
5211 /* TODO implement hot-teams at all levels */
5212 /* TODO implement lazy thread release on demand (disband request) */
5213 
5214 /* free the team.  return it to the team pool.  release all the threads
5215  * associated with it */
5216 void
5217 __kmp_free_team( kmp_root_t *root, kmp_team_t *team  USE_NESTED_HOT_ARG(kmp_info_t *master) )
5218 {
5219     int f;
5220     KA_TRACE( 20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(), team->t.t_id ));
5221 
5222     /* verify state */
5223     KMP_DEBUG_ASSERT( root );
5224     KMP_DEBUG_ASSERT( team );
5225     KMP_DEBUG_ASSERT( team->t.t_nproc <= team->t.t_max_nproc );
5226     KMP_DEBUG_ASSERT( team->t.t_threads );
5227 
5228     int use_hot_team = team == root->r.r_hot_team;
5229 #if KMP_NESTED_HOT_TEAMS
5230     int level;
5231     kmp_hot_team_ptr_t *hot_teams;
5232     if( master ) {
5233         level = team->t.t_active_level - 1;
5234         if( master->th.th_teams_microtask ) {                         // in teams construct?
5235             if( master->th.th_teams_size.nteams > 1 ) {
5236                ++level; // level was not increased in teams construct for team_of_masters
5237             }
5238             if( team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5239                 master->th.th_teams_level == team->t.t_level ) {
5240                 ++level; // level was not increased in teams construct for team_of_workers before the parallel
5241             }            // team->t.t_level will be increased inside parallel
5242         }
5243         hot_teams = master->th.th_hot_teams;
5244         if( level < __kmp_hot_teams_max_level ) {
5245             KMP_DEBUG_ASSERT( team == hot_teams[level].hot_team );
5246             use_hot_team = 1;
5247         }
5248     }
5249 #endif // KMP_NESTED_HOT_TEAMS
5250 
5251     /* team is done working */
5252     TCW_SYNC_PTR(team->t.t_pkfn, NULL); // Important for Debugging Support Library.
5253     team->t.t_copyin_counter = 0; // init counter for possible reuse
5254     // Do not reset pointer to parent team to NULL for hot teams.
5255 
5256     /* if we are non-hot team, release our threads */
5257     if( ! use_hot_team ) {
5258         if ( __kmp_tasking_mode != tskm_immediate_exec ) {
5259             // Delete task teams
5260             int tt_idx;
5261             for (tt_idx=0; tt_idx<2; ++tt_idx) {
5262                 kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5263                 if ( task_team != NULL ) {
5264                     for (f=0; f<team->t.t_nproc; ++f) { // Have all threads unref task teams
5265                         team->t.t_threads[f]->th.th_task_team = NULL;
5266                     }
5267                     KA_TRACE( 20, ( "__kmp_free_team: T#%d deactivating task_team %p on team %d\n", __kmp_get_gtid(), task_team, team->t.t_id ) );
5268 #if KMP_NESTED_HOT_TEAMS
5269                     __kmp_free_task_team( master, task_team );
5270 #endif
5271                     team->t.t_task_team[tt_idx] = NULL;
5272                 }
5273             }
5274         }
5275 
5276         // Reset pointer to parent team only for non-hot teams.
5277         team->t.t_parent = NULL;
5278         team->t.t_level = 0;
5279         team->t.t_active_level = 0;
5280 
5281         /* free the worker threads */
5282         for ( f = 1; f < team->t.t_nproc; ++ f ) {
5283             KMP_DEBUG_ASSERT( team->t.t_threads[ f ] );
5284             __kmp_free_thread( team->t.t_threads[ f ] );
5285             team->t.t_threads[ f ] = NULL;
5286         }
5287 
5288         /* put the team back in the team pool */
5289         /* TODO limit size of team pool, call reap_team if pool too large */
5290         team->t.t_next_pool  = (kmp_team_t*) __kmp_team_pool;
5291         __kmp_team_pool        = (volatile kmp_team_t*) team;
5292     }
5293 
5294     KMP_MB();
5295 }
5296 
5297 
5298 /* reap the team.  destroy it, reclaim all its resources and free its memory */
5299 kmp_team_t *
5300 __kmp_reap_team( kmp_team_t *team )
5301 {
5302     kmp_team_t *next_pool = team->t.t_next_pool;
5303 
5304     KMP_DEBUG_ASSERT( team );
5305     KMP_DEBUG_ASSERT( team->t.t_dispatch    );
5306     KMP_DEBUG_ASSERT( team->t.t_disp_buffer );
5307     KMP_DEBUG_ASSERT( team->t.t_threads     );
5308     KMP_DEBUG_ASSERT( team->t.t_argv        );
5309 
5310     /* TODO clean the threads that are a part of this? */
5311 
5312     /* free stuff */
5313 
5314     __kmp_free_team_arrays( team );
5315     if ( team->t.t_argv != &team->t.t_inline_argv[0] )
5316         __kmp_free( (void*) team->t.t_argv );
5317     __kmp_free( team );
5318 
5319     KMP_MB();
5320     return next_pool;
5321 }
5322 
5323 //
5324 // Free the thread.  Don't reap it, just place it on the pool of available
5325 // threads.
5326 //
5327 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5328 // binding for the affinity mechanism to be useful.
5329 //
5330 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5331 // However, we want to avoid a potential performance problem by always
5332 // scanning through the list to find the correct point at which to insert
5333 // the thread (potential N**2 behavior).  To do this we keep track of the
5334 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5335 // With single-level parallelism, threads will always be added to the tail
5336 // of the list, kept track of by __kmp_thread_pool_insert_pt.  With nested
5337 // parallelism, all bets are off and we may need to scan through the entire
5338 // free list.
5339 //
5340 // This change also has a potentially large performance benefit, for some
5341 // applications.  Previously, as threads were freed from the hot team, they
5342 // would be placed back on the free list in inverse order.  If the hot team
5343 // grew back to its original size, then the freed threads would be placed
5344 // back on the hot team in reverse order.  This could cause bad cache
5345 // locality problems in programs where the size of the hot team regularly
5346 // grew and shrank.
5347 //
5348 // Now, for single-level parallelism, the OMP tid is always == gtid.
5349 //
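// The code below is an illustrative sketch of the cached-insert-point technique
// described above. It is kept under #if 0 and is never compiled; example_node_t,
// example_pool and example_insert_pt are simplified placeholders, not the real
// kmp_info_t / __kmp_thread_pool data structures.
#if 0
typedef struct example_node {
    int                  gtid;
    struct example_node *next;
} example_node_t;

static example_node_t *example_pool      = NULL;   /* sorted by ascending gtid */
static example_node_t *example_insert_pt = NULL;   /* last insertion point     */

static void
example_pool_insert( example_node_t *node )
{
    example_node_t **scan;

    /* If the cached insert point is already past the new node, re-scan from the head. */
    if ( example_insert_pt != NULL && example_insert_pt->gtid > node->gtid ) {
        example_insert_pt = NULL;
    }
    scan = ( example_insert_pt != NULL ) ? &example_insert_pt->next : &example_pool;

    /* scan always holds the address of the link to rewrite, so the head of the
       list needs no special case. */
    while ( *scan != NULL && (*scan)->gtid < node->gtid ) {
        scan = &(*scan)->next;
    }
    node->next        = *scan;
    *scan             = node;
    example_insert_pt = node;
}
#endif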
5350 void
5351 __kmp_free_thread( kmp_info_t *this_th )
5352 {
5353     int gtid;
5354     kmp_info_t **scan;
5355 
5356     KA_TRACE( 20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5357                 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid ));
5358 
5359     KMP_DEBUG_ASSERT( this_th );
5360 
5361     // When moving a thread to the pool, switch it to waiting on its own b_go flag and clear its barrier team pointers (NULL team).
5362     int b;
5363     kmp_balign_t *balign = this_th->th.th_bar;
5364     for (b=0; b<bs_last_barrier; ++b) {
5365         if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5366             balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5367         balign[b].bb.team = NULL;
5368         balign[b].bb.leaf_kids = 0;
5369     }
5370     this_th->th.th_task_state = 0;
5371 
5372     /* put thread back on the free pool */
5373     TCW_PTR(this_th->th.th_team, NULL);
5374     TCW_PTR(this_th->th.th_root, NULL);
5375     TCW_PTR(this_th->th.th_dispatch, NULL);               /* NOT NEEDED */
5376 
5377     //
5378     // If the __kmp_thread_pool_insert_pt is already past the new insert
5379     // point, then we need to re-scan the entire list.
5380     //
5381     gtid = this_th->th.th_info.ds.ds_gtid;
5382     if ( __kmp_thread_pool_insert_pt != NULL ) {
5383         KMP_DEBUG_ASSERT( __kmp_thread_pool != NULL );
5384         if ( __kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid ) {
5385              __kmp_thread_pool_insert_pt = NULL;
5386         }
5387     }
5388 
5389     //
5390     // Scan down the list to find the place to insert the thread.
5391     // scan is the address of a link in the list, possibly the address of
5392     // __kmp_thread_pool itself.
5393     //
5394     // In the absence of nested parallelism, the for loop will have 0 iterations.
5395     //
5396     if ( __kmp_thread_pool_insert_pt != NULL ) {
5397         scan = &( __kmp_thread_pool_insert_pt->th.th_next_pool );
5398     }
5399     else {
5400         scan = (kmp_info_t **)&__kmp_thread_pool;
5401     }
5402     for (; ( *scan != NULL ) && ( (*scan)->th.th_info.ds.ds_gtid < gtid );
5403       scan = &( (*scan)->th.th_next_pool ) );
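    // (The for loop above has an intentionally empty body: all the work happens in the
    //  loop header, which advances scan to the address of the first link whose gtid is
    //  not less than the gtid being inserted.)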
5404 
5405     //
5406     // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5407     // to its address.
5408     //
5409     TCW_PTR(this_th->th.th_next_pool, *scan);
5410     __kmp_thread_pool_insert_pt = *scan = this_th;
5411     KMP_DEBUG_ASSERT( ( this_th->th.th_next_pool == NULL )
5412       || ( this_th->th.th_info.ds.ds_gtid
5413       < this_th->th.th_next_pool->th.th_info.ds.ds_gtid ) );
5414     TCW_4(this_th->th.th_in_pool, TRUE);
5415     __kmp_thread_pool_nth++;
5416 
5417     TCW_4(__kmp_nth, __kmp_nth - 1);
5418 
5419 #ifdef KMP_ADJUST_BLOCKTIME
5420     /* Adjust blocktime back to user setting or default if necessary */
5421     /* Middle initialization might never have occurred                */
5422     if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
5423         KMP_DEBUG_ASSERT( __kmp_avail_proc > 0 );
5424         if ( __kmp_nth <= __kmp_avail_proc ) {
5425             __kmp_zero_bt = FALSE;
5426         }
5427     }
5428 #endif /* KMP_ADJUST_BLOCKTIME */
5429 
5430     KMP_MB();
5431 }
5432 
5433 
5434 /* ------------------------------------------------------------------------ */
5435 
5436 void *
5437 __kmp_launch_thread( kmp_info_t *this_thr )
5438 {
5439     int                   gtid = this_thr->th.th_info.ds.ds_gtid;
5440 /*    void                 *stack_data;*/
5441     kmp_team_t *(*volatile pteam);
5442 
5443     KMP_MB();
5444     KA_TRACE( 10, ("__kmp_launch_thread: T#%d start\n", gtid ) );
5445 
5446     if( __kmp_env_consistency_check ) {
5447         this_thr->th.th_cons = __kmp_allocate_cons_stack( gtid );  // ATT: Memory leak?
5448     }
5449 
5450 #if OMPT_SUPPORT
5451     if (ompt_enabled) {
5452         this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5453         this_thr->th.ompt_thread_info.wait_id = 0;
5454         this_thr->th.ompt_thread_info.idle_frame = __builtin_frame_address(0);
5455         if (ompt_callbacks.ompt_callback(ompt_event_thread_begin)) {
5456             __ompt_thread_begin(ompt_thread_worker, gtid);
5457         }
5458     }
5459 #endif
5460 
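    // Worker lifecycle: until __kmp_global.g.g_done is set, each worker sleeps in the
    // fork barrier, wakes when a team and microtask have been assigned to it, runs the
    // microtask through (*pteam)->t.t_invoke(), and then waits in the join barrier
    // before returning to the top of the loop.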
5461     /* This is the place where threads wait for work */
5462     while( ! TCR_4(__kmp_global.g.g_done) ) {
5463         KMP_DEBUG_ASSERT( this_thr == __kmp_threads[ gtid ] );
5464         KMP_MB();
5465 
5466         /* wait for work to do */
5467         KA_TRACE( 20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid ));
5468 
5469 #if OMPT_SUPPORT
5470         if (ompt_enabled) {
5471             this_thr->th.ompt_thread_info.state = ompt_state_idle;
5472         }
5473 #endif
5474 
5475         /* No tid yet since not part of a team */
5476         __kmp_fork_barrier( gtid, KMP_GTID_DNE );
5477 
5478 #if OMPT_SUPPORT
5479         if (ompt_enabled) {
5480             this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5481         }
5482 #endif
5483 
5484         pteam = (kmp_team_t *(*))(& this_thr->th.th_team);
5485 
5486         /* have we been allocated? */
5487         if ( TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done) ) {
5488 #if OMPT_SUPPORT
5489             ompt_task_info_t *task_info;
5490             ompt_parallel_id_t my_parallel_id;
5491             if (ompt_enabled) {
5492                 task_info = __ompt_get_taskinfo(0);
5493                 my_parallel_id = (*pteam)->t.ompt_team_info.parallel_id;
5494             }
5495 #endif
5496             /* we were just woken up, so run our new task */
5497             if ( TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL ) {
5498                 int rc;
5499                 KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5500                               gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), (*pteam)->t.t_pkfn));
5501 
5502                 updateHWFPControl (*pteam);
5503 
5504 #if OMPT_SUPPORT
5505                 if (ompt_enabled) {
5506                     this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5507                     // Initialize OMPT task id for implicit task.
5508                     int tid = __kmp_tid_from_gtid(gtid);
5509                     task_info->task_id = __ompt_task_id_new(tid);
5510                 }
5511 #endif
5512 
5513                 {
5514                     KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
5515                     KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
5516                     rc = (*pteam)->t.t_invoke( gtid );
5517                 }
5518                 KMP_ASSERT( rc );
5519 
5520 #if OMPT_SUPPORT
5521                 if (ompt_enabled) {
5522                     /* no frame set while outside task */
5523                     task_info->frame.exit_runtime_frame = NULL;
5524 
5525                     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5526                 }
5527 #endif
5528                 KMP_MB();
5529                 KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5530                               gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), (*pteam)->t.t_pkfn));
5531             }
5532             /* join barrier after parallel region */
5533             __kmp_join_barrier( gtid );
5534 #if OMPT_SUPPORT && OMPT_TRACE
5535             if (ompt_enabled) {
5536                 if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
5537                     // don't access *pteam here: it may have already been freed
5538                     // by the master thread behind the barrier (possible race)
5539                     ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
5540                         my_parallel_id, task_info->task_id);
5541                 }
5542                 task_info->frame.exit_runtime_frame = NULL;
5543                 task_info->task_id = 0;
5544             }
5545 #endif
5546         }
5547     }
5548     TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5549 
5550 #if OMPT_SUPPORT
5551     if (ompt_enabled &&
5552         ompt_callbacks.ompt_callback(ompt_event_thread_end)) {
5553         __ompt_thread_end(ompt_thread_worker, gtid);
5554     }
5555 #endif
5556 
5557     this_thr->th.th_task_team = NULL;
5558     /* run the destructors for the threadprivate data for this thread */
5559     __kmp_common_destroy_gtid( gtid );
5560 
5561     KA_TRACE( 10, ("__kmp_launch_thread: T#%d done\n", gtid ) );
5562     KMP_MB();
5563     return this_thr;
5564 }
5565 
5566 /* ------------------------------------------------------------------------ */
5567 /* ------------------------------------------------------------------------ */
5568 
5569 void
5570 __kmp_internal_end_dest( void *specific_gtid )
5571 {
5572     #if KMP_COMPILER_ICC
5573         #pragma warning( push )
5574         #pragma warning( disable:  810 ) // conversion from "void *" to "int" may lose significant bits
5575     #endif
5576     // Make sure no significant bits are lost
5577     int gtid = (kmp_intptr_t)specific_gtid - 1;
5578     #if KMP_COMPILER_ICC
5579         #pragma warning( pop )
5580     #endif
5581 
5582     KA_TRACE( 30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5583     /* NOTE: the gtid is stored as gtid+1 in thread-local storage,
5584      * because 0 is reserved for the nothing-stored case */
5585 
5586     /* josh: One reason for setting the gtid specific data even when it is being
5587        destroyed by pthread is to allow gtid lookup through thread specific data
5588        (__kmp_gtid_get_specific).  Some of the code, especially stat code,
5589        that gets executed in the call to __kmp_internal_end_thread, actually
5590        gets the gtid through the thread specific data.  Setting it here seems
5591        rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread
5592        to run smoothly.
5593        todo: get rid of this after we remove the dependence on
5594        __kmp_gtid_get_specific
5595     */
5596     if(gtid >= 0 && KMP_UBER_GTID(gtid))
5597         __kmp_gtid_set_specific( gtid );
5598     #ifdef KMP_TDATA_GTID
5599         __kmp_gtid = gtid;
5600     #endif
5601     __kmp_internal_end_thread( gtid );
5602 }
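// Illustrative sketch (kept under #if 0, not compiled) of the gtid+1 encoding decoded
// above: 0 is reserved for "nothing stored", so gtid g is stored as the pointer value
// g+1. The helper names below are placeholders used only for illustration.
#if 0
static void * example_encode_gtid( int gtid ) { return (void *)(kmp_intptr_t)( gtid + 1 ); }
static int    example_decode_gtid( void *val ) { return (int)( (kmp_intptr_t)val - 1 );     }
#endif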
5603 
5604 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5605 
5606 // 2009-09-08 (lev): It looks like the destructor does not work. In simple test cases destructors work
5607 // perfectly, but in the real libomp.so I have no evidence it is ever called. However, the -fini linker
5608 // option in makefile.mk works fine.
5609 
5610 __attribute__(( destructor ))
5611 void
5612 __kmp_internal_end_dtor( void )
5613 {
5614     __kmp_internal_end_atexit();
5615 }
5616 
5617 void
5618 __kmp_internal_end_fini( void )
5619 {
5620     __kmp_internal_end_atexit();
5621 }
5622 
5623 #endif
5624 
5625 /* [Windows] josh: when the atexit handler is called, there may still be more than one thread alive */
5626 void
5627 __kmp_internal_end_atexit( void )
5628 {
5629     KA_TRACE( 30, ( "__kmp_internal_end_atexit\n" ) );
5630     /* [Windows]
5631        josh: ideally, we want to completely shut down the library in this atexit handler, but
5632        stat code that depends on thread specific data for gtid fails because that data becomes
5633        unavailable at some point during the shutdown, so we call __kmp_internal_end_thread
5634        instead.  We should eventually remove the dependency on __kmp_get_specific_gtid in the
5635        stat code and use __kmp_internal_end_library to cleanly shut down the library.
5636 
5637 // TODO: Can some of this comment about GVS be removed?
5638        I suspect that the offending stat code is executed when the calling thread tries to
5639        clean up a dead root thread's data structures, resulting in GVS code trying to close
5640        the GVS structures for that thread, but since the stat code uses
5641        __kmp_get_specific_gtid to get the gtid with the assumption that the calling thread is
5642        cleaning up itself instead of another thread, it gets confused.  This happens because
5643        allowing a thread to unregister and cleanup another thread is a recent modification for
5644        addressing an issue with Maxon Cinema4D.  Based on the current design (20050722), a
5645        thread may end up trying to unregister another thread only if thread death does not
5646        trigger the calling of __kmp_internal_end_thread.  For Linux* OS, there is the thread
5647        specific data destructor function to detect thread death.  For Windows dynamic, there
5648        is DllMain(THREAD_DETACH).  For Windows static, there is nothing.  Thus, the
5649        workaround is applicable only for Windows static stat library.
5650     */
5651     __kmp_internal_end_library( -1 );
5652     #if KMP_OS_WINDOWS
5653         __kmp_close_console();
5654     #endif
5655 }
5656 
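// Reap a single thread. For a non-root thread this releases it from the fork barrier,
// joins the underlying OS thread, and updates the thread-pool counters. It then frees
// the thread's implicit task, fast-memory pool, suspend state, consistency-check stack,
// th_pri_common block, task-state memo stack, BGET data, and affinity mask, reaps the
// thread's serial team, and finally frees the kmp_info_t itself. The caller is expected
// to hold __kmp_forkjoin_lock (see the note inside the function).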
5657 static void
5658 __kmp_reap_thread(
5659     kmp_info_t * thread,
5660     int is_root
5661 ) {
5662 
5663     // It is assumed __kmp_forkjoin_lock is acquired.
5664 
5665     int gtid;
5666 
5667     KMP_DEBUG_ASSERT( thread != NULL );
5668 
5669     gtid = thread->th.th_info.ds.ds_gtid;
5670 
5671     if ( ! is_root ) {
5672 
5673         if ( __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ) {
5674             /* Assume the threads are at the fork barrier here */
5675             KA_TRACE( 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n", gtid ) );
5676             /* Need release fence here to prevent seg faults for tree forkjoin barrier (GEH) */
5677             ANNOTATE_HAPPENS_BEFORE(thread);
5678             kmp_flag_64 flag(&thread->th.th_bar[ bs_forkjoin_barrier ].bb.b_go, thread);
5679             __kmp_release_64(&flag);
5680         }; // if
5681 
5682         // Terminate OS thread.
5683         __kmp_reap_worker( thread );
5684 
5685         //
5686         // The thread was killed asynchronously.  If it was actively
5687         // spinning in the thread pool, decrement the global count.
5688         //
5689         // There is a small timing hole here - if the worker thread was
5690         // just waking up after sleeping in the pool, had reset its
5691         // th_active_in_pool flag but not decremented the global counter
5692         // __kmp_thread_pool_active_nth yet, then the global counter
5693         // might not get updated.
5694         //
5695         // Currently, this can only happen as the library is unloaded,
5696         // so there are no harmful side effects.
5697         //
5698         if ( thread->th.th_active_in_pool ) {
5699             thread->th.th_active_in_pool = FALSE;
5700             KMP_TEST_THEN_DEC32(
5701               (kmp_int32 *) &__kmp_thread_pool_active_nth );
5702             KMP_DEBUG_ASSERT( TCR_4(__kmp_thread_pool_active_nth) >= 0 );
5703         }
5704 
5705         // Decrement # of [worker] threads in the pool.
5706         KMP_DEBUG_ASSERT( __kmp_thread_pool_nth > 0 );
5707         --__kmp_thread_pool_nth;
5708     }; // if
5709 
5710     __kmp_free_implicit_task(thread);
5711 
5712     // Free the fast memory for tasking
5713     #if USE_FAST_MEMORY
5714         __kmp_free_fast_memory( thread );
5715     #endif /* USE_FAST_MEMORY */
5716 
5717     __kmp_suspend_uninitialize_thread( thread );
5718 
5719     KMP_DEBUG_ASSERT( __kmp_threads[ gtid ] == thread );
5720     TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5721 
5722     -- __kmp_all_nth;
5723     // __kmp_nth was decremented when thread is added to the pool.
5724 
5725 #ifdef KMP_ADJUST_BLOCKTIME
5726     /* Adjust blocktime back to user setting or default if necessary */
5727     /* Middle initialization might never have occurred                */
5728     if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
5729         KMP_DEBUG_ASSERT( __kmp_avail_proc > 0 );
5730         if ( __kmp_nth <= __kmp_avail_proc ) {
5731             __kmp_zero_bt = FALSE;
5732         }
5733     }
5734 #endif /* KMP_ADJUST_BLOCKTIME */
5735 
5736     /* free the memory being used */
5737     if( __kmp_env_consistency_check ) {
5738         if ( thread->th.th_cons ) {
5739             __kmp_free_cons_stack( thread->th.th_cons );
5740             thread->th.th_cons = NULL;
5741         }; // if
5742     }
5743 
5744     if ( thread->th.th_pri_common != NULL ) {
5745         __kmp_free( thread->th.th_pri_common );
5746         thread->th.th_pri_common = NULL;
5747     }; // if
5748 
5749     if (thread->th.th_task_state_memo_stack != NULL) {
5750         __kmp_free(thread->th.th_task_state_memo_stack);
5751         thread->th.th_task_state_memo_stack = NULL;
5752     }
5753 
5754     #if KMP_USE_BGET
5755         if ( thread->th.th_local.bget_data != NULL ) {
5756             __kmp_finalize_bget( thread );
5757         }; // if
5758     #endif
5759 
5760 #if KMP_AFFINITY_SUPPORTED
5761     if ( thread->th.th_affin_mask != NULL ) {
5762         KMP_CPU_FREE( thread->th.th_affin_mask );
5763         thread->th.th_affin_mask = NULL;
5764     }; // if
5765 #endif /* KMP_AFFINITY_SUPPORTED */
5766 
5767     __kmp_reap_team( thread->th.th_serial_team );
5768     thread->th.th_serial_team = NULL;
5769     __kmp_free( thread );
5770 
5771     KMP_MB();
5772 
5773 } // __kmp_reap_thread
5774 
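// Common shutdown path for __kmp_internal_end_library() and __kmp_internal_end_thread().
// Unregisters the library, sets g_done, and -- if no root thread is still active --
// reaps every pooled worker thread, pooled team, and task team (plus the monitor thread
// when KMP_USE_MONITOR is enabled) before running __kmp_cleanup() and, with OMPT
// support, ompt_fini(). If some root is still active, only the monitor (when enabled)
// is reaped and the full teardown is skipped.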
5775 static void
5776 __kmp_internal_end(void)
5777 {
5778     int i;
5779 
5780     /* First, unregister the library */
5781     __kmp_unregister_library();
5782 
5783     #if KMP_OS_WINDOWS
5784         /* In the Windows static library, we can't tell when a root actually dies, so we
5785            reclaim the data structures for any root threads that have died but not
5786            unregistered themselves, in order to shut down cleanly.
5787            In the Windows dynamic library we also can't tell when a thread dies.
5788         */
5789         __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of dead roots
5790     #endif
5791 
5792     for( i=0 ; i<__kmp_threads_capacity ; i++ )
5793         if( __kmp_root[i] )
5794             if( __kmp_root[i]->r.r_active )
5795                 break;
5796     KMP_MB();       /* Flush all pending memory write invalidates.  */
5797     TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
5798 
5799     if ( i < __kmp_threads_capacity ) {
5800 #if KMP_USE_MONITOR
5801         // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
5802         KMP_MB();       /* Flush all pending memory write invalidates.  */
5803 
5804         //
5805         // Need to check that monitor was initialized before reaping it.
5806         // If we are called from __kmp_atfork_child (which sets
5807         // __kmp_init_parallel = 0), then __kmp_monitor will appear to
5808         // contain valid data, but it is only valid in the parent process,
5809         // not the child.
5810         //
5811         // New behavior (201008): instead of keying off of the flag
5812         // __kmp_init_parallel, the monitor thread creation is keyed off
5813         // of the new flag __kmp_init_monitor.
5814         //
5815         __kmp_acquire_bootstrap_lock( & __kmp_monitor_lock );
5816         if ( TCR_4( __kmp_init_monitor ) ) {
5817             __kmp_reap_monitor( & __kmp_monitor );
5818             TCW_4( __kmp_init_monitor, 0 );
5819         }
5820         __kmp_release_bootstrap_lock( & __kmp_monitor_lock );
5821         KA_TRACE( 10, ("__kmp_internal_end: monitor reaped\n" ) );
5822 #endif // KMP_USE_MONITOR
5823     } else {
5824         /* TODO move this to cleanup code */
5825         #ifdef KMP_DEBUG
5826             /* make sure that everything has properly ended */
5827             for ( i = 0; i < __kmp_threads_capacity; i++ ) {
5828                 if( __kmp_root[i] ) {
5829 //                    KMP_ASSERT( ! KMP_UBER_GTID( i ) );         // AC: there can be uber threads alive here
5830                     KMP_ASSERT( ! __kmp_root[i]->r.r_active );  // TODO: can they be active?
5831                 }
5832             }
5833         #endif
5834 
5835         KMP_MB();
5836 
5837         // Reap the worker threads.
5838         // This is valid for now, but be careful if threads are reaped sooner.
5839         while ( __kmp_thread_pool != NULL ) {    // Loop thru all the thread in the pool.
5840             // Get the next thread from the pool.
5841             kmp_info_t * thread = (kmp_info_t *) __kmp_thread_pool;
5842             __kmp_thread_pool = thread->th.th_next_pool;
5843             // Reap it.
5844             thread->th.th_next_pool = NULL;
5845             thread->th.th_in_pool = FALSE;
5846             __kmp_reap_thread( thread, 0 );
5847         }; // while
5848         __kmp_thread_pool_insert_pt = NULL;
5849 
5850         // Reap teams.
5851         while ( __kmp_team_pool != NULL ) {     // Loop thru all the teams in the pool.
5852             // Get the next team from the pool.
5853             kmp_team_t * team = (kmp_team_t *) __kmp_team_pool;
5854             __kmp_team_pool = team->t.t_next_pool;
5855             // Reap it.
5856             team->t.t_next_pool = NULL;
5857             __kmp_reap_team( team );
5858         }; // while
5859 
5860         __kmp_reap_task_teams( );
5861 
5862         for ( i = 0; i < __kmp_threads_capacity; ++ i ) {
5863             // TBD: Add some checking...
5864             // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
5865         }
5866 
5867         /* Make sure all threadprivate destructors get run by joining with all worker
5868            threads before resetting this flag */
5869         TCW_SYNC_4(__kmp_init_common, FALSE);
5870 
5871         KA_TRACE( 10, ("__kmp_internal_end: all workers reaped\n" ) );
5872         KMP_MB();
5873 
5874 #if KMP_USE_MONITOR
5875         //
5876         // See note above: One of the possible fixes for CQ138434 / CQ140126
5877         //
5878         // FIXME: push both code fragments down and CSE them?
5879         // push them into __kmp_cleanup() ?
5880         //
5881         __kmp_acquire_bootstrap_lock( & __kmp_monitor_lock );
5882         if ( TCR_4( __kmp_init_monitor ) ) {
5883             __kmp_reap_monitor( & __kmp_monitor );
5884             TCW_4( __kmp_init_monitor, 0 );
5885         }
5886         __kmp_release_bootstrap_lock( & __kmp_monitor_lock );
5887         KA_TRACE( 10, ("__kmp_internal_end: monitor reaped\n" ) );
5888 #endif
5889     } /* else !__kmp_global.t_active */
5890     TCW_4(__kmp_init_gtid, FALSE);
5891     KMP_MB();       /* Flush all pending memory write invalidates.  */
5892 
5893     __kmp_cleanup();
5894 #if OMPT_SUPPORT
5895     ompt_fini();
5896 #endif
5897 }
5898 
5899 void
5900 __kmp_internal_end_library( int gtid_req )
5901 {
5902     /* if we have already cleaned up, don't try again, it wouldn't be pretty */
5903     /* this shouldn't be a race condition because __kmp_internal_end() is the
5904      * only place to clear __kmp_init_serial */
5905     /* we'll check this later too, after we get the lock */
5906     // 2009-09-06: We do not set g_abort without setting g_done. This check looks redundant,
5907     // because the next check will work in any case.
5908     if( __kmp_global.g.g_abort ) {
5909         KA_TRACE( 11, ("__kmp_internal_end_library: abort, exiting\n" ));
5910         /* TODO abort? */
5911         return;
5912     }
5913     if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
5914         KA_TRACE( 10, ("__kmp_internal_end_library: already finished\n" ));
5915         return;
5916     }
5917 
5918 
5919     KMP_MB();       /* Flush all pending memory write invalidates.  */
5920 
5921     /* find out who we are and what we should do */
5922     {
5923         int gtid = (gtid_req>=0) ? gtid_req : __kmp_gtid_get_specific();
5924         KA_TRACE( 10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req ));
5925         if( gtid == KMP_GTID_SHUTDOWN ) {
5926             KA_TRACE( 10, ("__kmp_internal_end_library: !__kmp_init_runtime, system already shutdown\n" ));
5927             return;
5928         } else if( gtid == KMP_GTID_MONITOR ) {
5929             KA_TRACE( 10, ("__kmp_internal_end_library: monitor thread, gtid not registered, or system shutdown\n" ));
5930             return;
5931         } else if( gtid == KMP_GTID_DNE ) {
5932             KA_TRACE( 10, ("__kmp_internal_end_library: gtid not registered or system shutdown\n" ));
5933             /* we don't know who we are, but we may still shut down the library */
5934         } else if( KMP_UBER_GTID( gtid )) {
5935             /* unregister ourselves as an uber thread.  gtid is no longer valid */
5936             if( __kmp_root[gtid]->r.r_active ) {
5937                 __kmp_global.g.g_abort = -1;
5938                 TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
5939                 KA_TRACE( 10, ("__kmp_internal_end_library: root still active, abort T#%d\n", gtid ));
5940                 return;
5941             } else {
5942                 KA_TRACE( 10, ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid ));
5943                 __kmp_unregister_root_current_thread( gtid );
5944             }
5945         } else {
5946             /* worker threads may call this function through the atexit handler, if they call exit() */
5947             /* For now, skip the usual subsequent processing and just dump the debug buffer.
5948                TODO: do a thorough shutdown instead
5949             */
5950             #ifdef DUMP_DEBUG_ON_EXIT
5951                 if ( __kmp_debug_buf )
5952                     __kmp_dump_debug_buffer( );
5953             #endif
5954             return;
5955         }
5956     }
5957     /* synchronize the termination process */
5958     __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
5959 
5960     /* have we already finished */
5961     if( __kmp_global.g.g_abort ) {
5962         KA_TRACE( 10, ("__kmp_internal_end_library: abort, exiting\n" ));
5963         /* TODO abort? */
5964         __kmp_release_bootstrap_lock( &__kmp_initz_lock );
5965         return;
5966     }
5967     if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
5968         __kmp_release_bootstrap_lock( &__kmp_initz_lock );
5969         return;
5970     }
5971 
5972     /* We need this lock to enforce mutex between this reading of
5973        __kmp_threads_capacity and the writing by __kmp_register_root.
5974        Alternatively, we can use a counter of roots that is
5975        atomically updated by __kmp_get_global_thread_id_reg,
5976        __kmp_do_serial_initialize and __kmp_internal_end_*.
5977     */
5978     __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
5979 
5980     /* now we can safely conduct the actual termination */
5981     __kmp_internal_end();
5982 
5983     __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
5984     __kmp_release_bootstrap_lock( &__kmp_initz_lock );
5985 
5986     KA_TRACE( 10, ("__kmp_internal_end_library: exit\n" ) );
5987 
5988     #ifdef DUMP_DEBUG_ON_EXIT
5989         if ( __kmp_debug_buf )
5990             __kmp_dump_debug_buffer();
5991     #endif
5992 
5993     #if KMP_OS_WINDOWS
5994         __kmp_close_console();
5995     #endif
5996 
5997     __kmp_fini_allocator();
5998 
5999 } // __kmp_internal_end_library
6000 
6001 void
6002 __kmp_internal_end_thread( int gtid_req )
6003 {
6004     int i;
6005 
6006     /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6007     /* this shouldn't be a race condition because __kmp_internal_end() is the
6008      * only place to clear __kmp_init_serial */
6009     /* we'll check this later too, after we get the lock */
6010     // 2009-09-06: We do not set g_abort without setting g_done. This check looks redundant,
6011     // because the next check will work in any case.
6012     if( __kmp_global.g.g_abort ) {
6013         KA_TRACE( 11, ("__kmp_internal_end_thread: abort, exiting\n" ));
6014         /* TODO abort? */
6015         return;
6016     }
6017     if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
6018         KA_TRACE( 10, ("__kmp_internal_end_thread: already finished\n" ));
6019         return;
6020     }
6021 
6022     KMP_MB();       /* Flush all pending memory write invalidates.  */
6023 
6024     /* find out who we are and what we should do */
6025     {
6026         int gtid = (gtid_req>=0) ? gtid_req : __kmp_gtid_get_specific();
6027         KA_TRACE( 10, ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req ));
6028         if( gtid == KMP_GTID_SHUTDOWN ) {
6029             KA_TRACE( 10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system already shutdown\n" ));
6030             return;
6031         } else if( gtid == KMP_GTID_MONITOR ) {
6032             KA_TRACE( 10, ("__kmp_internal_end_thread: monitor thread, gtid not registered, or system shutdown\n" ));
6033             return;
6034         } else if( gtid == KMP_GTID_DNE ) {
6035             KA_TRACE( 10, ("__kmp_internal_end_thread: gtid not registered or system shutdown\n" ));
6036             return;
6037             /* we don't know who we are */
6038         } else if( KMP_UBER_GTID( gtid )) {
6039         /* unregister ourselves as an uber thread.  gtid is no longer valid */
6040             if( __kmp_root[gtid]->r.r_active ) {
6041                 __kmp_global.g.g_abort = -1;
6042                 TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6043                 KA_TRACE( 10, ("__kmp_internal_end_thread: root still active, abort T#%d\n", gtid ));
6044                 return;
6045             } else {
6046                 KA_TRACE( 10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n", gtid ));
6047                 __kmp_unregister_root_current_thread( gtid );
6048             }
6049         } else {
6050             /* just a worker thread, let's leave */
6051             KA_TRACE( 10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid ));
6052 
6053             if ( gtid >= 0 ) {
6054                 __kmp_threads[gtid]->th.th_task_team = NULL;
6055             }
6056 
6057             KA_TRACE( 10, ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n", gtid ));
6058             return;
6059         }
6060     }
6061     #if defined KMP_DYNAMIC_LIB
6062     // AC: let's not shut down the Linux* OS dynamic library at the exit of an uber thread,
6063     //     because it is better to shut down later, in the library destructor.
6064     //     The reason for this change is a performance problem seen when a non-OpenMP thread
6065     //     in a loop forks and joins many OpenMP threads. We can save a lot of time by
6066     //     keeping the worker threads alive until program shutdown.
6067     // OM: Removed Linux* OS restriction to fix the crash on OS X* (DPD200239966) and
6068     //     Windows(DPD200287443) that occurs when using critical sections from foreign threads.
6069         KA_TRACE( 10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req) );
6070         return;
6071     #endif
6072     /* synchronize the termination process */
6073     __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
6074 
6075     /* have we already finished */
6076     if( __kmp_global.g.g_abort ) {
6077         KA_TRACE( 10, ("__kmp_internal_end_thread: abort, exiting\n" ));
6078         /* TODO abort? */
6079         __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6080         return;
6081     }
6082     if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
6083         __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6084         return;
6085     }
6086 
6087     /* We need this lock to enforce mutex between this reading of
6088        __kmp_threads_capacity and the writing by __kmp_register_root.
6089        Alternatively, we can use a counter of roots that is
6090        atomically updated by __kmp_get_global_thread_id_reg,
6091        __kmp_do_serial_initialize and __kmp_internal_end_*.
6092     */
6093 
6094     /* should we finish the run-time?  are all siblings done? */
6095     __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
6096 
6097     for ( i = 0; i < __kmp_threads_capacity; ++ i ) {
6098         if ( KMP_UBER_GTID( i ) ) {
6099             KA_TRACE( 10, ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i ));
6100             __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
6101             __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6102             return;
6103         };
6104     }
6105 
6106     /* now we can safely conduct the actual termination */
6107 
6108     __kmp_internal_end();
6109 
6110     __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
6111     __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6112 
6113     KA_TRACE( 10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req ) );
6114 
6115     #ifdef DUMP_DEBUG_ON_EXIT
6116         if ( __kmp_debug_buf )
6117             __kmp_dump_debug_buffer();
6118     #endif
6119 } // __kmp_internal_end_thread
6120 
6121 // -------------------------------------------------------------------------------------------------
6122 // Library registration stuff.
6123 
6124 static long   __kmp_registration_flag = 0;
6125     // Random value used to indicate library initialization.
6126 static char * __kmp_registration_str  = NULL;
6127     // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6128 
6129 
6130 static inline
6131 char *
6132 __kmp_reg_status_name() {
6133     /*
6134         On RHEL 3u5, if linked statically, getpid() returns different values in each thread.
6135         If registration and unregistration happen in different threads (omp_misc_other_root_exit.cpp test case),
6136         the name of the registered_lib_env env var cannot be found, because the name will contain a different pid.
6137     */
6138     return __kmp_str_format( "__KMP_REGISTERED_LIB_%d", (int) getpid() );
6139 } // __kmp_reg_status_name
6140 
6141 
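// __kmp_register_library_startup() publishes the string "<flag address>-<flag value>-<library file>"
// (built with "%p-%lx-%s") in the environment variable named by __kmp_reg_status_name(),
// but only if that variable is not already set. If it is already set, the existing value
// was written by another copy of the runtime: the value is parsed back, and that copy is
// treated as alive only when the encoded flag address is still mapped in this address
// space and still holds the encoded flag value. A live duplicate is fatal unless
// KMP_DUPLICATE_LIB_OK is set, while a stale value is simply cleared and registration is
// retried. A registered value might look roughly like "0x7f3a94c02010-cafe1a2b-libomp.so"
// (the address, flag bits, and file name here are illustrative, not actual output).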
6142 void
6143 __kmp_register_library_startup(
6144     void
6145 ) {
6146 
6147     char * name   = __kmp_reg_status_name();  // Name of the environment variable.
6148     int    done   = 0;
6149     union {
6150         double dtime;
6151         long   ltime;
6152     } time;
6153     #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6154         __kmp_initialize_system_tick();
6155     #endif
6156     __kmp_read_system_time( & time.dtime );
6157     __kmp_registration_flag = 0xCAFE0000L | ( time.ltime & 0x0000FFFFL );
6158     __kmp_registration_str =
6159         __kmp_str_format(
6160             "%p-%lx-%s",
6161             & __kmp_registration_flag,
6162             __kmp_registration_flag,
6163             KMP_LIBRARY_FILE
6164         );
6165 
6166     KA_TRACE( 50, ( "__kmp_register_library_startup: %s=\"%s\"\n", name, __kmp_registration_str ) );
6167 
6168     while ( ! done ) {
6169 
6170         char * value  = NULL; // Actual value of the environment variable.
6171 
6172         // Set the environment variable, but do not overwrite it if it already exists.
6173         __kmp_env_set( name, __kmp_registration_str, 0 );
6174         // Check that the variable was actually written.
6175         value = __kmp_env_get( name );
6176         if ( value != NULL && strcmp( value, __kmp_registration_str ) == 0 ) {
6177 
6178             done = 1;    // Ok, environment variable set successfully, exit the loop.
6179 
6180         } else {
6181 
6182             // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6183             // Check whether it is alive or dead.
6184             int    neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6185             char * tail          = value;
6186             char * flag_addr_str = NULL;
6187             char * flag_val_str  = NULL;
6188             char const * file_name     = NULL;
6189             __kmp_str_split( tail, '-', & flag_addr_str, & tail );
6190             __kmp_str_split( tail, '-', & flag_val_str,  & tail );
6191             file_name = tail;
6192             if ( tail != NULL ) {
6193                 long * flag_addr = 0;
6194                 long   flag_val  = 0;
6195                 KMP_SSCANF( flag_addr_str, "%p",  & flag_addr );
6196                 KMP_SSCANF( flag_val_str,  "%lx", & flag_val  );
6197                 if ( flag_addr != 0 && flag_val != 0 && strcmp( file_name, "" ) != 0 ) {
6198                     // First, check whether environment-encoded address is mapped into addr space.
6199                     // If so, dereference it to see if it still has the right value.
6200 
6201                     if ( __kmp_is_address_mapped( flag_addr ) && * flag_addr == flag_val ) {
6202                         neighbor = 1;
6203                     } else {
6204                         // If not, then we know the other copy of the library is no longer running.
6205                         neighbor = 2;
6206                     }; // if
6207                 }; // if
6208             }; // if
6209             switch ( neighbor ) {
6210                 case 0 :      // Cannot parse environment variable -- neighbor status unknown.
6211                     // Assume it is the incompatible format of a future version of the library.
6212                     // Assume the other library is alive.
6213                     // WARN( ... ); // TODO: Issue a warning.
6214                     file_name = "unknown library";
6215                     // Attention! Intentionally falling through to the next case.
6216                 case 1 : {    // Neighbor is alive.
6217                     // Check whether duplicate libraries are allowed.
6218                     char * duplicate_ok = __kmp_env_get( "KMP_DUPLICATE_LIB_OK" );
6219                     if ( ! __kmp_str_match_true( duplicate_ok ) ) {
6220                         // That's not allowed. Issue fatal error.
6221                         __kmp_msg(
6222                             kmp_ms_fatal,
6223                             KMP_MSG( DuplicateLibrary, KMP_LIBRARY_FILE, file_name ),
6224                             KMP_HNT( DuplicateLibrary ),
6225                             __kmp_msg_null
6226                         );
6227                     }; // if
6228                     KMP_INTERNAL_FREE( duplicate_ok );
6229                     __kmp_duplicate_library_ok = 1;
6230                     done = 1;    // Exit the loop.
6231                 } break;
6232                 case 2 : {    // Neighbor is dead.
6233                     // Clear the variable and try to register library again.
6234                     __kmp_env_unset( name );
6235                 }  break;
6236                 default : {
6237                     KMP_DEBUG_ASSERT( 0 );
6238                 } break;
6239             }; // switch
6240 
6241         }; // if
6242         KMP_INTERNAL_FREE( (void *) value );
6243 
6244     }; // while
6245     KMP_INTERNAL_FREE( (void *) name );
6246 
6247 } // func __kmp_register_library_startup
6248 
6249 
6250 void
6251 __kmp_unregister_library( void ) {
6252 
6253     char * name  = __kmp_reg_status_name();
6254     char * value = __kmp_env_get( name );
6255 
6256     KMP_DEBUG_ASSERT( __kmp_registration_flag != 0 );
6257     KMP_DEBUG_ASSERT( __kmp_registration_str  != NULL );
6258     if ( value != NULL && strcmp( value, __kmp_registration_str ) == 0 ) {
6259         // Ok, this is our variable. Delete it.
6260         __kmp_env_unset( name );
6261     }; // if
6262 
6263     KMP_INTERNAL_FREE( __kmp_registration_str );
6264     KMP_INTERNAL_FREE( value );
6265     KMP_INTERNAL_FREE( name );
6266 
6267     __kmp_registration_flag = 0;
6268     __kmp_registration_str  = NULL;
6269 
6270 } // __kmp_unregister_library
6271 
6272 
6273 // End of Library registration stuff.
6274 // -------------------------------------------------------------------------------------------------
6275 
6276 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
6277 
6278 static void __kmp_check_mic_type()
6279 {
6280     kmp_cpuid_t cpuid_state = {0};
6281     kmp_cpuid_t * cs_p = &cpuid_state;
6282     __kmp_x86_cpuid(1, 0, cs_p);
6283     // We don't support mic1 at the moment
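    // CPUID leaf 1 reports the processor signature in EAX (stepping in bits 3:0, model
    // in bits 7:4, family in bits 11:8, extended model in bits 19:16); the masks below
    // appear to match the KNC (family 0x0B) and KNL (family 0x06, model 0x57) signatures.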
6284     if( (cs_p->eax & 0xff0) == 0xB10 ) {
6285         __kmp_mic_type = mic2;
6286     } else if( (cs_p->eax & 0xf0ff0) == 0x50670 ) {
6287         __kmp_mic_type = mic3;
6288     } else {
6289         __kmp_mic_type = non_mic;
6290     }
6291 }
6292 
6293 #endif /* KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS) */
6294 
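// One-time serial initialization of the runtime: validate basic type sizes, initialize
// the internal allocator, register the library (see __kmp_register_library_startup),
// initialize the global/atomic/bootstrap locks, run the platform-specific
// __kmp_runtime_initialize(), establish default team-size, blocktime, schedule and
// barrier settings, read the environment, allocate the __kmp_threads / __kmp_root
// arrays, register the initial (uber) root thread, and install the atfork / atexit /
// signal handlers as configured. Callers serialize on __kmp_initz_lock (see
// __kmp_serial_initialize() below).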
6295 static void
6296 __kmp_do_serial_initialize( void )
6297 {
6298     int i, gtid;
6299     int size;
6300 
6301     KA_TRACE( 10, ("__kmp_do_serial_initialize: enter\n" ) );
6302 
6303     KMP_DEBUG_ASSERT( sizeof( kmp_int32 ) == 4 );
6304     KMP_DEBUG_ASSERT( sizeof( kmp_uint32 ) == 4 );
6305     KMP_DEBUG_ASSERT( sizeof( kmp_int64 ) == 8 );
6306     KMP_DEBUG_ASSERT( sizeof( kmp_uint64 ) == 8 );
6307     KMP_DEBUG_ASSERT( sizeof( kmp_intptr_t ) == sizeof( void * ) );
6308 
6309 #if OMPT_SUPPORT
6310     ompt_pre_init();
6311 #endif
6312 
6313     __kmp_validate_locks();
6314 
6315     /* Initialize internal memory allocator */
6316     __kmp_init_allocator();
6317 
6318     /* Register the library startup via an environment variable
6319        and check to see whether another copy of the library is already
6320        registered. */
6321 
6322     __kmp_register_library_startup( );
6323 
6324     /* TODO reinitialization of library */
6325     if( TCR_4(__kmp_global.g.g_done) ) {
6326        KA_TRACE( 10, ("__kmp_do_serial_initialize: reinitialization of library\n" ) );
6327     }
6328 
6329     __kmp_global.g.g_abort = 0;
6330     TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6331 
6332     /* initialize the locks */
6333 #if KMP_USE_ADAPTIVE_LOCKS
6334 #if KMP_DEBUG_ADAPTIVE_LOCKS
6335     __kmp_init_speculative_stats();
6336 #endif
6337 #endif
6338 #if KMP_STATS_ENABLED
6339     __kmp_stats_init();
6340 #endif
6341     __kmp_init_lock( & __kmp_global_lock     );
6342     __kmp_init_queuing_lock( & __kmp_dispatch_lock );
6343     __kmp_init_lock( & __kmp_debug_lock      );
6344     __kmp_init_atomic_lock( & __kmp_atomic_lock     );
6345     __kmp_init_atomic_lock( & __kmp_atomic_lock_1i  );
6346     __kmp_init_atomic_lock( & __kmp_atomic_lock_2i  );
6347     __kmp_init_atomic_lock( & __kmp_atomic_lock_4i  );
6348     __kmp_init_atomic_lock( & __kmp_atomic_lock_4r  );
6349     __kmp_init_atomic_lock( & __kmp_atomic_lock_8i  );
6350     __kmp_init_atomic_lock( & __kmp_atomic_lock_8r  );
6351     __kmp_init_atomic_lock( & __kmp_atomic_lock_8c  );
6352     __kmp_init_atomic_lock( & __kmp_atomic_lock_10r );
6353     __kmp_init_atomic_lock( & __kmp_atomic_lock_16r );
6354     __kmp_init_atomic_lock( & __kmp_atomic_lock_16c );
6355     __kmp_init_atomic_lock( & __kmp_atomic_lock_20c );
6356     __kmp_init_atomic_lock( & __kmp_atomic_lock_32c );
6357     __kmp_init_bootstrap_lock( & __kmp_forkjoin_lock  );
6358     __kmp_init_bootstrap_lock( & __kmp_exit_lock      );
6359 #if KMP_USE_MONITOR
6360     __kmp_init_bootstrap_lock( & __kmp_monitor_lock   );
6361 #endif
6362     __kmp_init_bootstrap_lock( & __kmp_tp_cached_lock );
6363 
6364     /* conduct initialization and initial setup of configuration */
6365 
6366     __kmp_runtime_initialize();
6367 
6368 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
6369     __kmp_check_mic_type();
6370 #endif
6371 
6372     // Some global variable initialization moved here from kmp_env_initialize()
6373 #ifdef KMP_DEBUG
6374     kmp_diag = 0;
6375 #endif
6376     __kmp_abort_delay = 0;
6377 
6378     // From __kmp_init_dflt_team_nth()
6379     /* assume the entire machine will be used */
6380     __kmp_dflt_team_nth_ub = __kmp_xproc;
6381     if( __kmp_dflt_team_nth_ub < KMP_MIN_NTH ) {
6382         __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6383     }
6384     if( __kmp_dflt_team_nth_ub > __kmp_sys_max_nth ) {
6385         __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6386     }
6387     __kmp_max_nth = __kmp_sys_max_nth;
6388 
6389     // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME" part
6390     __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6391 #if KMP_USE_MONITOR
6392     __kmp_monitor_wakeups = KMP_WAKEUPS_FROM_BLOCKTIME( __kmp_dflt_blocktime, __kmp_monitor_wakeups );
6393     __kmp_bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME( __kmp_dflt_blocktime, __kmp_monitor_wakeups );
6394 #endif
6395     // From "KMP_LIBRARY" part of __kmp_env_initialize()
6396     __kmp_library = library_throughput;
6397     // From KMP_SCHEDULE initialization
6398     __kmp_static = kmp_sch_static_balanced;
6399     // AC: do not use analytical here, because it is non-monotonic
6400     //__kmp_guided = kmp_sch_guided_iterative_chunked;
6401     //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no need to repeat the assignment
6402     // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch bit control and barrier method
6403     // control parts
6404     #if KMP_FAST_REDUCTION_BARRIER
6405         #define kmp_reduction_barrier_gather_bb ((int)1)
6406         #define kmp_reduction_barrier_release_bb ((int)1)
6407         #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6408         #define kmp_reduction_barrier_release_pat bp_hyper_bar
6409     #endif // KMP_FAST_REDUCTION_BARRIER
6410     for ( i=bs_plain_barrier; i<bs_last_barrier; i++ ) {
6411         __kmp_barrier_gather_branch_bits [ i ] = __kmp_barrier_gather_bb_dflt;
6412         __kmp_barrier_release_branch_bits[ i ] = __kmp_barrier_release_bb_dflt;
6413         __kmp_barrier_gather_pattern [ i ] = __kmp_barrier_gather_pat_dflt;
6414         __kmp_barrier_release_pattern[ i ] = __kmp_barrier_release_pat_dflt;
6415         #if KMP_FAST_REDUCTION_BARRIER
6416         if( i == bs_reduction_barrier ) { // tested and confirmed on ALTIX only ( lin_64 ): hyper,1
6417             __kmp_barrier_gather_branch_bits [ i ] = kmp_reduction_barrier_gather_bb;
6418             __kmp_barrier_release_branch_bits[ i ] = kmp_reduction_barrier_release_bb;
6419             __kmp_barrier_gather_pattern [ i ] = kmp_reduction_barrier_gather_pat;
6420             __kmp_barrier_release_pattern[ i ] = kmp_reduction_barrier_release_pat;
6421         }
6422         #endif // KMP_FAST_REDUCTION_BARRIER
6423     }
6424     #if KMP_FAST_REDUCTION_BARRIER
6425         #undef kmp_reduction_barrier_release_pat
6426         #undef kmp_reduction_barrier_gather_pat
6427         #undef kmp_reduction_barrier_release_bb
6428         #undef kmp_reduction_barrier_gather_bb
6429     #endif // KMP_FAST_REDUCTION_BARRIER
6430 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
6431     if (__kmp_mic_type == mic2) { // KNC
6432         // AC: plain=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6433         __kmp_barrier_gather_branch_bits [ bs_plain_barrier ] = 3;  // plain gather
6434         __kmp_barrier_release_branch_bits[ bs_forkjoin_barrier ] = 1;  // forkjoin release
6435         __kmp_barrier_gather_pattern [ bs_forkjoin_barrier ] = bp_hierarchical_bar;
6436         __kmp_barrier_release_pattern[ bs_forkjoin_barrier ] = bp_hierarchical_bar;
6437     }
6438 #if KMP_FAST_REDUCTION_BARRIER
6439     if (__kmp_mic_type == mic2) { // KNC
6440         __kmp_barrier_gather_pattern [ bs_reduction_barrier ] = bp_hierarchical_bar;
6441         __kmp_barrier_release_pattern[ bs_reduction_barrier ] = bp_hierarchical_bar;
6442     }
6443 #endif
6444 #endif
6445 
6446     // From KMP_CHECKS initialization
6447 #ifdef KMP_DEBUG
6448     __kmp_env_checks = TRUE;   /* development versions have the extra checks */
6449 #else
6450     __kmp_env_checks = FALSE;  /* port versions do not have the extra checks */
6451 #endif
6452 
6453     // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6454     __kmp_foreign_tp = TRUE;
6455 
6456     __kmp_global.g.g_dynamic = FALSE;
6457     __kmp_global.g.g_dynamic_mode = dynamic_default;
6458 
6459     __kmp_env_initialize( NULL );
6460 
6461     // Print all messages in message catalog for testing purposes.
6462     #ifdef KMP_DEBUG
6463         char const * val = __kmp_env_get( "KMP_DUMP_CATALOG" );
6464         if ( __kmp_str_match_true( val ) ) {
6465             kmp_str_buf_t buffer;
6466             __kmp_str_buf_init( & buffer );
6467             __kmp_i18n_dump_catalog( & buffer );
6468             __kmp_printf( "%s", buffer.str );
6469             __kmp_str_buf_free( & buffer );
6470         }; // if
6471         __kmp_env_free( & val );
6472     #endif
6473 
6474     __kmp_threads_capacity = __kmp_initial_threads_capacity( __kmp_dflt_team_nth_ub );
6475     // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6476     __kmp_tp_capacity = __kmp_default_tp_capacity(__kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6477 
6478     // If the library is shut down properly, both pools must be NULL. Just in case, set them
6479     // to NULL -- some memory may leak, but subsequent code will work even if pools are not freed.
6480     KMP_DEBUG_ASSERT( __kmp_thread_pool == NULL );
6481     KMP_DEBUG_ASSERT( __kmp_thread_pool_insert_pt == NULL );
6482     KMP_DEBUG_ASSERT( __kmp_team_pool   == NULL );
6483     __kmp_thread_pool = NULL;
6484     __kmp_thread_pool_insert_pt = NULL;
6485     __kmp_team_pool   = NULL;
6486 
6487     /* Allocate all of the variable sized records */
6488     /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are expandable */
6489     /* Since allocation is cache-aligned, just add extra padding at the end */
6490     size = (sizeof(kmp_info_t*) + sizeof(kmp_root_t*))*__kmp_threads_capacity + CACHE_LINE;
6491     __kmp_threads = (kmp_info_t**) __kmp_allocate( size );
6492     __kmp_root    = (kmp_root_t**) ((char*)__kmp_threads + sizeof(kmp_info_t*) * __kmp_threads_capacity );
6493 
6494     /* init thread counts */
6495     KMP_DEBUG_ASSERT( __kmp_all_nth == 0 ); // Asserts fail if the library is reinitializing and
6496     KMP_DEBUG_ASSERT( __kmp_nth == 0 );     // something was wrong in termination.
6497     __kmp_all_nth = 0;
6498     __kmp_nth     = 0;
6499 
6500     /* setup the uber master thread and hierarchy */
6501     gtid = __kmp_register_root( TRUE );
6502     KA_TRACE( 10, ("__kmp_do_serial_initialize  T#%d\n", gtid ));
6503     KMP_ASSERT( KMP_UBER_GTID( gtid ) );
6504     KMP_ASSERT( KMP_INITIAL_GTID( gtid ) );
6505 
6506     KMP_MB();       /* Flush all pending memory write invalidates.  */
6507 
6508     __kmp_common_initialize();
6509 
6510     #if KMP_OS_UNIX
6511         /* invoke the child fork handler */
6512         __kmp_register_atfork();
6513     #endif
6514 
6515     #if ! defined KMP_DYNAMIC_LIB
6516         {
            /* Invoke the exit handler when the program finishes; this is needed only for
               the static library.  The dynamic library already has _fini and DllMain.
             */
6520             int rc = atexit( __kmp_internal_end_atexit );
6521             if ( rc != 0 ) {
6522                 __kmp_msg( kmp_ms_fatal, KMP_MSG( FunctionError, "atexit()" ), KMP_ERR( rc ), __kmp_msg_null );
6523             }; // if
6524         }
6525     #endif
6526 
6527     #if KMP_HANDLE_SIGNALS
6528         #if KMP_OS_UNIX
6529             /* NOTE: make sure that this is called before the user installs
6530              *          their own signal handlers so that the user handlers
6531              *          are called first.  this way they can return false,
6532              *          not call our handler, avoid terminating the library,
6533              *          and continue execution where they left off. */
6534             __kmp_install_signals( FALSE );
6535         #endif /* KMP_OS_UNIX */
6536         #if KMP_OS_WINDOWS
6537             __kmp_install_signals( TRUE );
6538         #endif /* KMP_OS_WINDOWS */
6539     #endif
6540 
6541     /* we have finished the serial initialization */
6542     __kmp_init_counter ++;
6543 
6544     __kmp_init_serial = TRUE;
6545 
6546     if (__kmp_settings) {
6547         __kmp_env_print();
6548     }
6549 
6550 #if OMP_40_ENABLED
6551     if (__kmp_display_env || __kmp_display_env_verbose) {
6552         __kmp_env_print_2();
6553     }
6554 #endif // OMP_40_ENABLED
6555 
6556 #if OMPT_SUPPORT
6557     ompt_post_init();
6558 #endif
6559 
6560     KMP_MB();
6561 
6562     KA_TRACE( 10, ("__kmp_do_serial_initialize: exit\n" ) );
6563 }
6564 
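/* The three public initializers below (__kmp_serial_initialize,
   __kmp_middle_initialize, __kmp_parallel_initialize) share the same
   double-checked locking idiom around __kmp_initz_lock: test the init flag,
   acquire the bootstrap lock, re-test the flag, then do the real work once.
   A minimal sketch of the idiom (illustrative only, not compiled; "flag",
   "lock" and "do_the_initialization" are placeholder names):

       if ( flag ) return;                          // fast path, no lock taken
       __kmp_acquire_bootstrap_lock( &lock );
       if ( flag ) {                                // re-check under the lock
           __kmp_release_bootstrap_lock( &lock );
           return;
       }
       do_the_initialization();                     // runs at most once
       __kmp_release_bootstrap_lock( &lock );
*/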
6565 void
6566 __kmp_serial_initialize( void )
6567 {
6568     if ( __kmp_init_serial ) {
6569         return;
6570     }
6571     __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
6572     if ( __kmp_init_serial ) {
6573         __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6574         return;
6575     }
6576     __kmp_do_serial_initialize();
6577     __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6578 }
6579 
6580 static void
6581 __kmp_do_middle_initialize( void )
6582 {
6583     int i, j;
6584     int prev_dflt_team_nth;
6585 
6586     if( !__kmp_init_serial ) {
6587         __kmp_do_serial_initialize();
6588     }
6589 
6590     KA_TRACE( 10, ("__kmp_middle_initialize: enter\n" ) );
6591 
6592     //
6593     // Save the previous value for the __kmp_dflt_team_nth so that
6594     // we can avoid some reinitialization if it hasn't changed.
6595     //
6596     prev_dflt_team_nth = __kmp_dflt_team_nth;
6597 
6598 #if KMP_AFFINITY_SUPPORTED
6599     //
6600     // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6601     // number of cores on the machine.
6602     //
6603     __kmp_affinity_initialize();
6604 
6605     //
6606     // Run through the __kmp_threads array and set the affinity mask
6607     // for each root thread that is currently registered with the RTL.
6608     //
6609     for ( i = 0; i < __kmp_threads_capacity; i++ ) {
6610         if ( TCR_PTR( __kmp_threads[ i ] ) != NULL ) {
6611             __kmp_affinity_set_init_mask( i, TRUE );
6612         }
6613     }
6614 #endif /* KMP_AFFINITY_SUPPORTED */
6615 
6616     KMP_ASSERT( __kmp_xproc > 0 );
6617     if ( __kmp_avail_proc == 0 ) {
6618         __kmp_avail_proc = __kmp_xproc;
6619     }
6620 
    // If there were empty places in the num_threads list (OMP_NUM_THREADS=,,2,3), correct them now
6622     j = 0;
6623     while ( ( j < __kmp_nested_nth.used ) && ! __kmp_nested_nth.nth[ j ] ) {
6624         __kmp_nested_nth.nth[ j ] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub = __kmp_avail_proc;
6625         j++;
6626     }
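    // Example (illustrative values): with OMP_NUM_THREADS=",,2,3" and
    // __kmp_avail_proc == 8, the two leading empty slots become 8, so the
    // effective list is 8,8,2,3 and __kmp_dflt_team_nth /
    // __kmp_dflt_team_nth_ub are set to 8 as well.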
6627 
6628     if ( __kmp_dflt_team_nth == 0 ) {
6629 #ifdef KMP_DFLT_NTH_CORES
6630         //
6631         // Default #threads = #cores
6632         //
6633         __kmp_dflt_team_nth = __kmp_ncores;
6634         KA_TRACE( 20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = __kmp_ncores (%d)\n",
6635           __kmp_dflt_team_nth ) );
6636 #else
6637         //
6638         // Default #threads = #available OS procs
6639         //
6640         __kmp_dflt_team_nth = __kmp_avail_proc;
6641         KA_TRACE( 20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = __kmp_avail_proc(%d)\n",
6642           __kmp_dflt_team_nth ) );
6643 #endif /* KMP_DFLT_NTH_CORES */
6644     }
6645 
6646     if ( __kmp_dflt_team_nth < KMP_MIN_NTH ) {
6647         __kmp_dflt_team_nth = KMP_MIN_NTH;
6648     }
6649     if( __kmp_dflt_team_nth > __kmp_sys_max_nth ) {
6650         __kmp_dflt_team_nth = __kmp_sys_max_nth;
6651     }
6652 
6653     //
6654     // There's no harm in continuing if the following check fails,
6655     // but it indicates an error in the previous logic.
6656     //
6657     KMP_DEBUG_ASSERT( __kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub );
6658 
6659     if ( __kmp_dflt_team_nth != prev_dflt_team_nth ) {
6660         //
6661         // Run through the __kmp_threads array and set the num threads icv
6662         // for each root thread that is currently registered with the RTL
6663         // (which has not already explicitly set its nthreads-var with a
6664         // call to omp_set_num_threads()).
6665         //
6666         for ( i = 0; i < __kmp_threads_capacity; i++ ) {
6667             kmp_info_t *thread = __kmp_threads[ i ];
6668             if ( thread == NULL ) continue;
6669             if ( thread->th.th_current_task->td_icvs.nproc != 0 ) continue;
6670 
6671             set__nproc( __kmp_threads[ i ], __kmp_dflt_team_nth );
6672         }
6673     }
6674     KA_TRACE( 20, ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
6675       __kmp_dflt_team_nth) );
6676 
6677 #ifdef KMP_ADJUST_BLOCKTIME
6678     /* Adjust blocktime to zero if necessary */
6679     /* now that __kmp_avail_proc is set      */
6680     if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
6681         KMP_DEBUG_ASSERT( __kmp_avail_proc > 0 );
6682         if ( __kmp_nth > __kmp_avail_proc ) {
6683             __kmp_zero_bt = TRUE;
6684         }
6685     }
6686 #endif /* KMP_ADJUST_BLOCKTIME */
6687 
6688     /* we have finished middle initialization */
6689     TCW_SYNC_4(__kmp_init_middle, TRUE);
6690 
6691     KA_TRACE( 10, ("__kmp_do_middle_initialize: exit\n" ) );
6692 }
6693 
6694 void
6695 __kmp_middle_initialize( void )
6696 {
6697     if ( __kmp_init_middle ) {
6698         return;
6699     }
6700     __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
6701     if ( __kmp_init_middle ) {
6702         __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6703         return;
6704     }
6705     __kmp_do_middle_initialize();
6706     __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6707 }
6708 
6709 void
6710 __kmp_parallel_initialize( void )
6711 {
6712     int gtid = __kmp_entry_gtid();      // this might be a new root
6713 
6714     /* synchronize parallel initialization (for sibling) */
6715     if( TCR_4(__kmp_init_parallel) ) return;
6716     __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
6717     if( TCR_4(__kmp_init_parallel) ) { __kmp_release_bootstrap_lock( &__kmp_initz_lock ); return; }
6718 
6719     /* TODO reinitialization after we have already shut down */
6720     if( TCR_4(__kmp_global.g.g_done) ) {
6721         KA_TRACE( 10, ("__kmp_parallel_initialize: attempt to init while shutting down\n" ) );
6722         __kmp_infinite_loop();
6723     }
6724 
    /* jc: The lock __kmp_initz_lock is already held, so calling the public
           __kmp_serial_initialize / __kmp_middle_initialize wrappers would
           deadlock on it.  We therefore call __kmp_do_middle_initialize
           directly (which in turn calls __kmp_do_serial_initialize if needed).
    */
6728     if( !__kmp_init_middle ) {
6729         __kmp_do_middle_initialize();
6730     }
6731 
6732     /* begin initialization */
6733     KA_TRACE( 10, ("__kmp_parallel_initialize: enter\n" ) );
6734     KMP_ASSERT( KMP_UBER_GTID( gtid ) );
6735 
6736 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6737     //
6738     // Save the FP control regs.
6739     // Worker threads will set theirs to these values at thread startup.
6740     //
6741     __kmp_store_x87_fpu_control_word( &__kmp_init_x87_fpu_control_word );
6742     __kmp_store_mxcsr( &__kmp_init_mxcsr );
6743     __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
6744 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
6745 
6746 #if KMP_OS_UNIX
6747 # if KMP_HANDLE_SIGNALS
6748     /*  must be after __kmp_serial_initialize  */
6749     __kmp_install_signals( TRUE );
6750 # endif
6751 #endif
6752 
6753     __kmp_suspend_initialize();
6754 
6755 #if defined(USE_LOAD_BALANCE)
6756     if ( __kmp_global.g.g_dynamic_mode == dynamic_default ) {
6757         __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
6758     }
6759 #else
6760     if ( __kmp_global.g.g_dynamic_mode == dynamic_default ) {
6761         __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
6762     }
6763 #endif
6764 
6765     if ( __kmp_version ) {
6766         __kmp_print_version_2();
6767     }
6768 
6769     /* we have finished parallel initialization */
6770     TCW_SYNC_4(__kmp_init_parallel, TRUE);
6771 
6772     KMP_MB();
6773     KA_TRACE( 10, ("__kmp_parallel_initialize: exit\n" ) );
6774 
6775     __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6776 }
6777 
6778 
6779 /* ------------------------------------------------------------------------ */
6780 
6781 void
6782 __kmp_run_before_invoked_task( int gtid, int tid, kmp_info_t *this_thr,
6783   kmp_team_t *team )
6784 {
6785     kmp_disp_t *dispatch;
6786 
6787     KMP_MB();
6788 
6789     /* none of the threads have encountered any constructs, yet. */
6790     this_thr->th.th_local.this_construct = 0;
6791 #if KMP_CACHE_MANAGE
6792     KMP_CACHE_PREFETCH( &this_thr->th.th_bar[ bs_forkjoin_barrier ].bb.b_arrived );
6793 #endif /* KMP_CACHE_MANAGE */
6794     dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
6795     KMP_DEBUG_ASSERT( dispatch );
6796     KMP_DEBUG_ASSERT( team->t.t_dispatch );
6797     //KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[ this_thr->th.th_info.ds.ds_tid ] );
6798 
6799     dispatch->th_disp_index = 0;    /* reset the dispatch buffer counter */
6800 #if OMP_45_ENABLED
6801     dispatch->th_doacross_buf_idx = 0; /* reset the doacross dispatch buffer counter */
6802 #endif
6803     if( __kmp_env_consistency_check )
6804         __kmp_push_parallel( gtid, team->t.t_ident );
6805 
6806     KMP_MB();       /* Flush all pending memory write invalidates.  */
6807 }
6808 
6809 void
6810 __kmp_run_after_invoked_task( int gtid, int tid, kmp_info_t *this_thr,
6811   kmp_team_t *team )
6812 {
6813     if( __kmp_env_consistency_check )
6814         __kmp_pop_parallel( gtid, team->t.t_ident );
6815 
6816     __kmp_finish_implicit_task(this_thr);
6817 }
6818 
6819 int
6820 __kmp_invoke_task_func( int gtid )
6821 {
6822     int          rc;
6823     int          tid      = __kmp_tid_from_gtid( gtid );
6824     kmp_info_t  *this_thr = __kmp_threads[ gtid ];
6825     kmp_team_t  *team     = this_thr->th.th_team;
6826 
6827     __kmp_run_before_invoked_task( gtid, tid, this_thr, team );
6828 #if USE_ITT_BUILD
6829     if ( __itt_stack_caller_create_ptr ) {
6830         __kmp_itt_stack_callee_enter( (__itt_caller)team->t.t_stack_id ); // inform ittnotify about entering user's code
6831     }
6832 #endif /* USE_ITT_BUILD */
6833 #if INCLUDE_SSC_MARKS
6834     SSC_MARK_INVOKING();
6835 #endif
6836 
6837 #if OMPT_SUPPORT
6838     void *dummy;
6839     void **exit_runtime_p;
6840     ompt_task_id_t my_task_id;
6841     ompt_parallel_id_t my_parallel_id;
6842 
6843     if (ompt_enabled) {
6844         exit_runtime_p = &(team->t.t_implicit_task_taskdata[tid].
6845             ompt_task_info.frame.exit_runtime_frame);
6846     } else {
6847         exit_runtime_p = &dummy;
6848     }
6849 
6850 #if OMPT_TRACE
6851     my_task_id = team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id;
6852     my_parallel_id = team->t.ompt_team_info.parallel_id;
6853     if (ompt_enabled &&
6854         ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
6855         ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
6856             my_parallel_id, my_task_id);
6857     }
6858 #endif
6859 #endif
6860 
6861     {
6862         KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
6863         KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
6864         rc = __kmp_invoke_microtask( (microtask_t) TCR_SYNC_PTR(team->t.t_pkfn),
6865                                      gtid, tid, (int) team->t.t_argc, (void **) team->t.t_argv
6866 #if OMPT_SUPPORT
6867                                      , exit_runtime_p
6868 #endif
6869                                      );
6870 #if OMPT_SUPPORT
6871         *exit_runtime_p = NULL;
6872 #endif
6873     }
6874 
6875 #if USE_ITT_BUILD
6876     if ( __itt_stack_caller_create_ptr ) {
6877         __kmp_itt_stack_callee_leave( (__itt_caller)team->t.t_stack_id ); // inform ittnotify about leaving user's code
6878     }
6879 #endif /* USE_ITT_BUILD */
6880     __kmp_run_after_invoked_task( gtid, tid, this_thr, team );
6881 
6882     return rc;
6883 }
6884 
6885 #if OMP_40_ENABLED
6886 void
6887 __kmp_teams_master( int gtid )
6888 {
6889     // This routine is called by all master threads in teams construct
6890     kmp_info_t *thr = __kmp_threads[ gtid ];
6891     kmp_team_t *team = thr->th.th_team;
6892     ident_t     *loc =  team->t.t_ident;
6893     thr->th.th_set_nproc = thr->th.th_teams_size.nth;
6894     KMP_DEBUG_ASSERT( thr->th.th_teams_microtask );
6895     KMP_DEBUG_ASSERT( thr->th.th_set_nproc );
6896     KA_TRACE( 20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n",
6897                    gtid, __kmp_tid_from_gtid( gtid ), thr->th.th_teams_microtask ) );
    // Launch the league of teams now, but do not let the workers execute
    // (they hang on the fork barrier until the next parallel region)
6900 #if INCLUDE_SSC_MARKS
6901     SSC_MARK_FORKING();
6902 #endif
6903     __kmp_fork_call( loc, gtid, fork_context_intel,
6904             team->t.t_argc,
6905 #if OMPT_SUPPORT
6906             (void *)thr->th.th_teams_microtask,      // "unwrapped" task
6907 #endif
6908             (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
6909             VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
6910             NULL );
6911 #if INCLUDE_SSC_MARKS
6912     SSC_MARK_JOINING();
6913 #endif
6914 
6915     // AC: last parameter "1" eliminates join barrier which won't work because
6916     // worker threads are in a fork barrier waiting for more parallel regions
6917     __kmp_join_call( loc, gtid
6918 #if OMPT_SUPPORT
6919         , fork_context_intel
6920 #endif
6921         , 1 );
6922 }
6923 
6924 int
6925 __kmp_invoke_teams_master( int gtid )
6926 {
6927     kmp_info_t  *this_thr = __kmp_threads[ gtid ];
6928     kmp_team_t  *team     = this_thr->th.th_team;
6929     #if KMP_DEBUG
    if ( !__kmp_threads[gtid]->th.th_team->t.t_serialized )
        KMP_DEBUG_ASSERT( (void*)__kmp_threads[gtid]->th.th_team->t.t_pkfn == (void*)__kmp_teams_master );
6932     #endif
6933     __kmp_run_before_invoked_task( gtid, 0, this_thr, team );
6934     __kmp_teams_master( gtid );
6935     __kmp_run_after_invoked_task( gtid, 0, this_thr, team );
6936     return 1;
6937 }
6938 #endif /* OMP_40_ENABLED */
6939 
/* this sets the requested number of threads for the next parallel region
 * encountered by this team */
/* since this should be enclosed in the forkjoin critical section, it
 * should avoid race conditions with asymmetrical nested parallelism */
6944 
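/* Illustration of the expected usage (assumed typical compiler codegen, not part
   of this file): for a directive such as

       #pragma omp parallel num_threads(4)

   the compiler is expected to emit a call along the lines of

       __kmpc_push_num_threads( &loc, gtid, 4 );

   before the fork, and that entry point forwards here so the request is
   recorded in th_set_nproc for the next parallel region. */
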
6945 void
6946 __kmp_push_num_threads( ident_t *id, int gtid, int num_threads )
6947 {
6948     kmp_info_t *thr = __kmp_threads[gtid];
6949 
6950     if( num_threads > 0 )
6951         thr->th.th_set_nproc = num_threads;
6952 }
6953 
6954 #if OMP_40_ENABLED
6955 
6956 /* this sets the requested number of teams for the teams region and/or
6957  * the number of threads for the next parallel region encountered  */
6958 void
6959 __kmp_push_num_teams( ident_t *id, int gtid, int num_teams, int num_threads )
6960 {
6961     kmp_info_t *thr = __kmp_threads[gtid];
6962     KMP_DEBUG_ASSERT(num_teams >= 0);
6963     KMP_DEBUG_ASSERT(num_threads >= 0);
6964 
6965     if( num_teams == 0 )
6966         num_teams = 1;    // default number of teams is 1.
6967     if( num_teams > __kmp_max_nth ) { // if too many teams requested?
6968         if ( !__kmp_reserve_warn ) {
6969             __kmp_reserve_warn = 1;
6970             __kmp_msg(
6971                 kmp_ms_warning,
6972                 KMP_MSG( CantFormThrTeam, num_teams, __kmp_max_nth ),
6973                 KMP_HNT( Unset_ALL_THREADS ),
6974                 __kmp_msg_null
6975             );
6976         }
6977         num_teams = __kmp_max_nth;
6978     }
6979     // Set number of teams (number of threads in the outer "parallel" of the teams)
6980     thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
6981 
6982     // Remember the number of threads for inner parallel regions
6983     if( num_threads == 0 ) {
6984         if( !TCR_4(__kmp_init_middle) )
6985             __kmp_middle_initialize();  // get __kmp_avail_proc calculated
6986         num_threads = __kmp_avail_proc / num_teams;
6987         if( num_teams * num_threads > __kmp_max_nth ) {
6988             // adjust num_threads w/o warning as it is not user setting
6989             num_threads = __kmp_max_nth / num_teams;
6990         }
6991     } else {
6992         if( num_teams * num_threads > __kmp_max_nth ) {
6993             int new_threads = __kmp_max_nth / num_teams;
6994             if ( !__kmp_reserve_warn ) { // user asked for too many threads
6995                 __kmp_reserve_warn = 1;  // that conflicts with OMP_THREAD_LIMIT
6996                 __kmp_msg(
6997                     kmp_ms_warning,
6998                     KMP_MSG( CantFormThrTeam, num_threads, new_threads ),
6999                     KMP_HNT( Unset_ALL_THREADS ),
7000                     __kmp_msg_null
7001                 );
7002             }
7003             num_threads = new_threads;
7004         }
7005     }
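    // Example (illustrative values): with __kmp_avail_proc == 16 and
    // num_teams == 4, an unspecified num_threads defaults to 16 / 4 == 4;
    // both branches above also clamp so that num_teams * num_threads does not
    // exceed __kmp_max_nth.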
7006     thr->th.th_teams_size.nth = num_threads;
7007 }
7008 
7009 
7010 //
7011 // Set the proc_bind var to use in the following parallel region.
7012 //
7013 void
7014 __kmp_push_proc_bind( ident_t *id, int gtid, kmp_proc_bind_t proc_bind )
7015 {
7016     kmp_info_t *thr = __kmp_threads[gtid];
7017     thr->th.th_set_proc_bind = proc_bind;
7018 }
7019 
7020 #endif /* OMP_40_ENABLED */
7021 
7022 /* Launch the worker threads into the microtask. */
7023 
7024 void
7025 __kmp_internal_fork( ident_t *id, int gtid, kmp_team_t *team )
7026 {
7027     kmp_info_t *this_thr = __kmp_threads[gtid];
7028 
7029 #ifdef KMP_DEBUG
7030     int f;
7031 #endif /* KMP_DEBUG */
7032 
7033     KMP_DEBUG_ASSERT( team );
7034     KMP_DEBUG_ASSERT( this_thr->th.th_team  ==  team );
7035     KMP_ASSERT(       KMP_MASTER_GTID(gtid) );
7036     KMP_MB();       /* Flush all pending memory write invalidates.  */
7037 
7038     team->t.t_construct = 0;          /* no single directives seen yet */
7039     team->t.t_ordered.dt.t_value = 0; /* thread 0 enters the ordered section first */
7040 
7041     /* Reset the identifiers on the dispatch buffer */
7042     KMP_DEBUG_ASSERT( team->t.t_disp_buffer );
7043     if ( team->t.t_max_nproc > 1 ) {
7044         int i;
7045         for (i = 0; i <  __kmp_dispatch_num_buffers; ++i) {
7046             team->t.t_disp_buffer[ i ].buffer_index = i;
7047 #if OMP_45_ENABLED
7048             team->t.t_disp_buffer[i].doacross_buf_idx = i;
7049 #endif
7050         }
7051     } else {
7052         team->t.t_disp_buffer[ 0 ].buffer_index = 0;
7053 #if OMP_45_ENABLED
7054         team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7055 #endif
7056     }
7057 
7058     KMP_MB();       /* Flush all pending memory write invalidates.  */
7059     KMP_ASSERT( this_thr->th.th_team  ==  team );
7060 
7061 #ifdef KMP_DEBUG
7062     for( f=0 ; f<team->t.t_nproc ; f++ ) {
7063         KMP_DEBUG_ASSERT( team->t.t_threads[f] &&
7064                           team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc );
7065     }
7066 #endif /* KMP_DEBUG */
7067 
7068     /* release the worker threads so they may begin working */
7069     __kmp_fork_barrier( gtid, 0 );
7070 }
7071 
7072 
7073 void
7074 __kmp_internal_join( ident_t *id, int gtid, kmp_team_t *team )
7075 {
7076     kmp_info_t *this_thr = __kmp_threads[gtid];
7077 
7078     KMP_DEBUG_ASSERT( team );
7079     KMP_DEBUG_ASSERT( this_thr->th.th_team  ==  team );
7080     KMP_ASSERT(       KMP_MASTER_GTID(gtid) );
7081     KMP_MB();       /* Flush all pending memory write invalidates.  */
7082 
7083     /* Join barrier after fork */
7084 
7085 #ifdef KMP_DEBUG
7086     if (__kmp_threads[gtid] && __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc ) {
7087         __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n",gtid, gtid, __kmp_threads[gtid]);
7088         __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, team->t.t_nproc=%d\n",
7089                      gtid, __kmp_threads[gtid]->th.th_team_nproc, team, team->t.t_nproc);
7090         __kmp_print_structure();
7091     }
7092     KMP_DEBUG_ASSERT( __kmp_threads[gtid] &&
7093                      __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc );
7094 #endif /* KMP_DEBUG */
7095 
7096     __kmp_join_barrier( gtid );  /* wait for everyone */
7097 
7098     KMP_MB();       /* Flush all pending memory write invalidates.  */
7099     KMP_ASSERT( this_thr->th.th_team  ==  team );
7100 }
7101 
7102 
7103 /* ------------------------------------------------------------------------ */
7104 /* ------------------------------------------------------------------------ */
7105 
7106 #ifdef USE_LOAD_BALANCE
7107 
7108 //
// Return the number of worker threads actively spinning in the hot team, if
// we are at the outermost level of parallelism.  Otherwise, return 0.
7111 //
7112 static int
7113 __kmp_active_hot_team_nproc( kmp_root_t *root )
7114 {
7115     int i;
7116     int retval;
7117     kmp_team_t *hot_team;
7118 
7119     if ( root->r.r_active ) {
7120         return 0;
7121     }
7122     hot_team = root->r.r_hot_team;
7123     if ( __kmp_dflt_blocktime == KMP_MAX_BLOCKTIME ) {
7124         return hot_team->t.t_nproc - 1;  // Don't count master thread
7125     }
7126 
7127     //
7128     // Skip the master thread - it is accounted for elsewhere.
7129     //
7130     retval = 0;
7131     for ( i = 1; i < hot_team->t.t_nproc; i++ ) {
7132         if ( hot_team->t.t_threads[i]->th.th_active ) {
7133             retval++;
7134         }
7135     }
7136     return retval;
7137 }
7138 
7139 //
7140 // Perform an automatic adjustment to the number of
7141 // threads used by the next parallel region.
7142 //
7143 static int
7144 __kmp_load_balance_nproc( kmp_root_t *root, int set_nproc )
7145 {
7146     int retval;
7147     int pool_active;
7148     int hot_team_active;
7149     int team_curr_active;
7150     int system_active;
7151 
7152     KB_TRACE( 20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n",
7153                 root, set_nproc ) );
7154     KMP_DEBUG_ASSERT( root );
7155     KMP_DEBUG_ASSERT( root->r.r_root_team->t.t_threads[0]->th.th_current_task->td_icvs.dynamic == TRUE );
7156     KMP_DEBUG_ASSERT( set_nproc > 1 );
7157 
7158     if ( set_nproc == 1) {
7159         KB_TRACE( 20, ("__kmp_load_balance_nproc: serial execution.\n" ) );
7160         return 1;
7161     }
7162 
    //
    // Threads that are active in the thread pool, threads active in the hot
    // team for this particular root (if we are at the outermost parallel
    // level), and the currently executing thread (which will become the
    // master) are all available to add to the new team, but they are currently
    // contributing to the system load and must be accounted for.
    //
7170     pool_active = TCR_4(__kmp_thread_pool_active_nth);
7171     hot_team_active = __kmp_active_hot_team_nproc( root );
7172     team_curr_active = pool_active + hot_team_active + 1;
7173 
7174     //
7175     // Check the system load.
7176     //
7177     system_active = __kmp_get_load_balance( __kmp_avail_proc + team_curr_active );
7178     KB_TRACE( 30, ("__kmp_load_balance_nproc: system active = %d pool active = %d hot team active = %d\n",
7179       system_active, pool_active, hot_team_active ) );
7180 
7181     if ( system_active < 0 ) {
7182         //
7183         // There was an error reading the necessary info from /proc,
7184         // so use the thread limit algorithm instead.  Once we set
7185         // __kmp_global.g.g_dynamic_mode = dynamic_thread_limit,
7186         // we shouldn't wind up getting back here.
7187         //
7188         __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7189         KMP_WARNING( CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit" );
7190 
7191         //
7192         // Make this call behave like the thread limit algorithm.
7193         //
7194         retval = __kmp_avail_proc - __kmp_nth + (root->r.r_active ? 1
7195           : root->r.r_hot_team->t.t_nproc);
7196         if ( retval > set_nproc ) {
7197             retval = set_nproc;
7198         }
7199         if ( retval < KMP_MIN_NTH ) {
7200             retval = KMP_MIN_NTH;
7201         }
7202 
7203         KB_TRACE( 20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n", retval ) );
7204         return retval;
7205     }
7206 
7207     //
7208     // There is a slight delay in the load balance algorithm in detecting
7209     // new running procs.  The real system load at this instant should be
    // at least as large as the number of active OpenMP threads that are available to
7211     // add to the team.
7212     //
7213     if ( system_active < team_curr_active ) {
7214         system_active = team_curr_active;
7215     }
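    // Worked example (illustrative values): with __kmp_avail_proc == 16,
    // system_active == 10, and team_curr_active == 4 (pool + hot team + master),
    // retval = 16 - 10 + 4 == 10 threads may be used without oversubscription,
    // before clamping to the [KMP_MIN_NTH, set_nproc] range below.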
7216     retval = __kmp_avail_proc - system_active + team_curr_active;
7217     if ( retval > set_nproc ) {
7218         retval = set_nproc;
7219     }
7220     if ( retval < KMP_MIN_NTH ) {
7221         retval = KMP_MIN_NTH;
7222     }
7223 
7224     KB_TRACE( 20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval ) );
7225     return retval;
7226 } // __kmp_load_balance_nproc()
7227 
7228 #endif /* USE_LOAD_BALANCE */
7229 
7230 /* ------------------------------------------------------------------------ */
7231 /* ------------------------------------------------------------------------ */
7232 
7233 /* NOTE: this is called with the __kmp_init_lock held */
7234 void
7235 __kmp_cleanup( void )
7236 {
7237     int f;
7238 
7239     KA_TRACE( 10, ("__kmp_cleanup: enter\n" ) );
7240 
7241     if (TCR_4(__kmp_init_parallel)) {
7242 #if KMP_HANDLE_SIGNALS
7243         __kmp_remove_signals();
7244 #endif
7245         TCW_4(__kmp_init_parallel, FALSE);
7246     }
7247 
7248     if (TCR_4(__kmp_init_middle)) {
7249 #if KMP_AFFINITY_SUPPORTED
7250         __kmp_affinity_uninitialize();
7251 #endif /* KMP_AFFINITY_SUPPORTED */
7252         __kmp_cleanup_hierarchy();
7253         TCW_4(__kmp_init_middle, FALSE);
7254     }
7255 
7256     KA_TRACE( 10, ("__kmp_cleanup: go serial cleanup\n" ) );
7257 
7258     if (__kmp_init_serial) {
7259         __kmp_runtime_destroy();
7260         __kmp_init_serial = FALSE;
7261     }
7262 
7263     for ( f = 0; f < __kmp_threads_capacity; f++ ) {
7264         if ( __kmp_root[ f ] != NULL ) {
7265             __kmp_free( __kmp_root[ f ] );
7266             __kmp_root[ f ] = NULL;
7267         }
7268     }
7269     __kmp_free( __kmp_threads );
    // __kmp_threads and __kmp_root were allocated at once, as a single block, so
    // there is no need to free __kmp_root separately.
7272     __kmp_threads = NULL;
7273     __kmp_root    = NULL;
7274     __kmp_threads_capacity = 0;
7275 
7276 #if KMP_USE_DYNAMIC_LOCK
7277     __kmp_cleanup_indirect_user_locks();
7278 #else
7279     __kmp_cleanup_user_locks();
7280 #endif
7281 
7282     #if KMP_AFFINITY_SUPPORTED
7283         KMP_INTERNAL_FREE( (void *) __kmp_cpuinfo_file );
7284         __kmp_cpuinfo_file = NULL;
7285     #endif /* KMP_AFFINITY_SUPPORTED */
7286 
    #if KMP_USE_ADAPTIVE_LOCKS
    #if KMP_DEBUG_ADAPTIVE_LOCKS
        __kmp_print_speculative_stats();
    #endif
    #endif
7292     KMP_INTERNAL_FREE( __kmp_nested_nth.nth );
7293     __kmp_nested_nth.nth = NULL;
7294     __kmp_nested_nth.size = 0;
7295     __kmp_nested_nth.used = 0;
7296 
7297     __kmp_i18n_catclose();
7298 
7299 #if KMP_STATS_ENABLED
7300     __kmp_stats_fini();
7301 #endif
7302 
7303     KA_TRACE( 10, ("__kmp_cleanup: exit\n" ) );
7304 }
7305 
7306 /* ------------------------------------------------------------------------ */
7307 /* ------------------------------------------------------------------------ */
7308 
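/* KMP_IGNORE_MPPBEG / KMP_IGNORE_MPPEND control whether __kmpc_begin() and
   __kmpc_end() do real work.  For example, running an application with

       KMP_IGNORE_MPPEND=false

   makes __kmp_ignore_mppend() return FALSE, so __kmpc_end() is no longer
   treated as a no-op (see the callers in kmp_csupport.c). */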
7309 int
7310 __kmp_ignore_mppbeg( void )
7311 {
7312     char *env;
7313 
7314     if ((env = getenv( "KMP_IGNORE_MPPBEG" )) != NULL) {
7315         if (__kmp_str_match_false( env ))
7316             return FALSE;
7317     }
7318     // By default __kmpc_begin() is no-op.
7319     return TRUE;
7320 }
7321 
7322 int
7323 __kmp_ignore_mppend( void )
7324 {
7325     char *env;
7326 
7327     if ((env = getenv( "KMP_IGNORE_MPPEND" )) != NULL) {
7328         if (__kmp_str_match_false( env ))
7329             return FALSE;
7330     }
7331     // By default __kmpc_end() is no-op.
7332     return TRUE;
7333 }
7334 
7335 void
7336 __kmp_internal_begin( void )
7337 {
7338     int gtid;
7339     kmp_root_t *root;
7340 
7341     /* this is a very important step as it will register new sibling threads
7342      * and assign these new uber threads a new gtid */
7343     gtid = __kmp_entry_gtid();
7344     root = __kmp_threads[ gtid ]->th.th_root;
7345     KMP_ASSERT( KMP_UBER_GTID( gtid ));
7346 
7347     if( root->r.r_begin ) return;
7348     __kmp_acquire_lock( &root->r.r_begin_lock, gtid );
7349     if( root->r.r_begin ) {
7350         __kmp_release_lock( & root->r.r_begin_lock, gtid );
7351         return;
7352     }
7353 
7354     root->r.r_begin = TRUE;
7355 
7356     __kmp_release_lock( & root->r.r_begin_lock, gtid );
7357 }
7358 
7359 
7360 /* ------------------------------------------------------------------------ */
7361 /* ------------------------------------------------------------------------ */
7362 
7363 void
7364 __kmp_user_set_library (enum library_type arg)
7365 {
7366     int gtid;
7367     kmp_root_t *root;
7368     kmp_info_t *thread;
7369 
7370     /* first, make sure we are initialized so we can get our gtid */
7371 
7372     gtid = __kmp_entry_gtid();
7373     thread = __kmp_threads[ gtid ];
7374 
7375     root = thread->th.th_root;
7376 
7377     KA_TRACE( 20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg, library_serial ));
7378     if (root->r.r_in_parallel) { /* Must be called in serial section of top-level thread */
7379         KMP_WARNING( SetLibraryIncorrectCall );
7380         return;
7381     }
7382 
7383     switch ( arg ) {
7384     case library_serial :
7385         thread->th.th_set_nproc = 0;
7386         set__nproc( thread, 1 );
7387         break;
7388     case library_turnaround :
7389         thread->th.th_set_nproc = 0;
7390         set__nproc( thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth : __kmp_dflt_team_nth_ub );
7391         break;
7392     case library_throughput :
7393         thread->th.th_set_nproc = 0;
7394         set__nproc( thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth : __kmp_dflt_team_nth_ub );
7395         break;
7396     default:
7397         KMP_FATAL( UnknownLibraryType, arg );
7398     }
7399 
7400     __kmp_aux_set_library ( arg );
7401 }
7402 
7403 void
7404 __kmp_aux_set_stacksize( size_t arg )
7405 {
7406     if (! __kmp_init_serial)
7407         __kmp_serial_initialize();
7408 
7409 #if KMP_OS_DARWIN
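    /* Round the requested size up to a 0x1000-byte (4 KiB) boundary, guarding
       against wrap-around for requests near the top of the size_t range. */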
7410     if (arg & (0x1000 - 1)) {
7411         arg &= ~(0x1000 - 1);
        if (arg + 0x1000) /* make sure rounding up does not wrap around to zero */
7413             arg += 0x1000;
7414     }
7415 #endif
7416     __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
7417 
7418     /* only change the default stacksize before the first parallel region */
7419     if (! TCR_4(__kmp_init_parallel)) {
7420         size_t value = arg;       /* argument is in bytes */
7421 
7422         if (value < __kmp_sys_min_stksize )
7423             value = __kmp_sys_min_stksize ;
7424         else if (value > KMP_MAX_STKSIZE)
7425             value = KMP_MAX_STKSIZE;
7426 
7427         __kmp_stksize = value;
7428 
7429         __kmp_env_stksize = TRUE;    /* was KMP_STACKSIZE specified? */
7430     }
7431 
7432     __kmp_release_bootstrap_lock( &__kmp_initz_lock );
7433 }
7434 
7435 /* set the behaviour of the runtime library */
7436 /* TODO this can cause some odd behaviour with sibling parallelism... */
7437 void
7438 __kmp_aux_set_library (enum library_type arg)
7439 {
7440     __kmp_library = arg;
7441 
7442     switch ( __kmp_library ) {
7443     case library_serial :
7444         {
7445             KMP_INFORM( LibraryIsSerial );
7446             (void) __kmp_change_library( TRUE );
7447         }
7448         break;
7449     case library_turnaround :
7450         (void) __kmp_change_library( TRUE );
7451         break;
7452     case library_throughput :
7453         (void) __kmp_change_library( FALSE );
7454         break;
7455     default:
7456         KMP_FATAL( UnknownLibraryType, arg );
7457     }
7458 }
7459 
7460 /* ------------------------------------------------------------------------ */
7461 /* ------------------------------------------------------------------------ */
7462 
7463 void
7464 __kmp_aux_set_blocktime (int arg, kmp_info_t *thread, int tid)
7465 {
7466     int blocktime = arg;        /* argument is in milliseconds */
7467 #if KMP_USE_MONITOR
7468     int bt_intervals;
7469 #endif
7470     int bt_set;
7471 
7472     __kmp_save_internal_controls( thread );
7473 
7474     /* Normalize and set blocktime for the teams */
7475     if (blocktime < KMP_MIN_BLOCKTIME)
7476         blocktime = KMP_MIN_BLOCKTIME;
7477     else if (blocktime > KMP_MAX_BLOCKTIME)
7478         blocktime = KMP_MAX_BLOCKTIME;
7479 
7480     set__blocktime_team( thread->th.th_team, tid, blocktime );
7481     set__blocktime_team( thread->th.th_serial_team, 0, blocktime );
7482 
7483 #if KMP_USE_MONITOR
7484     /* Calculate and set blocktime intervals for the teams */
7485     bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
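    // For example (illustrative values, assuming the macro converts blocktime
    // into whole monitor wakeup periods, rounding up): blocktime == 200 ms with
    // a monitor waking 10 times per second gives bt_intervals == 2.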
7486 
7487     set__bt_intervals_team( thread->th.th_team, tid, bt_intervals );
7488     set__bt_intervals_team( thread->th.th_serial_team, 0, bt_intervals );
7489 #endif
7490 
    /* Record that blocktime has been explicitly set */
7492     bt_set = TRUE;
7493 
7494     set__bt_set_team( thread->th.th_team, tid, bt_set );
7495     set__bt_set_team( thread->th.th_serial_team, 0, bt_set );
7496 #if KMP_USE_MONITOR
7497     KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
7498                   "bt_intervals=%d, monitor_updates=%d\n",
7499                   __kmp_gtid_from_tid(tid, thread->th.th_team),
7500                   thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
7501                   __kmp_monitor_wakeups));
7502 #else
7503     KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
7504                   __kmp_gtid_from_tid(tid, thread->th.th_team),
7505                   thread->th.th_team->t.t_id, tid, blocktime));
7506 #endif
7507 }
7508 
7509 void
7510 __kmp_aux_set_defaults(
7511     char const * str,
7512     int          len
7513 ) {
7514     if ( ! __kmp_init_serial ) {
7515         __kmp_serial_initialize();
7516     };
7517     __kmp_env_initialize( str );
7518 
7519     if (__kmp_settings
7520 #if OMP_40_ENABLED
7521         || __kmp_display_env || __kmp_display_env_verbose
7522 #endif // OMP_40_ENABLED
7523         ) {
7524         __kmp_env_print();
7525     }
7526 } // __kmp_aux_set_defaults
7527 
7528 /* ------------------------------------------------------------------------ */
7529 
7530 /*
7531  * internal fast reduction routines
7532  */
7533 
7534 PACKED_REDUCTION_METHOD_T
7535 __kmp_determine_reduction_method( ident_t *loc, kmp_int32 global_tid,
7536         kmp_int32 num_vars, size_t reduce_size, void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
7537         kmp_critical_name *lck )
7538 {
7539 
7540     // Default reduction method: critical construct ( lck != NULL, like in current PAROPT )
7541     // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method can be selected by RTL
7542     // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method can be selected by RTL
    // Finally, it is up to the OpenMP RTL to decide which method to select among those generated by PAROPT.
7544 
7545     PACKED_REDUCTION_METHOD_T retval;
7546 
7547     int team_size;
7548 
7549     KMP_DEBUG_ASSERT( loc );    // it would be nice to test ( loc != 0 )
7550     KMP_DEBUG_ASSERT( lck );    // it would be nice to test ( lck != 0 )
7551 
7552     #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED ( ( loc->flags & ( KMP_IDENT_ATOMIC_REDUCE ) ) == ( KMP_IDENT_ATOMIC_REDUCE ) )
7553     #define FAST_REDUCTION_TREE_METHOD_GENERATED   ( ( reduce_data ) && ( reduce_func ) )
7554 
7555     retval = critical_reduce_block;
7556 
    team_size = __kmp_get_team_num_threads( global_tid ); // another way of getting the team size ( with 1 dynamic dereference ) is slower
7558 
7559     if( team_size == 1 ) {
7560 
7561         retval = empty_reduce_block;
7562 
7563     } else {
7564 
7565         int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
7566         int tree_available   = FAST_REDUCTION_TREE_METHOD_GENERATED;
7567 
7568         #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64
7569 
7570             #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN
7571 
                int teamsize_cutoff = 4;
7573 
7574 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
7575                 if( __kmp_mic_type != non_mic ) {
7576                     teamsize_cutoff = 8;
7577                 }
7578 #endif
7579                 if( tree_available ) {
7580                     if( team_size <= teamsize_cutoff ) {
7581                         if ( atomic_available ) {
7582                             retval = atomic_reduce_block;
7583                         }
7584                     } else {
7585                         retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
7586                     }
7587                 } else if ( atomic_available ) {
7588                     retval = atomic_reduce_block;
7589                 }
7590             #else
7591                 #error "Unknown or unsupported OS"
7592             #endif // KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN
7593 
7594         #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
7595 
7596             #if KMP_OS_LINUX || KMP_OS_WINDOWS
7597 
7598                 // basic tuning
7599 
7600                 if( atomic_available ) {
7601                     if( num_vars <= 2 ) { // && ( team_size <= 8 ) due to false-sharing ???
7602                         retval = atomic_reduce_block;
7603                     }
7604                 } // otherwise: use critical section
7605 
7606             #elif KMP_OS_DARWIN
7607 
7608                 if( atomic_available && ( num_vars <= 3 ) ) {
7609                         retval = atomic_reduce_block;
7610                 } else if( tree_available ) {
7611                     if( ( reduce_size > ( 9 * sizeof( kmp_real64 ) ) ) && ( reduce_size < ( 2000 * sizeof( kmp_real64 ) ) ) ) {
7612                         retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
7613                     }
7614                 } // otherwise: use critical section
7615 
7616             #else
7617                 #error "Unknown or unsupported OS"
7618             #endif
7619 
7620         #else
7621             #error "Unknown or unsupported architecture"
7622         #endif
7623 
7624     }
7625 
7626     // KMP_FORCE_REDUCTION
7627 
7628     // If the team is serialized (team_size == 1), ignore the forced reduction
7629     // method and stay with the unsynchronized method (empty_reduce_block)
7630     if( __kmp_force_reduction_method != reduction_method_not_defined && team_size != 1) {
7631 
7632         PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
7633 
7634         int atomic_available, tree_available;
7635 
7636         switch( ( forced_retval = __kmp_force_reduction_method ) )
7637         {
            case critical_reduce_block:
7639                 KMP_ASSERT( lck );              // lck should be != 0
7640                 break;
7641 
7642             case atomic_reduce_block:
7643                 atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
7644                 if( ! atomic_available ) {
7645                     KMP_WARNING(RedMethodNotSupported, "atomic");
7646                     forced_retval = critical_reduce_block;
7647                 }
7648                 break;
7649 
7650             case tree_reduce_block:
7651                 tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
7652                 if( ! tree_available ) {
7653                     KMP_WARNING(RedMethodNotSupported, "tree");
7654                     forced_retval = critical_reduce_block;
7655                 } else {
7656                     #if KMP_FAST_REDUCTION_BARRIER
7657                     forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
7658                     #endif
7659                 }
7660                 break;
7661 
7662             default:
7663                 KMP_ASSERT( 0 ); // "unsupported method specified"
7664         }
7665 
7666         retval = forced_retval;
7667     }
7668 
7669     KA_TRACE(10, ( "reduction method selected=%08x\n", retval ) );
7670 
7671     #undef FAST_REDUCTION_TREE_METHOD_GENERATED
7672     #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
7673 
7674     return ( retval );
7675 }
7676 
// this function is used for testing the set/get/determine reduce method machinery
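// Note: the packed value keeps the barrier variant in its low 8 bits and the
// reduction method in the higher bits (as suggested by the >> 8 below), so the
// shift strips the barrier bits and returns just the method.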
7678 kmp_int32
7679 __kmp_get_reduce_method( void ) {
7680     return ( ( __kmp_entry_thread()->th.th_local.packed_reduction_method ) >> 8 );
7681 }
7682 
7683 /* ------------------------------------------------------------------------ */
7684