1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 //                     The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "kmp.h"
17 #include "kmp_atomic.h"
18 #include "kmp_wrapper_getpid.h"
19 #include "kmp_environment.h"
20 #include "kmp_itt.h"
21 #include "kmp_str.h"
22 #include "kmp_settings.h"
23 #include "kmp_i18n.h"
24 #include "kmp_io.h"
25 #include "kmp_error.h"
26 #include "kmp_stats.h"
27 #include "kmp_wait_release.h"
28 #include "kmp_affinity.h"
29 
30 #if OMPT_SUPPORT
31 #include "ompt-specific.h"
32 #endif
33 
34 /* these are temporary issues to be dealt with */
35 #define KMP_USE_PRCTL 0
36 
37 #if KMP_OS_WINDOWS
38 #include <process.h>
39 #endif
40 
41 #include "tsan_annotations.h"
42 
43 #if defined(KMP_GOMP_COMPAT)
44 char const __kmp_version_alt_comp[] = KMP_VERSION_PREFIX "alternative compiler support: yes";
45 #endif /* defined(KMP_GOMP_COMPAT) */
46 
47 char const __kmp_version_omp_api[] = KMP_VERSION_PREFIX "API version: "
48 #if OMP_45_ENABLED
49     "4.5 (201511)";
50 #elif OMP_40_ENABLED
51     "4.0 (201307)";
52 #else
53     "3.1 (201107)";
54 #endif
55 
56 #ifdef KMP_DEBUG
57 char const __kmp_version_lock[] = KMP_VERSION_PREFIX "lock type: run time selectable";
58 #endif /* KMP_DEBUG */
59 
60 #define KMP_MIN( x, y ) ( (x) < (y) ? (x) : (y) )
61 
62 /* ------------------------------------------------------------------------ */
63 /* ------------------------------------------------------------------------ */
64 
65 kmp_info_t __kmp_monitor;
66 
67 /* ------------------------------------------------------------------------ */
68 /* ------------------------------------------------------------------------ */
69 
70 /* Forward declarations */
71 
72 void __kmp_cleanup( void );
73 
74 static void __kmp_initialize_info( kmp_info_t *, kmp_team_t *, int tid, int gtid );
75 static void __kmp_initialize_team( kmp_team_t * team, int new_nproc, kmp_internal_control_t * new_icvs, ident_t * loc );
76 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
77 static void __kmp_partition_places( kmp_team_t *team, int update_master_only=0 );
78 #endif
79 static void __kmp_do_serial_initialize( void );
80 void __kmp_fork_barrier( int gtid, int tid );
81 void __kmp_join_barrier( int gtid );
82 void __kmp_setup_icv_copy( kmp_team_t *team, int new_nproc, kmp_internal_control_t * new_icvs, ident_t *loc );
83 
84 #ifdef USE_LOAD_BALANCE
85 static int __kmp_load_balance_nproc( kmp_root_t * root, int set_nproc );
86 #endif
87 
88 static int __kmp_expand_threads(int nWish, int nNeed);
89 #if KMP_OS_WINDOWS
90 static int __kmp_unregister_root_other_thread( int gtid );
91 #endif
92 static void __kmp_unregister_library( void ); // called by __kmp_internal_end()
93 static void __kmp_reap_thread( kmp_info_t * thread, int is_root );
94 static kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
95 
96 /* ------------------------------------------------------------------------ */
97 /* ------------------------------------------------------------------------ */
98 
99 /* Calculate the identifier of the current thread */
100 /* fast (and somewhat portable) way to get unique */
101 /* identifier of executing thread.                */
102 /* returns KMP_GTID_DNE if we haven't been assigned a gtid   */
103 
104 int
105 __kmp_get_global_thread_id( )
106 {
107     int i;
108     kmp_info_t   **other_threads;
109     size_t         stack_data;
110     char          *stack_addr;
111     size_t         stack_size;
112     char          *stack_base;
113 
114     KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
115                       __kmp_nth, __kmp_all_nth ));
116 
    /* JPH - To handle the case where __kmpc_end(0) is called immediately prior to a
             parallel region, this returns KMP_GTID_DNE to force serial_initialize by the
             caller.  KMP_GTID_DNE must be handled at all call sites, or else __kmp_init_gtid
             must be guaranteed, for this to work.  */
121 
122     if ( !TCR_4(__kmp_init_gtid) ) return KMP_GTID_DNE;
123 
124 #ifdef KMP_TDATA_GTID
125     if ( TCR_4(__kmp_gtid_mode) >= 3) {
126         KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: using TDATA\n" ));
127         return __kmp_gtid;
128     }
129 #endif
130     if ( TCR_4(__kmp_gtid_mode) >= 2) {
131         KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: using keyed TLS\n" ));
132         return __kmp_gtid_get_specific();
133     }
134     KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: using internal alg.\n" ));
135 
136     stack_addr    = (char*) & stack_data;
137     other_threads = __kmp_threads;
138 
139     /*
        ATT: The code below is a source of potential bugs due to unsynchronized access to the
        __kmp_threads array. For example:
            1. Current thread loads other_threads[i] into thr and checks that it is non-NULL.
143             2. Current thread is suspended by OS.
144             3. Another thread unregisters and finishes (debug versions of free() may fill memory
145                with something like 0xEF).
146             4. Current thread is resumed.
147             5. Current thread reads junk from *thr.
148         TODO: Fix it.
149         --ln
150     */
151 
152     for( i = 0 ; i < __kmp_threads_capacity ; i++ ) {
153 
154         kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
155         if( !thr ) continue;
156 
157         stack_size =  (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
158         stack_base =  (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
159 
160         /* stack grows down -- search through all of the active threads */
161 
162         if( stack_addr <= stack_base ) {
163             size_t stack_diff = stack_base - stack_addr;
164 
165             if( stack_diff <= stack_size ) {
166                 /* The only way we can be closer than the allocated */
167                 /* stack size is if we are running on this thread. */
168                 KMP_DEBUG_ASSERT( __kmp_gtid_get_specific() == i );
169                 return i;
170             }
171         }
172     }
173 
    /* fall back to thread-specific storage to try to determine our gtid */
175     KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: internal alg. failed to find "
176                       "thread, using TLS\n" ));
177     i = __kmp_gtid_get_specific();
178 
179     /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */
180 
    /* if we haven't been assigned a gtid, return the error code */
182     if( i<0 ) return i;
183 
184     /* dynamically updated stack window for uber threads to avoid get_specific call */
185     if( ! TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow) ) {
186         KMP_FATAL( StackOverflow, i );
187     }
188 
189     stack_base = (char *) other_threads[i]->th.th_info.ds.ds_stackbase;
190     if( stack_addr > stack_base ) {
191         TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
192         TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
193           other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr - stack_base);
194     } else {
195         TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize, stack_base - stack_addr);
196     }
197 
198     /* Reprint stack bounds for ubermaster since they have been refined */
199     if ( __kmp_storage_map ) {
200         char *stack_end = (char *) other_threads[i]->th.th_info.ds.ds_stackbase;
201         char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
202         __kmp_print_storage_map_gtid( i, stack_beg, stack_end,
203                                       other_threads[i]->th.th_info.ds.ds_stacksize,
204                                       "th_%d stack (refinement)", i );
205     }
206     return i;
207 }
208 
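/*
 * Like __kmp_get_global_thread_id(), but if the calling thread has not yet been
 * assigned a gtid, register it as a new root (performing serial initialization
 * first if necessary) so that a valid gtid can always be returned.
 */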
209 int
210 __kmp_get_global_thread_id_reg( )
211 {
212     int gtid;
213 
214     if ( !__kmp_init_serial ) {
215         gtid = KMP_GTID_DNE;
216     } else
217 #ifdef KMP_TDATA_GTID
218     if ( TCR_4(__kmp_gtid_mode) >= 3 ) {
219         KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id_reg: using TDATA\n" ));
220         gtid = __kmp_gtid;
221     } else
222 #endif
223     if ( TCR_4(__kmp_gtid_mode) >= 2 ) {
224         KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id_reg: using keyed TLS\n" ));
225         gtid = __kmp_gtid_get_specific();
226     } else {
227         KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id_reg: using internal alg.\n" ));
228         gtid = __kmp_get_global_thread_id();
229     }
230 
231     /* we must be a new uber master sibling thread */
232     if( gtid == KMP_GTID_DNE ) {
233         KA_TRACE( 10, ( "__kmp_get_global_thread_id_reg: Encountered new root thread. "
234                         "Registering a new gtid.\n" ));
235         __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
236         if( !__kmp_init_serial ) {
237             __kmp_do_serial_initialize();
238             gtid = __kmp_gtid_get_specific();
239         } else {
240             gtid = __kmp_register_root(FALSE);
241         }
242         __kmp_release_bootstrap_lock( &__kmp_initz_lock );
243         /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
244     }
245 
246     KMP_DEBUG_ASSERT( gtid >=0 );
247 
248     return gtid;
249 }
250 
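/*
 * Check whether the stack of thread th overlaps the stack of any other registered
 * thread; if so, print the offending ranges (when storage mapping is enabled) and
 * issue a fatal StackOverlap error.
 */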
251 /* caller must hold forkjoin_lock */
252 void
253 __kmp_check_stack_overlap( kmp_info_t *th )
254 {
255     int f;
256     char *stack_beg = NULL;
257     char *stack_end = NULL;
258     int gtid;
259 
260     KA_TRACE(10,("__kmp_check_stack_overlap: called\n"));
261     if ( __kmp_storage_map ) {
262         stack_end = (char *) th->th.th_info.ds.ds_stackbase;
263         stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
264 
265         gtid = __kmp_gtid_from_thread( th );
266 
267         if (gtid == KMP_GTID_MONITOR) {
268             __kmp_print_storage_map_gtid( gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
269                                      "th_%s stack (%s)", "mon",
270                                      ( th->th.th_info.ds.ds_stackgrow ) ? "initial" : "actual" );
271         } else {
272             __kmp_print_storage_map_gtid( gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
273                                      "th_%d stack (%s)", gtid,
274                                      ( th->th.th_info.ds.ds_stackgrow ) ? "initial" : "actual" );
275         }
276     }
277 
278     /* No point in checking ubermaster threads since they use refinement and cannot overlap */
279     gtid = __kmp_gtid_from_thread( th );
280     if ( __kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid))
281     {
282         KA_TRACE(10,("__kmp_check_stack_overlap: performing extensive checking\n"));
283         if ( stack_beg == NULL ) {
284             stack_end = (char *) th->th.th_info.ds.ds_stackbase;
285             stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
286         }
287 
288         for( f=0 ; f < __kmp_threads_capacity ; f++ ) {
289             kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
290 
291             if( f_th && f_th != th ) {
292                 char *other_stack_end = (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
293                 char *other_stack_beg = other_stack_end -
294                                         (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
295                 if((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
296                    (stack_end > other_stack_beg && stack_end < other_stack_end)) {
297 
298                     /* Print the other stack values before the abort */
299                     if ( __kmp_storage_map )
300                         __kmp_print_storage_map_gtid( -1, other_stack_beg, other_stack_end,
301                             (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
302                             "th_%d stack (overlapped)",
303                                                  __kmp_gtid_from_thread( f_th ) );
304 
305                     __kmp_msg( kmp_ms_fatal, KMP_MSG( StackOverlap ), KMP_HNT( ChangeStackLimit ), __kmp_msg_null );
306                 }
307             }
308         }
309     }
310     KA_TRACE(10,("__kmp_check_stack_overlap: returning\n"));
311 }
312 
313 
314 /* ------------------------------------------------------------------------ */
315 
316 /* ------------------------------------------------------------------------ */
317 
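/*
 * Yield forever.  Used (e.g. by the abort paths below) to park a thread so that
 * process termination can proceed without it.
 */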
318 void
319 __kmp_infinite_loop( void )
320 {
321     static int done = FALSE;
322 
323     while (! done) {
324         KMP_YIELD( 1 );
325     }
326 }
327 
328 #define MAX_MESSAGE     512
329 
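/*
 * Format and print one storage map line, "OMP storage map: <p1> <p2> <size> <description>",
 * to the runtime error stream.  When data placement printing is enabled, the NUMA node(s)
 * backing the [p1, p2] range are also reported for the given gtid.
 */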
330 void
331 __kmp_print_storage_map_gtid( int gtid, void *p1, void *p2, size_t size, char const *format, ...) {
332     char buffer[MAX_MESSAGE];
333     va_list ap;
334 
335     va_start( ap, format);
336     KMP_SNPRINTF( buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1, p2, (unsigned long) size, format );
337     __kmp_acquire_bootstrap_lock( & __kmp_stdio_lock );
338     __kmp_vprintf( kmp_err, buffer, ap );
339 #if KMP_PRINT_DATA_PLACEMENT
340     int node;
341     if(gtid >= 0) {
342         if(p1 <= p2 && (char*)p2 - (char*)p1 == size) {
343             if( __kmp_storage_map_verbose ) {
344                 node = __kmp_get_host_node(p1);
345                 if(node < 0)  /* doesn't work, so don't try this next time */
346                     __kmp_storage_map_verbose = FALSE;
347                 else {
348                     char *last;
349                     int lastNode;
350                     int localProc = __kmp_get_cpu_from_gtid(gtid);
351 
352                     const int page_size = KMP_GET_PAGE_SIZE();
353 
354                     p1 = (void *)( (size_t)p1 & ~((size_t)page_size - 1) );
355                     p2 = (void *)( ((size_t) p2 - 1) & ~((size_t)page_size - 1) );
356                     if(localProc >= 0)
357                         __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid, localProc>>1);
358                     else
359                         __kmp_printf_no_lock("  GTID %d\n", gtid);
360 # if KMP_USE_PRCTL
361 /* The more elaborate format is disabled for now because of the prctl hanging bug. */
362                     do {
363                         last = p1;
364                         lastNode = node;
365                         /* This loop collates adjacent pages with the same host node. */
366                         do {
                            p1 = (char*)p1 + page_size;
368                         } while(p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
369                         __kmp_printf_no_lock("    %p-%p memNode %d\n", last,
370                                              (char*)p1 - 1, lastNode);
371                     } while(p1 <= p2);
372 # else
373                     __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
374                                          (char*)p1 + (page_size - 1), __kmp_get_host_node(p1));
375                     if(p1 < p2)  {
376                         __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
377                                              (char*)p2 + (page_size - 1), __kmp_get_host_node(p2));
378                     }
379 # endif
380                 }
381             }
382         } else
383             __kmp_printf_no_lock("  %s\n", KMP_I18N_STR( StorageMapWarning ) );
384     }
385 #endif /* KMP_PRINT_DATA_PLACEMENT */
386     __kmp_release_bootstrap_lock( & __kmp_stdio_lock );
387 }
388 
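/*
 * Print a formatted "OMP warning:" message to the runtime error stream, unless
 * warnings have been disabled (__kmp_generate_warnings == kmp_warnings_off).
 */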
389 void
390 __kmp_warn( char const * format, ... )
391 {
392     char buffer[MAX_MESSAGE];
393     va_list ap;
394 
395     if ( __kmp_generate_warnings == kmp_warnings_off ) {
396         return;
397     }
398 
399     va_start( ap, format );
400 
401     KMP_SNPRINTF( buffer, sizeof(buffer) , "OMP warning: %s\n", format );
402     __kmp_acquire_bootstrap_lock( & __kmp_stdio_lock );
403     __kmp_vprintf( kmp_err, buffer, ap );
404     __kmp_release_bootstrap_lock( & __kmp_stdio_lock );
405 
406     va_end( ap );
407 }
408 
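/*
 * Terminate the whole process abnormally.  On Windows* OS the abort is simulated
 * with raise(SIGABRT) + _exit() to avoid the pop-up error box (see comment below).
 */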
409 void
410 __kmp_abort_process()
411 {
412 
413     // Later threads may stall here, but that's ok because abort() will kill them.
414     __kmp_acquire_bootstrap_lock( & __kmp_exit_lock );
415 
416     if ( __kmp_debug_buf ) {
417         __kmp_dump_debug_buffer();
418     }; // if
419 
420     if ( KMP_OS_WINDOWS ) {
421         // Let other threads know of abnormal termination and prevent deadlock
422         // if abort happened during library initialization or shutdown
423         __kmp_global.g.g_abort = SIGABRT;
424 
425         /*
            On Windows* OS, by default abort() causes a pop-up error box, which stalls nightly testing.
            Unfortunately, we cannot reliably suppress pop-up error boxes. _set_abort_behavior()
            works well, but this function is not available in VS7 (this is not a problem for the DLL,
            but it is a problem for the static OpenMP RTL). SetErrorMode (and so, the timelimit utility)
            does not help, at least in some versions of the MS C RTL.

            It seems the following sequence is the only way to simulate abort() and avoid the pop-up
            error box.
434         */
435         raise( SIGABRT );
436         _exit( 3 );    // Just in case, if signal ignored, exit anyway.
437     } else {
438         abort();
439     }; // if
440 
441     __kmp_infinite_loop();
442     __kmp_release_bootstrap_lock( & __kmp_exit_lock );
443 
444 } // __kmp_abort_process
445 
446 void
447 __kmp_abort_thread( void )
448 {
449     // TODO: Eliminate g_abort global variable and this function.
450     // In case of abort just call abort(), it will kill all the threads.
451     __kmp_infinite_loop();
452 } // __kmp_abort_thread
453 
454 /* ------------------------------------------------------------------------ */
455 
456 /*
457  * Print out the storage map for the major kmp_info_t thread data structures
458  * that are allocated together.
459  */
460 
461 static void
462 __kmp_print_thread_storage_map( kmp_info_t *thr, int gtid )
463 {
464     __kmp_print_storage_map_gtid( gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d", gtid );
465 
466     __kmp_print_storage_map_gtid( gtid, &thr->th.th_info, &thr->th.th_team, sizeof(kmp_desc_t),
467                              "th_%d.th_info", gtid );
468 
469     __kmp_print_storage_map_gtid( gtid, &thr->th.th_local, &thr->th.th_pri_head, sizeof(kmp_local_t),
470                              "th_%d.th_local", gtid );
471 
472     __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
473                              sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid );
474 
475     __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[bs_plain_barrier],
476                              &thr->th.th_bar[bs_plain_barrier+1],
477                              sizeof(kmp_balign_t), "th_%d.th_bar[plain]", gtid);
478 
479     __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[bs_forkjoin_barrier],
480                              &thr->th.th_bar[bs_forkjoin_barrier+1],
481                              sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]", gtid);
482 
483     #if KMP_FAST_REDUCTION_BARRIER
484         __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[bs_reduction_barrier],
485                              &thr->th.th_bar[bs_reduction_barrier+1],
486                              sizeof(kmp_balign_t), "th_%d.th_bar[reduction]", gtid);
487     #endif // KMP_FAST_REDUCTION_BARRIER
488 }
489 
490 /*
491  * Print out the storage map for the major kmp_team_t team data structures
492  * that are allocated together.
493  */
494 
495 static void
496 __kmp_print_team_storage_map( const char *header, kmp_team_t *team, int team_id, int num_thr )
497 {
498     int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
499     __kmp_print_storage_map_gtid( -1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
500                              header, team_id );
501 
502     __kmp_print_storage_map_gtid( -1, &team->t.t_bar[0], &team->t.t_bar[bs_last_barrier],
503                              sizeof(kmp_balign_team_t) * bs_last_barrier, "%s_%d.t_bar", header, team_id );
504 
505 
506     __kmp_print_storage_map_gtid( -1, &team->t.t_bar[bs_plain_barrier], &team->t.t_bar[bs_plain_barrier+1],
507                              sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]", header, team_id );
508 
509     __kmp_print_storage_map_gtid( -1, &team->t.t_bar[bs_forkjoin_barrier], &team->t.t_bar[bs_forkjoin_barrier+1],
510                              sizeof(kmp_balign_team_t), "%s_%d.t_bar[forkjoin]", header, team_id );
511 
512     #if KMP_FAST_REDUCTION_BARRIER
513         __kmp_print_storage_map_gtid( -1, &team->t.t_bar[bs_reduction_barrier], &team->t.t_bar[bs_reduction_barrier+1],
514                              sizeof(kmp_balign_team_t), "%s_%d.t_bar[reduction]", header, team_id );
515     #endif // KMP_FAST_REDUCTION_BARRIER
516 
517     __kmp_print_storage_map_gtid( -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
518                              sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id );
519 
520     __kmp_print_storage_map_gtid( -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
521                              sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id );
522 
523     __kmp_print_storage_map_gtid( -1, &team->t.t_disp_buffer[0], &team->t.t_disp_buffer[num_disp_buff],
524                              sizeof(dispatch_shared_info_t) * num_disp_buff, "%s_%d.t_disp_buffer",
525                              header, team_id );
526 
527 
528     __kmp_print_storage_map_gtid( -1, &team->t.t_taskq, &team->t.t_copypriv_data,
529                              sizeof(kmp_taskq_t), "%s_%d.t_taskq", header, team_id );
530 }
531 
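// Allocator initialization/finalization hooks; currently no-ops.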
532 static void __kmp_init_allocator() {}
533 static void __kmp_fini_allocator() {}
534 
535 /* ------------------------------------------------------------------------ */
536 
537 #ifdef KMP_DYNAMIC_LIB
538 # if KMP_OS_WINDOWS
539 
540 static void
541 __kmp_reset_lock( kmp_bootstrap_lock_t* lck ) {
542     // TODO: Change to __kmp_break_bootstrap_lock().
543     __kmp_init_bootstrap_lock( lck ); // make the lock released
544 }
545 
546 static void
547 __kmp_reset_locks_on_process_detach( int gtid_req ) {
548     int i;
549     int thread_count;
550 
551     // PROCESS_DETACH is expected to be called by a thread
552     // that executes ProcessExit() or FreeLibrary().
    // The OS terminates the other threads (except the one calling ProcessExit or FreeLibrary),
    // so it might seem safe to access __kmp_threads[] without taking the forkjoin_lock.
    // In fact, however, some threads can still be alive here, although they are about to be terminated.
    // The threads in the array with ds_thread==0 are the most suspicious,
    // so it may actually be unsafe to access __kmp_threads[].
558 
559     // TODO: does it make sense to check __kmp_roots[] ?
560 
561     // Let's check that there are no other alive threads registered with the OMP lib.
562     while( 1 ) {
563         thread_count = 0;
564         for( i = 0; i < __kmp_threads_capacity; ++i ) {
565             if( !__kmp_threads ) continue;
566             kmp_info_t* th = __kmp_threads[ i ];
567             if( th == NULL ) continue;
568             int gtid = th->th.th_info.ds.ds_gtid;
569             if( gtid == gtid_req ) continue;
570             if( gtid < 0 ) continue;
571             DWORD exit_val;
572             int alive = __kmp_is_thread_alive( th, &exit_val );
            if( alive ) {
                ++thread_count;
            }
576         }
577         if( thread_count == 0 ) break; // success
578     }
579 
580     // Assume that I'm alone.
581 
    // Now it is probably safe to check and reset the locks.
583     // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
584     __kmp_reset_lock( &__kmp_forkjoin_lock );
585     #ifdef KMP_DEBUG
586     __kmp_reset_lock( &__kmp_stdio_lock );
587     #endif // KMP_DEBUG
588 }
589 
590 BOOL WINAPI
591 DllMain( HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved ) {
592     //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
593 
594     switch( fdwReason ) {
595 
596         case DLL_PROCESS_ATTACH:
597             KA_TRACE( 10, ("DllMain: PROCESS_ATTACH\n" ));
598 
599             return TRUE;
600 
601         case DLL_PROCESS_DETACH:
602             KA_TRACE( 10, ("DllMain: PROCESS_DETACH T#%d\n",
603                         __kmp_gtid_get_specific() ));
604 
605             if( lpReserved != NULL )
606             {
607                 // lpReserved is used for telling the difference:
608                 //  lpReserved == NULL when FreeLibrary() was called,
609                 //  lpReserved != NULL when the process terminates.
610                 // When FreeLibrary() is called, worker threads remain alive.
611                 // So they will release the forkjoin lock by themselves.
                // When the process terminates, worker threads disappear, triggering
                // the problem of an unreleased forkjoin lock as described below.

                // A worker thread can take the forkjoin lock.
                // The problem comes up if that worker thread dies
                // before it releases the forkjoin lock.
                // The forkjoin lock then remains taken, while the thread
                // executing DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below
                // will try to take the forkjoin lock and will always fail,
                // so the application will never finish [normally].
                // This scenario is possible if __kmpc_end() has not been executed.
                // It is not just a corner case; it happens in common situations:
                // - the main function was compiled by an alternative compiler;
                // - the main function was compiled by icl but without /Qopenmp (application with plugins);
                // - the application terminates by calling C exit(), Fortran CALL EXIT() or Fortran STOP;
                // - an alive foreign thread prevented __kmpc_end from doing cleanup.
628 
629                 // This is a hack to work around the problem.
630                 // TODO: !!! to figure out something better.
631                 __kmp_reset_locks_on_process_detach( __kmp_gtid_get_specific() );
632             }
633 
634             __kmp_internal_end_library( __kmp_gtid_get_specific() );
635 
636             return TRUE;
637 
638         case DLL_THREAD_ATTACH:
639             KA_TRACE( 10, ("DllMain: THREAD_ATTACH\n" ));
640 
            /* If we wanted to register new sibling threads every time, we would
             * call __kmp_get_gtid() here. */
643             return TRUE;
644 
645         case DLL_THREAD_DETACH:
646             KA_TRACE( 10, ("DllMain: THREAD_DETACH T#%d\n",
647                         __kmp_gtid_get_specific() ));
648 
649             __kmp_internal_end_thread( __kmp_gtid_get_specific() );
650             return TRUE;
651     }
652 
653     return TRUE;
654 }
655 
656 # endif /* KMP_OS_WINDOWS */
657 #endif /* KMP_DYNAMIC_LIB */
658 
659 
660 /* ------------------------------------------------------------------------ */
661 
662 /* Change the library type to "status" and return the old type */
663 /* called from within initialization routines where __kmp_initz_lock is held */
664 int
665 __kmp_change_library( int status )
666 {
667     int old_status;
668 
669     old_status = __kmp_yield_init & 1;  // check whether KMP_LIBRARY=throughput (even init count)
670 
671     if (status) {
672         __kmp_yield_init |= 1;  // throughput => turnaround (odd init count)
673     }
674     else {
675         __kmp_yield_init &= ~1; // turnaround => throughput (even init count)
676     }
677 
678     return old_status;  // return previous setting of whether KMP_LIBRARY=throughput
679 }
680 
681 /* ------------------------------------------------------------------------ */
682 /* ------------------------------------------------------------------------ */
683 
684 /* __kmp_parallel_deo --
685  * Wait until it's our turn.
686  */
687 void
688 __kmp_parallel_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
689 {
690     int gtid = *gtid_ref;
691 #ifdef BUILD_PARALLEL_ORDERED
692     kmp_team_t *team = __kmp_team_from_gtid( gtid );
693 #endif /* BUILD_PARALLEL_ORDERED */
694 
695     if( __kmp_env_consistency_check ) {
696         if( __kmp_threads[gtid]->th.th_root->r.r_active )
697 #if KMP_USE_DYNAMIC_LOCK
698             __kmp_push_sync( gtid, ct_ordered_in_parallel, loc_ref, NULL, 0 );
699 #else
700             __kmp_push_sync( gtid, ct_ordered_in_parallel, loc_ref, NULL );
701 #endif
702     }
703 #ifdef BUILD_PARALLEL_ORDERED
704     if( !team->t.t_serialized ) {
705         KMP_MB();
706         KMP_WAIT_YIELD(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid( gtid ), KMP_EQ, NULL);
707         KMP_MB();
708     }
709 #endif /* BUILD_PARALLEL_ORDERED */
710 }
711 
712 /* __kmp_parallel_dxo --
713  * Signal the next task.
714  */
715 
716 void
717 __kmp_parallel_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
718 {
719     int gtid = *gtid_ref;
720 #ifdef BUILD_PARALLEL_ORDERED
721     int tid =  __kmp_tid_from_gtid( gtid );
722     kmp_team_t *team = __kmp_team_from_gtid( gtid );
723 #endif /* BUILD_PARALLEL_ORDERED */
724 
725     if( __kmp_env_consistency_check ) {
726         if( __kmp_threads[gtid]->th.th_root->r.r_active )
727             __kmp_pop_sync( gtid, ct_ordered_in_parallel, loc_ref );
728     }
729 #ifdef BUILD_PARALLEL_ORDERED
730     if ( ! team->t.t_serialized ) {
731         KMP_MB();       /* Flush all pending memory write invalidates.  */
732 
733         /* use the tid of the next thread in this team */
        /* TODO: replace with a general release procedure */
735         team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc );
736 
737 #if OMPT_SUPPORT && OMPT_BLAME
738         if (ompt_enabled &&
739             ompt_callbacks.ompt_callback(ompt_event_release_ordered)) {
740             /* accept blame for "ordered" waiting */
741             kmp_info_t *this_thread = __kmp_threads[gtid];
742             ompt_callbacks.ompt_callback(ompt_event_release_ordered)(
743                 this_thread->th.ompt_thread_info.wait_id);
744         }
745 #endif
746 
747         KMP_MB();       /* Flush all pending memory write invalidates.  */
748     }
749 #endif /* BUILD_PARALLEL_ORDERED */
750 }
751 
752 /* ------------------------------------------------------------------------ */
753 /* ------------------------------------------------------------------------ */
754 
755 /* ------------------------------------------------------------------------ */
756 /* ------------------------------------------------------------------------ */
757 
758 /* The BARRIER for a SINGLE process section is always explicit   */
759 
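/*
 * __kmp_enter_single returns nonzero if the calling thread won the race for the
 * single region (and should execute its body), zero otherwise.  Typical use by a
 * compiler-generated single construct (sketch):
 *
 *     if ( __kmp_enter_single( gtid, loc, TRUE ) ) {
 *         ... body of the single region ...
 *         __kmp_exit_single( gtid );
 *     }
 *     // the closing barrier is emitted separately by the caller
 */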
760 int
761 __kmp_enter_single( int gtid, ident_t *id_ref, int push_ws )
762 {
763     int status;
764     kmp_info_t *th;
765     kmp_team_t *team;
766 
767     if( ! TCR_4(__kmp_init_parallel) )
768         __kmp_parallel_initialize();
769 
770     th   = __kmp_threads[ gtid ];
771     team = th->th.th_team;
772     status = 0;
773 
774     th->th.th_ident = id_ref;
775 
776     if ( team->t.t_serialized ) {
777         status = 1;
778     } else {
779         kmp_int32 old_this = th->th.th_local.this_construct;
780 
781         ++th->th.th_local.this_construct;
782         /* try to set team count to thread count--success means thread got the
783            single block
784         */
785         /* TODO: Should this be acquire or release? */
786         if (team->t.t_construct == old_this) {
787             status = KMP_COMPARE_AND_STORE_ACQ32(&team->t.t_construct, old_this,
788                                                  th->th.th_local.this_construct);
789         }
790 #if USE_ITT_BUILD
791         if ( __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && KMP_MASTER_GTID(gtid) &&
792 #if OMP_40_ENABLED
793             th->th.th_teams_microtask == NULL &&
794 #endif
795             team->t.t_active_level == 1 )
796         {   // Only report metadata by master of active team at level 1
797             __kmp_itt_metadata_single( id_ref );
798         }
799 #endif /* USE_ITT_BUILD */
800     }
801 
802     if( __kmp_env_consistency_check ) {
803         if (status && push_ws) {
804             __kmp_push_workshare( gtid, ct_psingle, id_ref );
805         } else {
806             __kmp_check_workshare( gtid, ct_psingle, id_ref );
807         }
808     }
809 #if USE_ITT_BUILD
810     if ( status ) {
811         __kmp_itt_single_start( gtid );
812     }
813 #endif /* USE_ITT_BUILD */
814     return status;
815 }
816 
817 void
818 __kmp_exit_single( int gtid )
819 {
820 #if USE_ITT_BUILD
821     __kmp_itt_single_end( gtid );
822 #endif /* USE_ITT_BUILD */
823     if( __kmp_env_consistency_check )
824         __kmp_pop_workshare( gtid, ct_psingle, NULL );
825 }
826 
827 
828 /*
829  * determine if we can go parallel or must use a serialized parallel region and
830  * how many threads we can use
831  * set_nproc is the number of threads requested for the team
832  * returns 0 if we should serialize or only use one thread,
833  * otherwise the number of threads to use
834  * The forkjoin lock is held by the caller.
835  */
836 static int
837 __kmp_reserve_threads( kmp_root_t *root, kmp_team_t *parent_team,
838    int master_tid, int set_nthreads
839 #if OMP_40_ENABLED
840   , int enter_teams
841 #endif /* OMP_40_ENABLED */
842 )
843 {
844     int capacity;
845     int new_nthreads;
846     KMP_DEBUG_ASSERT( __kmp_init_serial );
847     KMP_DEBUG_ASSERT( root && parent_team );
848 
849     //
850     // If dyn-var is set, dynamically adjust the number of desired threads,
851     // according to the method specified by dynamic_mode.
852     //
853     new_nthreads = set_nthreads;
854     if ( ! get__dynamic_2( parent_team, master_tid ) ) {
855         ;
856     }
857 #ifdef USE_LOAD_BALANCE
858     else if ( __kmp_global.g.g_dynamic_mode == dynamic_load_balance ) {
859         new_nthreads = __kmp_load_balance_nproc( root, set_nthreads );
860         if ( new_nthreads == 1 ) {
861             KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d load balance reduced reservation to 1 thread\n",
862               master_tid ));
863             return 1;
864         }
865         if ( new_nthreads < set_nthreads ) {
866             KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d load balance reduced reservation to %d threads\n",
867               master_tid, new_nthreads ));
868         }
869     }
870 #endif /* USE_LOAD_BALANCE */
871     else if ( __kmp_global.g.g_dynamic_mode == dynamic_thread_limit ) {
872         new_nthreads = __kmp_avail_proc - __kmp_nth + (root->r.r_active ? 1
873           : root->r.r_hot_team->t.t_nproc);
874         if ( new_nthreads <= 1 ) {
875             KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d thread limit reduced reservation to 1 thread\n",
876               master_tid ));
877             return 1;
878         }
879         if ( new_nthreads < set_nthreads ) {
880             KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d thread limit reduced reservation to %d threads\n",
881               master_tid, new_nthreads ));
882         }
883         else {
884             new_nthreads = set_nthreads;
885         }
886     }
887     else if ( __kmp_global.g.g_dynamic_mode == dynamic_random ) {
888         if ( set_nthreads > 2 ) {
889             new_nthreads = __kmp_get_random( parent_team->t.t_threads[master_tid] );
890             new_nthreads = ( new_nthreads % set_nthreads ) + 1;
891             if ( new_nthreads == 1 ) {
892                 KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d dynamic random reduced reservation to 1 thread\n",
893                   master_tid ));
894                 return 1;
895             }
896             if ( new_nthreads < set_nthreads ) {
897                 KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d dynamic random reduced reservation to %d threads\n",
898                   master_tid, new_nthreads ));
899             }
900         }
901     }
902     else {
903         KMP_ASSERT( 0 );
904     }
905 
906     //
907     // Respect KMP_ALL_THREADS, KMP_MAX_THREADS, OMP_THREAD_LIMIT.
908     //
909     if ( __kmp_nth + new_nthreads - ( root->r.r_active ? 1 :
910       root->r.r_hot_team->t.t_nproc ) > __kmp_max_nth ) {
911         int tl_nthreads = __kmp_max_nth - __kmp_nth + ( root->r.r_active ? 1 :
912           root->r.r_hot_team->t.t_nproc );
913         if ( tl_nthreads <= 0 ) {
914             tl_nthreads = 1;
915         }
916 
917         //
918         // If dyn-var is false, emit a 1-time warning.
919         //
920         if ( ! get__dynamic_2( parent_team, master_tid )
921           && ( ! __kmp_reserve_warn ) ) {
922             __kmp_reserve_warn = 1;
923             __kmp_msg(
924                 kmp_ms_warning,
925                 KMP_MSG( CantFormThrTeam, set_nthreads, tl_nthreads ),
926                 KMP_HNT( Unset_ALL_THREADS ),
927                 __kmp_msg_null
928             );
929         }
930         if ( tl_nthreads == 1 ) {
931             KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d KMP_ALL_THREADS reduced reservation to 1 thread\n",
932               master_tid ));
933             return 1;
934         }
935         KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d KMP_ALL_THREADS reduced reservation to %d threads\n",
936           master_tid, tl_nthreads ));
937         new_nthreads = tl_nthreads;
938     }
939 
940     //
941     // Check if the threads array is large enough, or needs expanding.
942     //
943     // See comment in __kmp_register_root() about the adjustment if
944     // __kmp_threads[0] == NULL.
945     //
946     capacity = __kmp_threads_capacity;
947     if ( TCR_PTR(__kmp_threads[0]) == NULL ) {
948         --capacity;
949     }
950     if ( __kmp_nth + new_nthreads - ( root->r.r_active ? 1 :
951       root->r.r_hot_team->t.t_nproc ) > capacity ) {
952         //
953         // Expand the threads array.
954         //
955         int slotsRequired = __kmp_nth + new_nthreads - ( root->r.r_active ? 1 :
956           root->r.r_hot_team->t.t_nproc ) - capacity;
957         int slotsAdded = __kmp_expand_threads(slotsRequired, slotsRequired);
958         if ( slotsAdded < slotsRequired ) {
959             //
960             // The threads array was not expanded enough.
961             //
962             new_nthreads -= ( slotsRequired - slotsAdded );
963             KMP_ASSERT( new_nthreads >= 1 );
964 
965             //
966             // If dyn-var is false, emit a 1-time warning.
967             //
968             if ( ! get__dynamic_2( parent_team, master_tid )
969               && ( ! __kmp_reserve_warn ) ) {
970                 __kmp_reserve_warn = 1;
971                 if ( __kmp_tp_cached ) {
972                     __kmp_msg(
973                         kmp_ms_warning,
974                         KMP_MSG( CantFormThrTeam, set_nthreads, new_nthreads ),
975                         KMP_HNT( Set_ALL_THREADPRIVATE, __kmp_tp_capacity ),
976                         KMP_HNT( PossibleSystemLimitOnThreads ),
977                         __kmp_msg_null
978                     );
979                 }
980                 else {
981                     __kmp_msg(
982                         kmp_ms_warning,
983                         KMP_MSG( CantFormThrTeam, set_nthreads, new_nthreads ),
984                         KMP_HNT( SystemLimitOnThreads ),
985                         __kmp_msg_null
986                     );
987                 }
988             }
989         }
990     }
991 
992     if ( new_nthreads == 1 ) {
993         KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d serializing team after reclaiming dead roots and rechecking; requested %d threads\n",
994                         __kmp_get_gtid(), set_nthreads ) );
995         return 1;
996     }
997 
998     KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d allocating %d threads; requested %d threads\n",
999                     __kmp_get_gtid(), new_nthreads, set_nthreads ));
1000     return new_nthreads;
1001 }
1002 
1003 /* ------------------------------------------------------------------------ */
1004 /* ------------------------------------------------------------------------ */
1005 
1006 /* allocate threads from the thread pool and assign them to the new team */
/* we are assured that there are enough threads available, because we
 * checked that earlier within the forkjoin critical section */
1009 
1010 static void
1011 __kmp_fork_team_threads( kmp_root_t *root, kmp_team_t *team,
1012                          kmp_info_t *master_th, int master_gtid )
1013 {
1014     int         i;
1015     int use_hot_team;
1016 
1017     KA_TRACE( 10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc ) );
1018     KMP_DEBUG_ASSERT( master_gtid == __kmp_get_gtid() );
1019     KMP_MB();
1020 
1021     /* first, let's setup the master thread */
1022     master_th->th.th_info.ds.ds_tid  = 0;
1023     master_th->th.th_team            = team;
1024     master_th->th.th_team_nproc      = team->t.t_nproc;
1025     master_th->th.th_team_master     = master_th;
1026     master_th->th.th_team_serialized = FALSE;
1027     master_th->th.th_dispatch        = & team->t.t_dispatch[ 0 ];
1028 
1029     /* make sure we are not the optimized hot team */
1030 #if KMP_NESTED_HOT_TEAMS
1031     use_hot_team = 0;
1032     kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
1033     if( hot_teams ) {  // hot teams array is not allocated if KMP_HOT_TEAMS_MAX_LEVEL=0
1034         int level = team->t.t_active_level - 1;    // index in array of hot teams
1035         if( master_th->th.th_teams_microtask ) {    // are we inside the teams?
1036             if( master_th->th.th_teams_size.nteams > 1 ) {
1037                 ++level; // level was not increased in teams construct for team_of_masters
1038             }
1039             if( team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1040                 master_th->th.th_teams_level == team->t.t_level ) {
1041                 ++level; // level was not increased in teams construct for team_of_workers before the parallel
1042             }            // team->t.t_level will be increased inside parallel
1043         }
1044         if( level < __kmp_hot_teams_max_level ) {
1045             if( hot_teams[level].hot_team ) {
1046                 // hot team has already been allocated for given level
1047                 KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
1048                 use_hot_team = 1; // the team is ready to use
1049             } else {
1050                 use_hot_team = 0; // AC: threads are not allocated yet
1051                 hot_teams[level].hot_team = team; // remember new hot team
1052                 hot_teams[level].hot_team_nth = team->t.t_nproc;
1053             }
1054         } else {
1055             use_hot_team = 0;
1056         }
1057     }
1058 #else
1059     use_hot_team = team == root->r.r_hot_team;
1060 #endif
1061     if ( !use_hot_team ) {
1062 
1063         /* install the master thread */
1064         team->t.t_threads[ 0 ]    = master_th;
1065         __kmp_initialize_info( master_th, team, 0, master_gtid );
1066 
1067         /* now, install the worker threads */
1068         for ( i=1 ;  i < team->t.t_nproc ; i++ ) {
1069 
1070             /* fork or reallocate a new thread and install it in team */
1071             kmp_info_t *thr = __kmp_allocate_thread( root, team, i );
1072             team->t.t_threads[ i ] = thr;
1073             KMP_DEBUG_ASSERT( thr );
1074             KMP_DEBUG_ASSERT( thr->th.th_team == team );
1075             /* align team and thread arrived states */
1076             KA_TRACE( 20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived T#%d(%d:%d) join =%llu, plain=%llu\n",
1077                             __kmp_gtid_from_tid( 0, team ), team->t.t_id, 0,
1078                             __kmp_gtid_from_tid( i, team ), team->t.t_id, i,
1079                             team->t.t_bar[ bs_forkjoin_barrier ].b_arrived,
1080                             team->t.t_bar[ bs_plain_barrier ].b_arrived ) );
1081 #if OMP_40_ENABLED
1082             thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1083             thr->th.th_teams_level     = master_th->th.th_teams_level;
1084             thr->th.th_teams_size      = master_th->th.th_teams_size;
1085 #endif
1086             { // Initialize threads' barrier data.
1087                 int b;
1088                 kmp_balign_t * balign = team->t.t_threads[ i ]->th.th_bar;
1089                 for ( b = 0; b < bs_last_barrier; ++ b ) {
1090                     balign[ b ].bb.b_arrived        = team->t.t_bar[ b ].b_arrived;
1091                     KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1092 #if USE_DEBUGGER
1093                     balign[ b ].bb.b_worker_arrived = team->t.t_bar[ b ].b_team_arrived;
1094 #endif
1095                 }; // for b
1096             }
1097         }
1098 
1099 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1100         __kmp_partition_places( team );
1101 #endif
1102 
1103     }
1104 
1105     KMP_MB();
1106 }
1107 
1108 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1109 //
// Propagate any changes to the floating point control registers out to the team.
1111 // We try to avoid unnecessary writes to the relevant cache line in the team structure,
1112 // so we don't make changes unless they are needed.
1113 //
1114 inline static void
1115 propagateFPControl(kmp_team_t * team)
1116 {
1117     if ( __kmp_inherit_fp_control ) {
1118         kmp_int16 x87_fpu_control_word;
1119         kmp_uint32 mxcsr;
1120 
1121         // Get master values of FPU control flags (both X87 and vector)
1122         __kmp_store_x87_fpu_control_word( &x87_fpu_control_word );
1123         __kmp_store_mxcsr( &mxcsr );
1124         mxcsr &= KMP_X86_MXCSR_MASK;
1125 
1126         // There is no point looking at t_fp_control_saved here.
1127         // If it is TRUE, we still have to update the values if they are different from those we now have.
1128         // If it is FALSE we didn't save anything yet, but our objective is the same. We have to ensure
1129         // that the values in the team are the same as those we have.
1130         // So, this code achieves what we need whether or not t_fp_control_saved is true.
1131         // By checking whether the value needs updating we avoid unnecessary writes that would put the
1132         // cache-line into a written state, causing all threads in the team to have to read it again.
1133         KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1134         KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1135         // Although we don't use this value, other code in the runtime wants to know whether it should restore them.
1136         // So we must ensure it is correct.
1137         KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1138     }
1139     else {
1140         // Similarly here. Don't write to this cache-line in the team structure unless we have to.
1141         KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1142     }
1143 }
1144 
1145 // Do the opposite, setting the hardware registers to the updated values from the team.
1146 inline static void
1147 updateHWFPControl(kmp_team_t * team)
1148 {
1149     if ( __kmp_inherit_fp_control && team->t.t_fp_control_saved ) {
1150         //
        // Only reset the fp control regs if they have been changed in the team
        // during the parallel region that we are exiting.
1153         //
1154         kmp_int16 x87_fpu_control_word;
1155         kmp_uint32 mxcsr;
1156         __kmp_store_x87_fpu_control_word( &x87_fpu_control_word );
1157         __kmp_store_mxcsr( &mxcsr );
1158         mxcsr &= KMP_X86_MXCSR_MASK;
1159 
1160         if ( team->t.t_x87_fpu_control_word != x87_fpu_control_word ) {
1161             __kmp_clear_x87_fpu_status_word();
1162             __kmp_load_x87_fpu_control_word( &team->t.t_x87_fpu_control_word );
1163         }
1164 
1165         if ( team->t.t_mxcsr != mxcsr ) {
1166             __kmp_load_mxcsr( &team->t.t_mxcsr );
1167         }
1168     }
1169 }
1170 #else
1171 # define propagateFPControl(x) ((void)0)
1172 # define updateHWFPControl(x)  ((void)0)
1173 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1174 
1175 static void
1176 __kmp_alloc_argv_entries( int argc, kmp_team_t *team, int realloc ); // forward declaration
1177 
1178 /*
 * Run a parallel region that has been serialized, so it runs only in a team of the single master thread.
1180  */
1181 void
1182 __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid)
1183 {
1184     kmp_info_t *this_thr;
1185     kmp_team_t *serial_team;
1186 
1187     KC_TRACE( 10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid ) );
1188 
1189     /* Skip all this code for autopar serialized loops since it results in
1190        unacceptable overhead */
1191     if( loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR ) )
1192         return;
1193 
1194     if( ! TCR_4( __kmp_init_parallel ) )
1195         __kmp_parallel_initialize();
1196 
1197     this_thr     = __kmp_threads[ global_tid ];
1198     serial_team  = this_thr->th.th_serial_team;
1199 
1200     /* utilize the serialized team held by this thread */
1201     KMP_DEBUG_ASSERT( serial_team );
1202     KMP_MB();
1203 
1204     if ( __kmp_tasking_mode != tskm_immediate_exec ) {
1205         KMP_DEBUG_ASSERT(this_thr->th.th_task_team == this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1206         KMP_DEBUG_ASSERT( serial_team->t.t_task_team[this_thr->th.th_task_state] == NULL );
1207         KA_TRACE( 20, ( "__kmpc_serialized_parallel: T#%d pushing task_team %p / team %p, new task_team = NULL\n",
1208                         global_tid, this_thr->th.th_task_team, this_thr->th.th_team ) );
1209         this_thr->th.th_task_team = NULL;
1210     }
1211 
1212 #if OMP_40_ENABLED
1213     kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1214     if ( this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false ) {
1215         proc_bind = proc_bind_false;
1216     }
1217     else if ( proc_bind == proc_bind_default ) {
1218         //
1219         // No proc_bind clause was specified, so use the current value
1220         // of proc-bind-var for this parallel region.
1221         //
1222         proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1223     }
1224     //
1225     // Reset for next parallel region
1226     //
1227     this_thr->th.th_set_proc_bind = proc_bind_default;
1228 #endif /* OMP_40_ENABLED */
1229 
1230     if( this_thr->th.th_team != serial_team ) {
1231         // Nested level will be an index in the nested nthreads array
1232         int level = this_thr->th.th_team->t.t_level;
1233 
1234         if( serial_team->t.t_serialized ) {
            /* this serial team was already used
             * TODO: increase performance by making these locks more specific */
1237             kmp_team_t *new_team;
1238 
1239             __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
1240 
1241 #if OMPT_SUPPORT
1242             ompt_parallel_id_t ompt_parallel_id = __ompt_parallel_id_new(global_tid);
1243 #endif
1244 
1245             new_team = __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1246 #if OMPT_SUPPORT
1247                                            ompt_parallel_id,
1248 #endif
1249 #if OMP_40_ENABLED
1250                                            proc_bind,
1251 #endif
1252                                            & this_thr->th.th_current_task->td_icvs,
1253                                            0 USE_NESTED_HOT_ARG(NULL) );
1254             __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
1255             KMP_ASSERT( new_team );
1256 
1257             /* setup new serialized team and install it */
1258             new_team->t.t_threads[0] = this_thr;
1259             new_team->t.t_parent = this_thr->th.th_team;
1260             serial_team = new_team;
1261             this_thr->th.th_serial_team = serial_team;
1262 
1263             KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1264                             global_tid, serial_team ) );
1265 
1266 
1267             /* TODO the above breaks the requirement that if we run out of
1268              * resources, then we can still guarantee that serialized teams
1269              * are ok, since we may need to allocate a new one */
1270         } else {
1271             KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1272                             global_tid, serial_team ) );
1273         }
1274 
1275         /* we have to initialize this serial team */
1276         KMP_DEBUG_ASSERT( serial_team->t.t_threads );
1277         KMP_DEBUG_ASSERT( serial_team->t.t_threads[0] == this_thr );
1278         KMP_DEBUG_ASSERT( this_thr->th.th_team != serial_team );
1279         serial_team->t.t_ident         = loc;
1280         serial_team->t.t_serialized    = 1;
1281         serial_team->t.t_nproc         = 1;
1282         serial_team->t.t_parent        = this_thr->th.th_team;
1283         serial_team->t.t_sched         = this_thr->th.th_team->t.t_sched;
1284         this_thr->th.th_team           = serial_team;
1285         serial_team->t.t_master_tid    = this_thr->th.th_info.ds.ds_tid;
1286 
        KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d curtask=%p\n",
1288                         global_tid, this_thr->th.th_current_task ) );
1289         KMP_ASSERT( this_thr->th.th_current_task->td_flags.executing == 1 );
1290         this_thr->th.th_current_task->td_flags.executing = 0;
1291 
1292         __kmp_push_current_task_to_thread( this_thr, serial_team, 0 );
1293 
1294         /* TODO: GEH: do the ICVs work for nested serialized teams?  Don't we need an implicit task for
1295            each serialized task represented by team->t.t_serialized? */
1296         copy_icvs(
1297                   & this_thr->th.th_current_task->td_icvs,
1298                   & this_thr->th.th_current_task->td_parent->td_icvs );
1299 
1300         // Thread value exists in the nested nthreads array for the next nested level
1301         if ( __kmp_nested_nth.used && ( level + 1 < __kmp_nested_nth.used ) ) {
1302             this_thr->th.th_current_task->td_icvs.nproc = __kmp_nested_nth.nth[ level + 1 ];
1303         }
1304 
1305 #if OMP_40_ENABLED
1306         if ( __kmp_nested_proc_bind.used && ( level + 1 < __kmp_nested_proc_bind.used ) ) {
1307             this_thr->th.th_current_task->td_icvs.proc_bind
1308                 = __kmp_nested_proc_bind.bind_types[ level + 1 ];
1309         }
1310 #endif /* OMP_40_ENABLED */
1311 
1312 #if USE_DEBUGGER
1313         serial_team->t.t_pkfn = (microtask_t)( ~0 ); // For the debugger.
1314 #endif
1315         this_thr->th.th_info.ds.ds_tid = 0;
1316 
1317         /* set thread cache values */
1318         this_thr->th.th_team_nproc     = 1;
1319         this_thr->th.th_team_master    = this_thr;
1320         this_thr->th.th_team_serialized = 1;
1321 
1322         serial_team->t.t_level        = serial_team->t.t_parent->t.t_level + 1;
1323         serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1324 
1325         propagateFPControl (serial_team);
1326 
1327         /* check if we need to allocate dispatch buffers stack */
1328         KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1329         if ( !serial_team->t.t_dispatch->th_disp_buffer ) {
1330             serial_team->t.t_dispatch->th_disp_buffer = (dispatch_private_info_t *)
1331                 __kmp_allocate( sizeof( dispatch_private_info_t ) );
1332         }
1333         this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1334 
1335 #if OMPT_SUPPORT
1336         ompt_parallel_id_t ompt_parallel_id = __ompt_parallel_id_new(global_tid);
1337         __ompt_team_assign_id(serial_team, ompt_parallel_id);
1338 #endif
1339 
1340         KMP_MB();
1341 
1342     } else {
1343         /* this serialized team is already being used,
1344          * that's fine, just add another nested level */
1345         KMP_DEBUG_ASSERT( this_thr->th.th_team == serial_team );
1346         KMP_DEBUG_ASSERT( serial_team->t.t_threads );
1347         KMP_DEBUG_ASSERT( serial_team->t.t_threads[0] == this_thr );
1348         ++ serial_team->t.t_serialized;
1349         this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1350 
1351         // Nested level will be an index in the nested nthreads array
1352         int level = this_thr->th.th_team->t.t_level;
1353         // Thread value exists in the nested nthreads array for the next nested level
1354         if ( __kmp_nested_nth.used && ( level + 1 < __kmp_nested_nth.used ) ) {
1355             this_thr->th.th_current_task->td_icvs.nproc = __kmp_nested_nth.nth[ level + 1 ];
1356         }
1357         serial_team->t.t_level++;
1358         KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d increasing nesting level of serial team %p to %d\n",
1359                         global_tid, serial_team, serial_team->t.t_level ) );
1360 
1361         /* allocate/push dispatch buffers stack */
1362         KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1363         {
1364             dispatch_private_info_t * disp_buffer = (dispatch_private_info_t *)
1365                 __kmp_allocate( sizeof( dispatch_private_info_t ) );
1366             disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1367             serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1368         }
1369         this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1370 
1371         KMP_MB();
1372     }
1373 
1374     if ( __kmp_env_consistency_check )
1375         __kmp_push_parallel( global_tid, NULL );
1376 
1377 }
1378 
1379 /* most of the work for a fork */
1380 /* return true if we really went parallel, false if serialized */
1381 int
1382 __kmp_fork_call(
1383     ident_t   * loc,
1384     int         gtid,
1385     enum fork_context_e  call_context, // Intel, GNU, ...
1386     kmp_int32   argc,
1387 #if OMPT_SUPPORT
1388     void       *unwrapped_task,
1389 #endif
1390     microtask_t microtask,
1391     launch_t    invoker,
1392 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1393 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1394     va_list   * ap
1395 #else
1396     va_list     ap
1397 #endif
1398     )
1399 {
1400     void          **argv;
1401     int             i;
1402     int             master_tid;
1403     int             master_this_cons;
1404     kmp_team_t     *team;
1405     kmp_team_t     *parent_team;
1406     kmp_info_t     *master_th;
1407     kmp_root_t     *root;
1408     int             nthreads;
1409     int             master_active;
1410     int             master_set_numthreads;
1411     int             level;
1412 #if OMP_40_ENABLED
1413     int             active_level;
1414     int             teams_level;
1415 #endif
1416 #if KMP_NESTED_HOT_TEAMS
1417     kmp_hot_team_ptr_t **p_hot_teams;
1418 #endif
1419     { // KMP_TIME_BLOCK
1420     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1421     KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1422 
1423     KA_TRACE( 20, ("__kmp_fork_call: enter T#%d\n", gtid ));
1424     if ( __kmp_stkpadding > 0 &&  __kmp_root[gtid] != NULL ) {
1425         /* Some systems prefer the stack for the root thread(s) to start with */
1426         /* some gap from the parent stack to prevent false sharing. */
1427         void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1428         /* These 2 lines below are so this does not get optimized out */
1429         if ( __kmp_stkpadding > KMP_MAX_STKPADDING )
1430             __kmp_stkpadding += (short)((kmp_int64)dummy);
1431     }
1432 
1433     /* initialize if needed */
1434     KMP_DEBUG_ASSERT( __kmp_init_serial ); // AC: potentially unsafe, not in sync with shutdown
1435     if( ! TCR_4(__kmp_init_parallel) )
1436         __kmp_parallel_initialize();
1437 
1438     /* setup current data */
1439     master_th     = __kmp_threads[ gtid ]; // AC: potentially unsafe, not in sync with shutdown
1440     parent_team   = master_th->th.th_team;
1441     master_tid    = master_th->th.th_info.ds.ds_tid;
1442     master_this_cons = master_th->th.th_local.this_construct;
1443     root          = master_th->th.th_root;
1444     master_active = root->r.r_active;
1445     master_set_numthreads = master_th->th.th_set_nproc;
1446 
1447 #if OMPT_SUPPORT
1448     ompt_parallel_id_t ompt_parallel_id;
1449     ompt_task_id_t ompt_task_id;
1450     ompt_frame_t *ompt_frame;
1451     ompt_task_id_t my_task_id;
1452     ompt_parallel_id_t my_parallel_id;
1453 
1454     if (ompt_enabled) {
1455         ompt_parallel_id = __ompt_parallel_id_new(gtid);
1456         ompt_task_id = __ompt_get_task_id_internal(0);
1457         ompt_frame = __ompt_get_task_frame_internal(0);
1458     }
1459 #endif
1460 
1461     // Nested level will be an index in the nested nthreads array
1462     level         = parent_team->t.t_level;
    active_level  = parent_team->t.t_active_level; // used to launch non-serial teams even if nesting is not allowed
1464 #if OMP_40_ENABLED
1465     teams_level    = master_th->th.th_teams_level; // needed to check nesting inside the teams
1466 #endif
1467 #if KMP_NESTED_HOT_TEAMS
1468     p_hot_teams   = &master_th->th.th_hot_teams;
1469     if( *p_hot_teams == NULL && __kmp_hot_teams_max_level > 0 ) {
1470         *p_hot_teams = (kmp_hot_team_ptr_t*)__kmp_allocate(
1471                 sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1472         (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
        (*p_hot_teams)[0].hot_team_nth = 1; // this is either the actual size or unused (when active_level > 0)
1474     }
1475 #endif
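    // Note: the per-root array of hot-team descriptors is allocated lazily here,
    // sized to __kmp_hot_teams_max_level entries; entry 0 describes the root's hot team.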
1476 
1477 #if OMPT_SUPPORT
1478     if (ompt_enabled &&
1479         ompt_callbacks.ompt_callback(ompt_event_parallel_begin)) {
1480         int team_size = master_set_numthreads;
1481 
1482         ompt_callbacks.ompt_callback(ompt_event_parallel_begin)(
1483             ompt_task_id, ompt_frame, ompt_parallel_id,
1484             team_size, unwrapped_task, OMPT_INVOKER(call_context));
1485     }
1486 #endif
1487 
1488     master_th->th.th_ident = loc;
1489 
1490 #if OMP_40_ENABLED
1491     if ( master_th->th.th_teams_microtask &&
1492          ap && microtask != (microtask_t)__kmp_teams_master && level == teams_level ) {
1493         // AC: This is start of parallel that is nested inside teams construct.
1494         //     The team is actual (hot), all workers are ready at the fork barrier.
1495         //     No lock needed to initialize the team a bit, then free workers.
1496         parent_team->t.t_ident = loc;
1497         __kmp_alloc_argv_entries( argc, parent_team, TRUE );
1498         parent_team->t.t_argc  = argc;
1499         argv = (void**)parent_team->t.t_argv;
1500         for( i=argc-1; i >= 0; --i )
1501 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1502 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1503             *argv++ = va_arg( *ap, void * );
1504 #else
1505             *argv++ = va_arg( ap, void * );
1506 #endif
        /* Increment our nested depth level, but do not increase the serialization count */
1508         if ( parent_team == master_th->th.th_serial_team ) {
1509             // AC: we are in serialized parallel
1510             __kmpc_serialized_parallel(loc, gtid);
1511             KMP_DEBUG_ASSERT( parent_team->t.t_serialized > 1 );
            parent_team->t.t_serialized--; // AC: need this so that enquiry functions
                                           //     work correctly; will restore at join time
1514 
1515 #if OMPT_SUPPORT
1516             void *dummy;
1517             void **exit_runtime_p;
1518 
1519             ompt_lw_taskteam_t lw_taskteam;
1520 
1521             if (ompt_enabled) {
1522                 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1523                     unwrapped_task, ompt_parallel_id);
1524                 lw_taskteam.ompt_task_info.task_id = __ompt_task_id_new(gtid);
1525                 exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_runtime_frame);
1526 
1527                 __ompt_lw_taskteam_link(&lw_taskteam, master_th);
1528 
1529 #if OMPT_TRACE
1530                 /* OMPT implicit task begin */
1531                 my_task_id = lw_taskteam.ompt_task_info.task_id;
1532                 my_parallel_id = parent_team->t.ompt_team_info.parallel_id;
1533                 if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
1534                     ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
1535                         my_parallel_id, my_task_id);
1536                 }
1537 #endif
1538 
1539                 /* OMPT state */
1540                 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1541             } else {
1542                 exit_runtime_p = &dummy;
1543             }
1544 #endif
1545 
1546             {
1547                 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1548                 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1549                 __kmp_invoke_microtask( microtask, gtid, 0, argc, parent_team->t.t_argv
1550 #if OMPT_SUPPORT
1551                                         , exit_runtime_p
1552 #endif
1553                                         );
1554             }
1555 
1556 #if OMPT_SUPPORT
1557             *exit_runtime_p = NULL;
1558             if (ompt_enabled) {
1559 #if OMPT_TRACE
1560                 lw_taskteam.ompt_task_info.frame.exit_runtime_frame = NULL;
1561 
1562                 if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
1563                     ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
1564                         ompt_parallel_id, ompt_task_id);
1565                 }
1566 
1567                 __ompt_lw_taskteam_unlink(master_th);
                // reset (clear) the task id only after unlinking the task
1569                 lw_taskteam.ompt_task_info.task_id = ompt_task_id_none;
1570 #endif
1571 
1572                 if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
1573                     ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
1574                         ompt_parallel_id, ompt_task_id,
1575                         OMPT_INVOKER(call_context));
1576                 }
1577                 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1578             }
1579 #endif
1580             return TRUE;
1581         }
1582 
1583         parent_team->t.t_pkfn  = microtask;
1584 #if OMPT_SUPPORT
1585         parent_team->t.ompt_team_info.microtask = unwrapped_task;
1586 #endif
1587         parent_team->t.t_invoke = invoker;
1588         KMP_TEST_THEN_INC32( (kmp_int32*) &root->r.r_in_parallel );
1589         parent_team->t.t_active_level ++;
1590         parent_team->t.t_level ++;
1591 
1592         /* Change number of threads in the team if requested */
1593         if ( master_set_numthreads ) {   // The parallel has num_threads clause
1594             if ( master_set_numthreads < master_th->th.th_teams_size.nth ) {
1595                 // AC: only can reduce the number of threads dynamically, cannot increase
1596                 kmp_info_t **other_threads = parent_team->t.t_threads;
1597                 parent_team->t.t_nproc = master_set_numthreads;
1598                 for ( i = 0; i < master_set_numthreads; ++i ) {
1599                     other_threads[i]->th.th_team_nproc = master_set_numthreads;
1600                 }
1601                 // Keep extra threads hot in the team for possible next parallels
1602             }
1603             master_th->th.th_set_nproc = 0;
1604         }
1605 
#if USE_DEBUGGER
        if ( __kmp_debugging ) {    // Let debugger override number of threads.
            int nth = __kmp_omp_num_threads( loc );
            if ( nth > 0 ) {        // 0 means debugger does not want to change number of threads.
                master_set_numthreads = nth;
            } // if
        } // if
#endif
1614 
1615         KF_TRACE( 10, ( "__kmp_fork_call: before internal fork: root=%p, team=%p, master_th=%p, gtid=%d\n", root, parent_team, master_th, gtid ) );
1616         __kmp_internal_fork( loc, gtid, parent_team );
1617         KF_TRACE( 10, ( "__kmp_fork_call: after internal fork: root=%p, team=%p, master_th=%p, gtid=%d\n", root, parent_team, master_th, gtid ) );
1618 
1619         /* Invoke microtask for MASTER thread */
1620         KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n",
1621                     gtid, parent_team->t.t_id, parent_team->t.t_pkfn ) );
1622 
1623         {
1624             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1625             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1626             if (! parent_team->t.t_invoke( gtid )) {
1627                 KMP_ASSERT2( 0, "cannot invoke microtask for MASTER thread" );
1628             }
1629         }
1630         KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n",
1631             gtid, parent_team->t.t_id, parent_team->t.t_pkfn ) );
1632         KMP_MB();       /* Flush all pending memory write invalidates.  */
1633 
1634         KA_TRACE( 20, ("__kmp_fork_call: parallel exit T#%d\n", gtid ));
1635 
1636         return TRUE;
1637     } // Parallel closely nested in teams construct
1638 #endif /* OMP_40_ENABLED */
1639 
1640 #if KMP_DEBUG
1641     if ( __kmp_tasking_mode != tskm_immediate_exec ) {
1642         KMP_DEBUG_ASSERT(master_th->th.th_task_team == parent_team->t.t_task_team[master_th->th.th_task_state]);
1643     }
1644 #endif
1645 
1646     if ( parent_team->t.t_active_level >= master_th->th.th_current_task->td_icvs.max_active_levels ) {
1647         nthreads = 1;
1648     } else {
1649 #if OMP_40_ENABLED
1650         int enter_teams = ((ap==NULL && active_level==0)||(ap && teams_level>0 && teams_level==level));
1651 #endif
1652         nthreads = master_set_numthreads ?
1653             master_set_numthreads : get__nproc_2( parent_team, master_tid ); // TODO: get nproc directly from current task
1654 
        // Check whether we need to take the forkjoin lock (not needed for a serialized
        // parallel outside of a teams construct). This code was moved here from
        // __kmp_reserve_threads() to speed up nested serialized parallels.
1657         if (nthreads > 1) {
1658             if ( ( !get__nested(master_th) && (root->r.r_in_parallel
1659 #if OMP_40_ENABLED
1660                 && !enter_teams
1661 #endif /* OMP_40_ENABLED */
1662             ) ) || ( __kmp_library == library_serial ) ) {
1663                 KC_TRACE( 10, ( "__kmp_fork_call: T#%d serializing team; requested %d threads\n",
1664                                 gtid, nthreads ));
1665                 nthreads = 1;
1666             }
1667         }
1668         if ( nthreads > 1 ) {
1669             /* determine how many new threads we can use */
1670             __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
1671 
1672             nthreads = __kmp_reserve_threads(root, parent_team, master_tid, nthreads
1673 #if OMP_40_ENABLED
/* AC: If we execute teams from a parallel region (on the host), then teams should be created,
   but each can only have 1 thread if nesting is disabled. If teams is called from a serial region,
   then teams and their threads should be created regardless of the nesting setting. */
1677                                          , enter_teams
1678 #endif /* OMP_40_ENABLED */
1679                                          );
1680             if ( nthreads == 1 ) {
1681                 // Free lock for single thread execution here;
1682                 // for multi-thread execution it will be freed later
1683                 // after team of threads created and initialized
1684                 __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
1685             }
1686         }
1687     }
1688     KMP_DEBUG_ASSERT( nthreads > 0 );
1689 
1690     /* If we temporarily changed the set number of threads then restore it now */
1691     master_th->th.th_set_nproc = 0;
1692 
1693     /* create a serialized parallel region? */
1694     if ( nthreads == 1 ) {
1695         /* josh todo: hypothetical question: what do we do for OS X*? */
1696 #if KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1697         void *   args[ argc ];
1698 #else
1699         void * * args = (void**) KMP_ALLOCA( argc * sizeof( void * ) );
1700 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) */
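        // The outgoing argument array lives on the stack: a variable-length array
        // (compiler extension) on the Linux targets listed above, KMP_ALLOCA otherwise.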
1701 
1702         KA_TRACE( 20, ("__kmp_fork_call: T#%d serializing parallel region\n", gtid ));
1703 
1704         __kmpc_serialized_parallel(loc, gtid);
1705 
1706         if ( call_context == fork_context_intel ) {
1707             /* TODO this sucks, use the compiler itself to pass args! :) */
1708             master_th->th.th_serial_team->t.t_ident = loc;
1709 #if OMP_40_ENABLED
1710             if ( !ap ) {
1711                 // revert change made in __kmpc_serialized_parallel()
1712                 master_th->th.th_serial_team->t.t_level--;
1713                 // Get args from parent team for teams construct
1714 
1715 #if OMPT_SUPPORT
1716                 void *dummy;
1717                 void **exit_runtime_p;
1718 
1719                 ompt_lw_taskteam_t lw_taskteam;
1720 
1721                 if (ompt_enabled) {
1722                     __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1723                         unwrapped_task, ompt_parallel_id);
1724                     lw_taskteam.ompt_task_info.task_id = __ompt_task_id_new(gtid);
1725                     exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_runtime_frame);
1726 
1727                     __ompt_lw_taskteam_link(&lw_taskteam, master_th);
1728 
1729 #if OMPT_TRACE
1730                     my_task_id = lw_taskteam.ompt_task_info.task_id;
1731                     if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
1732                         ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
1733                             ompt_parallel_id, my_task_id);
1734                     }
1735 #endif
1736 
1737                     /* OMPT state */
1738                     master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1739                 } else {
1740                     exit_runtime_p = &dummy;
1741                 }
1742 #endif
1743 
1744                 {
1745                     KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1746                     KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1747                     __kmp_invoke_microtask( microtask, gtid, 0, argc, parent_team->t.t_argv
1748 #if OMPT_SUPPORT
1749                         , exit_runtime_p
1750 #endif
1751                     );
1752                 }
1753 
1754 #if OMPT_SUPPORT
1755                 *exit_runtime_p = NULL;
1756                 if (ompt_enabled) {
1757                     lw_taskteam.ompt_task_info.frame.exit_runtime_frame = NULL;
1758 
1759 #if OMPT_TRACE
1760                     if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
1761                         ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
1762                             ompt_parallel_id, ompt_task_id);
1763                     }
1764 #endif
1765 
1766                     __ompt_lw_taskteam_unlink(master_th);
                    // reset (clear) the task id only after unlinking the task
1768                     lw_taskteam.ompt_task_info.task_id = ompt_task_id_none;
1769 
1770                     if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
1771                         ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
1772                             ompt_parallel_id, ompt_task_id,
1773                             OMPT_INVOKER(call_context));
1774                     }
1775                     master_th->th.ompt_thread_info.state = ompt_state_overhead;
1776                 }
1777 #endif
1778             } else if ( microtask == (microtask_t)__kmp_teams_master ) {
1779                 KMP_DEBUG_ASSERT( master_th->th.th_team == master_th->th.th_serial_team );
1780                 team = master_th->th.th_team;
1781                 //team->t.t_pkfn = microtask;
1782                 team->t.t_invoke = invoker;
1783                 __kmp_alloc_argv_entries( argc, team, TRUE );
1784                 team->t.t_argc = argc;
1785                 argv = (void**) team->t.t_argv;
1786                 if ( ap ) {
1787                     for( i=argc-1; i >= 0; --i )
1788 // TODO: revert workaround for Intel(R) 64 tracker #96
1789 # if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1790                         *argv++ = va_arg( *ap, void * );
1791 # else
1792                         *argv++ = va_arg( ap, void * );
1793 # endif
1794                 } else {
1795                     for( i=0; i < argc; ++i )
1796                         // Get args from parent team for teams construct
1797                         argv[i] = parent_team->t.t_argv[i];
1798                 }
1799                 // AC: revert change made in __kmpc_serialized_parallel()
1800                 //     because initial code in teams should have level=0
1801                 team->t.t_level--;
1802                 // AC: call special invoker for outer "parallel" of the teams construct
1803                 {
1804                     KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1805                     KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1806                     invoker(gtid);
1807                 }
1808             } else {
1809 #endif /* OMP_40_ENABLED */
1810                 argv = args;
1811                 for( i=argc-1; i >= 0; --i )
1812 // TODO: revert workaround for Intel(R) 64 tracker #96
1813 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1814                     *argv++ = va_arg( *ap, void * );
1815 #else
1816                     *argv++ = va_arg( ap, void * );
1817 #endif
1818                 KMP_MB();
1819 
1820 #if OMPT_SUPPORT
1821                 void *dummy;
1822                 void **exit_runtime_p;
1823 
1824                 ompt_lw_taskteam_t lw_taskteam;
1825 
1826                 if (ompt_enabled) {
1827                     __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1828                         unwrapped_task, ompt_parallel_id);
1829                     lw_taskteam.ompt_task_info.task_id = __ompt_task_id_new(gtid);
1830                     exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_runtime_frame);
1831 
1832                     __ompt_lw_taskteam_link(&lw_taskteam, master_th);
1833 
1834 #if OMPT_TRACE
1835                     /* OMPT implicit task begin */
1836                     my_task_id = lw_taskteam.ompt_task_info.task_id;
1837                     my_parallel_id = ompt_parallel_id;
1838                     if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
1839                         ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
1840                             my_parallel_id, my_task_id);
1841                     }
1842 #endif
1843 
1844                     /* OMPT state */
1845                     master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1846                 } else {
1847                     exit_runtime_p = &dummy;
1848                 }
1849 #endif
1850 
1851                 {
1852                     KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1853                     KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1854                     __kmp_invoke_microtask( microtask, gtid, 0, argc, args
1855 #if OMPT_SUPPORT
1856                         , exit_runtime_p
1857 #endif
1858                     );
1859                 }
1860 
1861 #if OMPT_SUPPORT
1862                 *exit_runtime_p = NULL;
1863                 if (ompt_enabled) {
1864 #if OMPT_TRACE
1865                     lw_taskteam.ompt_task_info.frame.exit_runtime_frame = NULL;
1866 
1867                     if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
1868                         ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
1869                             my_parallel_id, my_task_id);
1870                     }
1871 #endif
1872 
1873                     __ompt_lw_taskteam_unlink(master_th);
                    // reset (clear) the task id only after unlinking the task
1875                     lw_taskteam.ompt_task_info.task_id = ompt_task_id_none;
1876 
1877                     if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
1878                         ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
1879                             ompt_parallel_id, ompt_task_id,
1880                             OMPT_INVOKER(call_context));
1881                     }
1882                     master_th->th.ompt_thread_info.state = ompt_state_overhead;
1883                 }
1884 #endif
1885 #if OMP_40_ENABLED
1886             }
1887 #endif /* OMP_40_ENABLED */
1888         }
1889         else if ( call_context == fork_context_gnu ) {
1890 #if OMPT_SUPPORT
1891             ompt_lw_taskteam_t *lwt = (ompt_lw_taskteam_t *)
1892                 __kmp_allocate(sizeof(ompt_lw_taskteam_t));
1893             __ompt_lw_taskteam_init(lwt, master_th, gtid,
1894                 unwrapped_task, ompt_parallel_id);
1895 
1896             lwt->ompt_task_info.task_id = __ompt_task_id_new(gtid);
1897             lwt->ompt_task_info.frame.exit_runtime_frame = NULL;
1898             __ompt_lw_taskteam_link(lwt, master_th);
1899 #endif
1900 
1901             // we were called from GNU native code
1902             KA_TRACE( 20, ("__kmp_fork_call: T#%d serial exit\n", gtid ));
1903             return FALSE;
1904         }
1905         else {
1906             KMP_ASSERT2( call_context < fork_context_last, "__kmp_fork_call: unknown fork_context parameter" );
1907         }
1908 
1909 
1910         KA_TRACE( 20, ("__kmp_fork_call: T#%d serial exit\n", gtid ));
1911         KMP_MB();
1912         return FALSE;
1913     }
1914 
1915     // GEH: only modify the executing flag in the case when not serialized
1916     //      serialized case is handled in kmpc_serialized_parallel
1917     KF_TRACE( 10, ( "__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, curtask=%p, curtask_max_aclevel=%d\n",
1918                   parent_team->t.t_active_level, master_th, master_th->th.th_current_task,
1919                   master_th->th.th_current_task->td_icvs.max_active_levels ) );
1920     // TODO: GEH - cannot do this assertion because root thread not set up as executing
1921     // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1922     master_th->th.th_current_task->td_flags.executing = 0;
1923 
1924 #if OMP_40_ENABLED
1925     if ( !master_th->th.th_teams_microtask || level > teams_level )
1926 #endif /* OMP_40_ENABLED */
1927     {
1928         /* Increment our nested depth level */
1929         KMP_TEST_THEN_INC32( (kmp_int32*) &root->r.r_in_parallel );
1930     }
1931 
1932     // See if we need to make a copy of the ICVs.
1933     int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1934     if ((level+1 < __kmp_nested_nth.used) && (__kmp_nested_nth.nth[level+1] != nthreads_icv)) {
1935         nthreads_icv = __kmp_nested_nth.nth[level+1];
1936     }
1937     else {
1938         nthreads_icv = 0;  // don't update
1939     }
1940 
1941 #if OMP_40_ENABLED
1942     // Figure out the proc_bind_policy for the new team.
1943     kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1944     kmp_proc_bind_t proc_bind_icv = proc_bind_default; // proc_bind_default means don't update
1945     if ( master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false ) {
1946         proc_bind = proc_bind_false;
1947     }
1948     else {
1949         if (proc_bind == proc_bind_default) {
1950             // No proc_bind clause specified; use current proc-bind-var for this parallel region
1951             proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1952         }
1953         /* else: The proc_bind policy was specified explicitly on parallel clause. This
1954            overrides proc-bind-var for this parallel region, but does not change proc-bind-var. */
1955         // Figure the value of proc-bind-var for the child threads.
1956         if ((level+1 < __kmp_nested_proc_bind.used)
1957             && (__kmp_nested_proc_bind.bind_types[level+1] != master_th->th.th_current_task->td_icvs.proc_bind)) {
1958             proc_bind_icv = __kmp_nested_proc_bind.bind_types[level+1];
1959         }
1960     }
1961 
1962     // Reset for next parallel region
1963     master_th->th.th_set_proc_bind = proc_bind_default;
1964 #endif /* OMP_40_ENABLED */
1965 
1966     if ((nthreads_icv > 0)
1967 #if OMP_40_ENABLED
1968         || (proc_bind_icv != proc_bind_default)
1969 #endif /* OMP_40_ENABLED */
1970         ) {
1971         kmp_internal_control_t new_icvs;
1972         copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
1973         new_icvs.next = NULL;
1974         if (nthreads_icv > 0) {
1975             new_icvs.nproc = nthreads_icv;
1976         }
1977 
1978 #if OMP_40_ENABLED
1979         if (proc_bind_icv != proc_bind_default) {
1980             new_icvs.proc_bind = proc_bind_icv;
1981         }
1982 #endif /* OMP_40_ENABLED */
1983 
1984         /* allocate a new parallel team */
1985         KF_TRACE( 10, ( "__kmp_fork_call: before __kmp_allocate_team\n" ) );
1986         team = __kmp_allocate_team(root, nthreads, nthreads,
1987 #if OMPT_SUPPORT
1988                                    ompt_parallel_id,
1989 #endif
1990 #if OMP_40_ENABLED
1991                                    proc_bind,
1992 #endif
1993                                    &new_icvs, argc USE_NESTED_HOT_ARG(master_th) );
1994     } else {
1995         /* allocate a new parallel team */
1996         KF_TRACE( 10, ( "__kmp_fork_call: before __kmp_allocate_team\n" ) );
1997         team = __kmp_allocate_team(root, nthreads, nthreads,
1998 #if OMPT_SUPPORT
1999                                    ompt_parallel_id,
2000 #endif
2001 #if OMP_40_ENABLED
2002                                    proc_bind,
2003 #endif
2004                                    &master_th->th.th_current_task->td_icvs, argc
2005                                    USE_NESTED_HOT_ARG(master_th) );
2006     }
2007     KF_TRACE( 10, ( "__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team ) );
2008 
2009     /* setup the new team */
2010     KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2011     KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2012     KMP_CHECK_UPDATE(team->t.t_ident, loc);
2013     KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2014     KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2015 #if OMPT_SUPPORT
2016     KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.microtask, unwrapped_task);
2017 #endif
2018     KMP_CHECK_UPDATE(team->t.t_invoke, invoker);  /* TODO move this to root, maybe */
2019     // TODO: parent_team->t.t_level == INT_MAX ???
2020 #if OMP_40_ENABLED
2021     if ( !master_th->th.th_teams_microtask || level > teams_level ) {
2022 #endif /* OMP_40_ENABLED */
2023         int new_level = parent_team->t.t_level + 1;
2024         KMP_CHECK_UPDATE(team->t.t_level, new_level);
2025         new_level = parent_team->t.t_active_level + 1;
2026         KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2027 #if OMP_40_ENABLED
2028     } else {
2029         // AC: Do not increase parallel level at start of the teams construct
2030         int new_level = parent_team->t.t_level;
2031         KMP_CHECK_UPDATE(team->t.t_level, new_level);
2032         new_level = parent_team->t.t_active_level;
2033         KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2034     }
2035 #endif /* OMP_40_ENABLED */
2036     kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2037     if (team->t.t_sched.r_sched_type != new_sched.r_sched_type || team->t.t_sched.chunk != new_sched.chunk)
2038         team->t.t_sched = new_sched; // set master's schedule as new run-time schedule
2039 
2040 #if OMP_40_ENABLED
2041     KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2042 #endif
2043 
2044     // Update the floating point rounding in the team if required.
2045     propagateFPControl(team);
2046 
2047     if ( __kmp_tasking_mode != tskm_immediate_exec ) {
        // Set master's task team to the team's task team. Unless this is a hot team, it should be NULL.
2049 #if 0
2050         // Patch out an assertion that trips while the runtime seems to operate correctly.
2051         // Avoiding the preconditions that cause the assertion to trip has been promised as a forthcoming patch.
2052         KMP_DEBUG_ASSERT(master_th->th.th_task_team == parent_team->t.t_task_team[master_th->th.th_task_state]);
2053 #endif
2054         KA_TRACE( 20, ( "__kmp_fork_call: Master T#%d pushing task_team %p / team %p, new task_team %p / team %p\n",
2055                       __kmp_gtid_from_thread( master_th ), master_th->th.th_task_team,
2056                       parent_team, team->t.t_task_team[master_th->th.th_task_state], team ) );
2057 
2058         if ( active_level || master_th->th.th_task_team ) {
2059             // Take a memo of master's task_state
2060             KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2061             if (master_th->th.th_task_state_top >= master_th->th.th_task_state_stack_sz) { // increase size
2062                 kmp_uint32 new_size = 2*master_th->th.th_task_state_stack_sz;
2063                 kmp_uint8 *old_stack, *new_stack;
2064                 kmp_uint32 i;
2065                 new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2066                 for (i=0; i<master_th->th.th_task_state_stack_sz; ++i) {
2067                     new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2068                 }
2069                 for (i=master_th->th.th_task_state_stack_sz; i<new_size; ++i) { // zero-init rest of stack
2070                     new_stack[i] = 0;
2071                 }
2072                 old_stack = master_th->th.th_task_state_memo_stack;
2073                 master_th->th.th_task_state_memo_stack = new_stack;
2074                 master_th->th.th_task_state_stack_sz = new_size;
2075                 __kmp_free(old_stack);
2076             }
2077             // Store master's task_state on stack
2078             master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] = master_th->th.th_task_state;
2079             master_th->th.th_task_state_top++;
2080 #if KMP_NESTED_HOT_TEAMS
2081             if (team == master_th->th.th_hot_teams[active_level].hot_team) { // Restore master's nested state if nested hot team
2082                 master_th->th.th_task_state = master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top];
2083             }
2084             else {
2085 #endif
2086                 master_th->th.th_task_state = 0;
2087 #if KMP_NESTED_HOT_TEAMS
2088             }
2089 #endif
2090         }
2091 #if !KMP_NESTED_HOT_TEAMS
2092         KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) || (team == root->r.r_hot_team));
2093 #endif
2094     }
2095 
2096     KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2097                 gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id, team->t.t_nproc ));
2098     KMP_DEBUG_ASSERT( team != root->r.r_hot_team ||
2099                       ( team->t.t_master_tid == 0 &&
2100                         ( team->t.t_parent == root->r.r_root_team || team->t.t_parent->t.t_serialized ) ));
2101     KMP_MB();
2102 
2103     /* now, setup the arguments */
2104     argv = (void**)team->t.t_argv;
2105 #if OMP_40_ENABLED
2106     if ( ap ) {
2107 #endif /* OMP_40_ENABLED */
2108         for ( i=argc-1; i >= 0; --i ) {
2109 // TODO: revert workaround for Intel(R) 64 tracker #96
2110 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
2111             void *new_argv = va_arg(*ap, void *);
2112 #else
2113             void *new_argv = va_arg(ap, void *);
2114 #endif
2115             KMP_CHECK_UPDATE(*argv, new_argv);
2116             argv++;
2117         }
2118 #if OMP_40_ENABLED
2119     } else {
2120         for ( i=0; i < argc; ++i ) {
2121             // Get args from parent team for teams construct
2122             KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2123         }
2124     }
2125 #endif /* OMP_40_ENABLED */
2126 
2127     /* now actually fork the threads */
2128     KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2129     if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2130         root->r.r_active = TRUE;
2131 
2132     __kmp_fork_team_threads( root, team, master_th, gtid );
2133     __kmp_setup_icv_copy( team, nthreads, &master_th->th.th_current_task->td_icvs, loc );
2134 
2135 #if OMPT_SUPPORT
2136     master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2137 #endif
2138 
2139     __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
2140 
2141 #if USE_ITT_BUILD
2142     if ( team->t.t_active_level == 1 // only report frames at level 1
2143 # if OMP_40_ENABLED
2144         && !master_th->th.th_teams_microtask // not in teams construct
2145 # endif /* OMP_40_ENABLED */
2146     ) {
2147 #if USE_ITT_NOTIFY
2148         if ( ( __itt_frame_submit_v3_ptr || KMP_ITT_DEBUG ) &&
2149              ( __kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 1 ) )
2150         {
2151             kmp_uint64 tmp_time = 0;
2152             if ( __itt_get_timestamp_ptr )
2153                 tmp_time = __itt_get_timestamp();
2154             // Internal fork - report frame begin
2155             master_th->th.th_frame_time  = tmp_time;
2156             if ( __kmp_forkjoin_frames_mode == 3 )
2157                 team->t.t_region_time = tmp_time;
2158         } else // only one notification scheme (either "submit" or "forking/joined", not both)
2159 #endif /* USE_ITT_NOTIFY */
2160         if ( ( __itt_frame_begin_v3_ptr || KMP_ITT_DEBUG ) &&
2161              __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode )
2162         { // Mark start of "parallel" region for VTune.
2163             __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2164         }
2165     }
2166 #endif /* USE_ITT_BUILD */
2167 
2168     /* now go on and do the work */
2169     KMP_DEBUG_ASSERT( team == __kmp_threads[gtid]->th.th_team );
2170     KMP_MB();
2171     KF_TRACE(10, ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2172                   root, team, master_th, gtid));
2173 
2174 #if USE_ITT_BUILD
2175     if ( __itt_stack_caller_create_ptr ) {
2176         team->t.t_stack_id = __kmp_itt_stack_caller_create(); // create new stack stitching id before entering fork barrier
2177     }
2178 #endif /* USE_ITT_BUILD */
2179 
2180 #if OMP_40_ENABLED
2181     if ( ap )   // AC: skip __kmp_internal_fork at teams construct, let only master threads execute
2182 #endif /* OMP_40_ENABLED */
2183     {
2184         __kmp_internal_fork( loc, gtid, team );
2185         KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, master_th=%p, gtid=%d\n",
2186                       root, team, master_th, gtid));
2187     }
2188 
2189     if (call_context == fork_context_gnu) {
2190         KA_TRACE( 20, ("__kmp_fork_call: parallel exit T#%d\n", gtid ));
2191         return TRUE;
2192     }
2193 
2194     /* Invoke microtask for MASTER thread */
2195     KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n",
2196                 gtid, team->t.t_id, team->t.t_pkfn ) );
2197     }  // END of timer KMP_fork_call block
2198 
2199     {
2200         KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
2201         KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
2202         if (! team->t.t_invoke( gtid )) {
2203             KMP_ASSERT2( 0, "cannot invoke microtask for MASTER thread" );
2204         }
2205     }
2206     KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n",
2207         gtid, team->t.t_id, team->t.t_pkfn ) );
2208     KMP_MB();       /* Flush all pending memory write invalidates.  */
2209 
2210     KA_TRACE( 20, ("__kmp_fork_call: parallel exit T#%d\n", gtid ));
2211 
2212 #if OMPT_SUPPORT
2213     if (ompt_enabled) {
2214         master_th->th.ompt_thread_info.state = ompt_state_overhead;
2215     }
2216 #endif
2217 
2218     return TRUE;
2219 }
2220 
2221 #if OMPT_SUPPORT
2222 static inline void
2223 __kmp_join_restore_state(
2224     kmp_info_t *thread,
2225     kmp_team_t *team)
2226 {
2227     // restore state outside the region
2228     thread->th.ompt_thread_info.state = ((team->t.t_serialized) ?
2229         ompt_state_work_serial : ompt_state_work_parallel);
2230 }
2231 
2232 static inline void
2233 __kmp_join_ompt(
2234     kmp_info_t *thread,
2235     kmp_team_t *team,
2236     ompt_parallel_id_t parallel_id,
2237     fork_context_e fork_context)
2238 {
2239     ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
2240     if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
2241         ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
2242             parallel_id, task_info->task_id, OMPT_INVOKER(fork_context));
2243     }
2244 
2245     task_info->frame.reenter_runtime_frame = NULL;
2246     __kmp_join_restore_state(thread,team);
2247 }
2248 #endif
2249 
2250 void
2251 __kmp_join_call(ident_t *loc, int gtid
2252 #if OMPT_SUPPORT
2253                , enum fork_context_e fork_context
2254 #endif
2255 #if OMP_40_ENABLED
2256                , int exit_teams
2257 #endif /* OMP_40_ENABLED */
2258 )
2259 {
2260     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2261     kmp_team_t     *team;
2262     kmp_team_t     *parent_team;
2263     kmp_info_t     *master_th;
2264     kmp_root_t     *root;
2265     int             master_active;
2266     int             i;
2267 
2268     KA_TRACE( 20, ("__kmp_join_call: enter T#%d\n", gtid ));
2269 
2270     /* setup current data */
2271     master_th     = __kmp_threads[ gtid ];
2272     root          = master_th->th.th_root;
2273     team          = master_th->th.th_team;
2274     parent_team   = team->t.t_parent;
2275 
2276     master_th->th.th_ident = loc;
2277 
2278 #if OMPT_SUPPORT
2279     if (ompt_enabled) {
2280         master_th->th.ompt_thread_info.state = ompt_state_overhead;
2281     }
2282 #endif
2283 
2284 #if KMP_DEBUG
2285     if ( __kmp_tasking_mode != tskm_immediate_exec ) {
2286         KA_TRACE( 20, ( "__kmp_join_call: T#%d, old team = %p old task_team = %p, th_task_team = %p\n",
2287                          __kmp_gtid_from_thread( master_th ), team,
2288                          team->t.t_task_team[master_th->th.th_task_state], master_th->th.th_task_team) );
2289         KMP_DEBUG_ASSERT( master_th->th.th_task_team == team->t.t_task_team[master_th->th.th_task_state] );
2290     }
2291 #endif
2292 
2293     if( team->t.t_serialized ) {
2294 #if OMP_40_ENABLED
2295         if ( master_th->th.th_teams_microtask ) {
2296             // We are in teams construct
2297             int level = team->t.t_level;
2298             int tlevel = master_th->th.th_teams_level;
2299             if ( level == tlevel ) {
2300                 // AC: we haven't incremented it earlier at start of teams construct,
2301                 //     so do it here - at the end of teams construct
2302                 team->t.t_level++;
2303             } else if ( level == tlevel + 1 ) {
2304                 // AC: we are exiting parallel inside teams, need to increment serialization
2305                 //     in order to restore it in the next call to __kmpc_end_serialized_parallel
2306                 team->t.t_serialized++;
2307             }
2308         }
2309 #endif /* OMP_40_ENABLED */
2310         __kmpc_end_serialized_parallel( loc, gtid );
2311 
2312 #if OMPT_SUPPORT
2313         if (ompt_enabled) {
2314             __kmp_join_restore_state(master_th, parent_team);
2315         }
2316 #endif
2317 
2318         return;
2319     }
2320 
2321     master_active = team->t.t_master_active;
2322 
2323 #if OMP_40_ENABLED
2324     if (!exit_teams)
2325 #endif /* OMP_40_ENABLED */
2326     {
2327         // AC: No barrier for internal teams at exit from teams construct.
2328         //     But there is barrier for external team (league).
2329         __kmp_internal_join( loc, gtid, team );
2330     }
2331 #if OMP_40_ENABLED
2332     else {
2333         master_th->th.th_task_state = 0; // AC: no tasking in teams (out of any parallel)
2334     }
2335 #endif /* OMP_40_ENABLED */
2336 
2337     KMP_MB();
2338 
2339 #if OMPT_SUPPORT
2340     ompt_parallel_id_t parallel_id = team->t.ompt_team_info.parallel_id;
2341 #endif
2342 
2343 #if USE_ITT_BUILD
2344     if ( __itt_stack_caller_create_ptr ) {
2345         __kmp_itt_stack_caller_destroy( (__itt_caller)team->t.t_stack_id ); // destroy the stack stitching id after join barrier
2346     }
2347 
2348     // Mark end of "parallel" region for VTune.
2349     if ( team->t.t_active_level == 1
2350 # if OMP_40_ENABLED
2351         && !master_th->th.th_teams_microtask /* not in teams construct */
2352 # endif /* OMP_40_ENABLED */
2353     ) {
2354         master_th->th.th_ident = loc;
2355         // only one notification scheme (either "submit" or "forking/joined", not both)
2356         if ( ( __itt_frame_submit_v3_ptr || KMP_ITT_DEBUG ) && __kmp_forkjoin_frames_mode == 3 )
2357             __kmp_itt_frame_submit( gtid, team->t.t_region_time, master_th->th.th_frame_time,
2358                                     0, loc, master_th->th.th_team_nproc, 1 );
2359         else if ( ( __itt_frame_end_v3_ptr || KMP_ITT_DEBUG ) &&
2360             ! __kmp_forkjoin_frames_mode && __kmp_forkjoin_frames )
2361             __kmp_itt_region_joined( gtid );
2362     } // active_level == 1
2363 #endif /* USE_ITT_BUILD */
2364 
2365 #if OMP_40_ENABLED
2366     if ( master_th->th.th_teams_microtask &&
2367          !exit_teams &&
2368          team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2369          team->t.t_level == master_th->th.th_teams_level + 1 ) {
2370         // AC: We need to leave the team structure intact at the end
2371         //     of parallel inside the teams construct, so that at the next
2372         //     parallel same (hot) team works, only adjust nesting levels
2373 
2374         /* Decrement our nested depth level */
2375         team->t.t_level --;
2376         team->t.t_active_level --;
2377         KMP_TEST_THEN_DEC32( (kmp_int32*) &root->r.r_in_parallel );
2378 
2379         /* Restore number of threads in the team if needed */
2380         if ( master_th->th.th_team_nproc < master_th->th.th_teams_size.nth ) {
2381             int old_num = master_th->th.th_team_nproc;
2382             int new_num = master_th->th.th_teams_size.nth;
2383             kmp_info_t **other_threads = team->t.t_threads;
2384             team->t.t_nproc = new_num;
2385             for ( i = 0; i < old_num; ++i ) {
2386                 other_threads[i]->th.th_team_nproc = new_num;
2387             }
2388             // Adjust states of non-used threads of the team
2389             for ( i = old_num; i < new_num; ++i ) {
2390                 // Re-initialize thread's barrier data.
2391                 int b;
2392                 kmp_balign_t * balign = other_threads[i]->th.th_bar;
2393                 for ( b = 0; b < bs_last_barrier; ++ b ) {
2394                     balign[ b ].bb.b_arrived        = team->t.t_bar[ b ].b_arrived;
2395                     KMP_DEBUG_ASSERT(balign[ b ].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2396 #if USE_DEBUGGER
2397                     balign[ b ].bb.b_worker_arrived = team->t.t_bar[ b ].b_team_arrived;
2398 #endif
2399                 }
2400                 if ( __kmp_tasking_mode != tskm_immediate_exec ) {
2401                     // Synchronize thread's task state
2402                     other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2403                 }
2404             }
2405         }
2406 
2407 #if OMPT_SUPPORT
2408         if (ompt_enabled) {
2409             __kmp_join_ompt(master_th, parent_team, parallel_id, fork_context);
2410         }
2411 #endif
2412 
2413         return;
2414     }
2415 #endif /* OMP_40_ENABLED */
2416 
2417     /* do cleanup and restore the parent team */
2418     master_th->th.th_info .ds.ds_tid = team->t.t_master_tid;
2419     master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2420 
2421     master_th->th.th_dispatch =
2422                 & parent_team->t.t_dispatch[ team->t.t_master_tid ];
2423 
2424     /* jc: The following lock has instructions with REL and ACQ semantics,
2425        separating the parallel user code called in this parallel region
2426        from the serial user code called after this function returns.
2427     */
2428     __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
2429 
2430 #if OMP_40_ENABLED
2431     if ( !master_th->th.th_teams_microtask || team->t.t_level > master_th->th.th_teams_level )
2432 #endif /* OMP_40_ENABLED */
2433     {
2434         /* Decrement our nested depth level */
2435         KMP_TEST_THEN_DEC32( (kmp_int32*) &root->r.r_in_parallel );
2436     }
2437     KMP_DEBUG_ASSERT( root->r.r_in_parallel >= 0 );
2438 
2439 #if OMPT_SUPPORT && OMPT_TRACE
2440     if(ompt_enabled){
2441         ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
2442         if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
2443              ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
2444                parallel_id, task_info->task_id);
2445         }
2446         task_info->frame.exit_runtime_frame = NULL;
2447         task_info->task_id = 0;
2448     }
2449 #endif
2450 
2451     KF_TRACE( 10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n",
2452                    0, master_th, team ) );
2453     __kmp_pop_current_task_from_thread( master_th );
2454 
2455 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
2456     //
2457     // Restore master thread's partition.
2458     //
2459     master_th->th.th_first_place = team->t.t_first_place;
2460     master_th->th.th_last_place = team->t.t_last_place;
2461 #endif /* OMP_40_ENABLED */
2462 
2463     updateHWFPControl (team);
2464 
2465     if ( root->r.r_active != master_active )
2466         root->r.r_active = master_active;
2467 
2468     __kmp_free_team( root, team USE_NESTED_HOT_ARG(master_th) ); // this will free worker threads
2469 
    /* This race was fun to find. Make sure the following is in the critical
     * region, otherwise assertions may fail occasionally since the old team
     * may be reallocated and the hierarchy appears inconsistent. It is
     * actually safe to run and won't cause any bugs, but will cause those
     * assertion failures. It's only one deref & assign, so we might as well
     * put this in the critical region. */
2476     master_th->th.th_team        =   parent_team;
2477     master_th->th.th_team_nproc  =   parent_team->t.t_nproc;
2478     master_th->th.th_team_master =   parent_team->t.t_threads[0];
2479     master_th->th.th_team_serialized = parent_team->t.t_serialized;
2480 
2481     /* restore serialized team, if need be */
2482     if( parent_team->t.t_serialized &&
2483         parent_team != master_th->th.th_serial_team &&
2484         parent_team != root->r.r_root_team ) {
2485             __kmp_free_team( root, master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL) );
2486             master_th->th.th_serial_team = parent_team;
2487     }
2488 
2489     if ( __kmp_tasking_mode != tskm_immediate_exec ) {
2490         if (master_th->th.th_task_state_top > 0) { // Restore task state from memo stack
2491             KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2492             // Remember master's state if we re-use this nested hot team
2493             master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] = master_th->th.th_task_state;
2494             --master_th->th.th_task_state_top; // pop
2495             // Now restore state at this level
2496             master_th->th.th_task_state = master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top];
2497         }
2498         // Copy the task team from the parent team to the master thread
2499         master_th->th.th_task_team = parent_team->t.t_task_team[master_th->th.th_task_state];
2500         KA_TRACE( 20, ( "__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2501                         __kmp_gtid_from_thread( master_th ), master_th->th.th_task_team, parent_team ) );
2502     }
2503 
    // TODO: GEH - cannot do this assertion because root thread not set up as executing
    // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
    master_th->th.th_current_task->td_flags.executing = 1;
2507 
2508     __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
2509 
2510 #if OMPT_SUPPORT
2511     if (ompt_enabled) {
2512         __kmp_join_ompt(master_th, parent_team, parallel_id, fork_context);
2513     }
2514 #endif
2515 
2516     KMP_MB();
2517     KA_TRACE( 20, ("__kmp_join_call: exit T#%d\n", gtid ));
2518 }
2519 
2520 /* ------------------------------------------------------------------------ */
2521 /* ------------------------------------------------------------------------ */
2522 
2523 /* Check whether we should push an internal control record onto the
2524    serial team stack.  If so, do it.  */
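/* A record is pushed at most once per serialized nesting level (the stored
   serial_nesting_level is compared against t_serialized), so the previous ICV
   values for each nesting level remain available for later restoration. */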
2525 void
2526 __kmp_save_internal_controls ( kmp_info_t * thread )
2527 {
2528 
2529     if ( thread->th.th_team != thread->th.th_serial_team ) {
2530         return;
2531     }
2532     if (thread->th.th_team->t.t_serialized > 1) {
2533         int push = 0;
2534 
2535         if (thread->th.th_team->t.t_control_stack_top == NULL) {
2536             push = 1;
2537         } else {
2538             if ( thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2539                  thread->th.th_team->t.t_serialized ) {
2540                 push = 1;
2541             }
2542         }
2543         if (push) {  /* push a record on the serial team's stack */
2544             kmp_internal_control_t * control = (kmp_internal_control_t *) __kmp_allocate(sizeof(kmp_internal_control_t));
2545 
2546             copy_icvs( control, & thread->th.th_current_task->td_icvs );
2547 
2548             control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2549 
2550             control->next = thread->th.th_team->t.t_control_stack_top;
2551             thread->th.th_team->t.t_control_stack_top = control;
2552         }
2553     }
2554 }
2555 
2556 /* Changes set_nproc */
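/* Note: if the call reduces the thread count while the root is inactive, the hot
   team is trimmed immediately below rather than at the next parallel region. */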
2557 void
2558 __kmp_set_num_threads( int new_nth, int gtid )
2559 {
2560     kmp_info_t *thread;
2561     kmp_root_t *root;
2562 
2563     KF_TRACE( 10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth ));
2564     KMP_DEBUG_ASSERT( __kmp_init_serial );
2565 
2566     if (new_nth < 1)
2567         new_nth = 1;
2568     else if (new_nth > __kmp_max_nth)
2569         new_nth = __kmp_max_nth;
2570 
2571     KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2572     thread = __kmp_threads[gtid];
2573 
2574     __kmp_save_internal_controls( thread );
2575 
2576     set__nproc( thread, new_nth );
2577 
2578     //
2579     // If this omp_set_num_threads() call will cause the hot team size to be
2580     // reduced (in the absence of a num_threads clause), then reduce it now,
2581     // rather than waiting for the next parallel region.
2582     //
2583     root = thread->th.th_root;
2584     if ( __kmp_init_parallel && ( ! root->r.r_active )
2585       && ( root->r.r_hot_team->t.t_nproc > new_nth )
2586 #if KMP_NESTED_HOT_TEAMS
2587       && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2588 #endif
2589     ) {
2590         kmp_team_t *hot_team = root->r.r_hot_team;
2591         int f;
2592 
2593         __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
2594 
2595         // Release the extra threads we don't need any more.
2596         for ( f = new_nth;  f < hot_team->t.t_nproc; f++ ) {
2597             KMP_DEBUG_ASSERT( hot_team->t.t_threads[f] != NULL );
2598             if ( __kmp_tasking_mode != tskm_immediate_exec) {
2599                 // When decreasing team size, threads no longer in the team should unref task team.
2600                 hot_team->t.t_threads[f]->th.th_task_team = NULL;
2601             }
2602             __kmp_free_thread( hot_team->t.t_threads[f] );
2603             hot_team->t.t_threads[f] =  NULL;
2604         }
2605         hot_team->t.t_nproc = new_nth;
2606 #if KMP_NESTED_HOT_TEAMS
2607         if( thread->th.th_hot_teams ) {
2608             KMP_DEBUG_ASSERT( hot_team == thread->th.th_hot_teams[0].hot_team );
2609             thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2610         }
2611 #endif
2612 
2613         __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
2614 
2615         //
2616         // Update the t_nproc field in the threads that are still active.
2617         //
2618         for( f=0 ; f < new_nth; f++ ) {
2619             KMP_DEBUG_ASSERT( hot_team->t.t_threads[f] != NULL );
2620             hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2621         }
        // Special flag to indicate that the hot team size was changed by an omp_set_num_threads() call
2623         hot_team->t.t_size_changed = -1;
2624     }
2625 }
2626 
2627 /* Changes max_active_levels */
2628 void
2629 __kmp_set_max_active_levels( int gtid, int max_active_levels )
2630 {
2631     kmp_info_t *thread;
2632 
2633     KF_TRACE( 10, ( "__kmp_set_max_active_levels: new max_active_levels for thread %d = (%d)\n", gtid, max_active_levels ) );
2634     KMP_DEBUG_ASSERT( __kmp_init_serial );
2635 
2636     // validate max_active_levels
2637     if( max_active_levels < 0 ) {
2638         KMP_WARNING( ActiveLevelsNegative, max_active_levels );
2639         // We ignore this call if the user has specified a negative value.
2640         // The current setting won't be changed. The last valid setting will be used.
2641         // A warning will be issued (if warnings are allowed as controlled by the KMP_WARNINGS env var).
2642         KF_TRACE( 10, ( "__kmp_set_max_active_levels: the call is ignored: new max_active_levels for thread %d = (%d)\n", gtid, max_active_levels ) );
2643         return;
2644     }
2645     if( max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT ) {
2646         // it's OK, the max_active_levels is within the valid range: [ 0; KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2647         // We allow a zero value. (implementation defined behavior)
2648     } else {
2649         KMP_WARNING( ActiveLevelsExceedLimit, max_active_levels, KMP_MAX_ACTIVE_LEVELS_LIMIT  );
2650         max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2651         // Current upper limit is MAX_INT. (implementation defined behavior)
2652         // If the input exceeds the upper limit, we correct the input to be the upper limit. (implementation defined behavior)
        // In practice, control should never reach here while the upper limit is MAX_INT.
2654     }
2655     KF_TRACE( 10, ( "__kmp_set_max_active_levels: after validation: new max_active_levels for thread %d = (%d)\n", gtid, max_active_levels ) );
2656 
2657     thread = __kmp_threads[ gtid ];
2658 
2659     __kmp_save_internal_controls( thread );
2660 
2661     set__max_active_levels( thread, max_active_levels );
2662 
2663 }
2664 
2665 /* Gets max_active_levels */
2666 int
2667 __kmp_get_max_active_levels( int gtid )
2668 {
2669     kmp_info_t *thread;
2670 
2671     KF_TRACE( 10, ( "__kmp_get_max_active_levels: thread %d\n", gtid ) );
2672     KMP_DEBUG_ASSERT( __kmp_init_serial );
2673 
2674     thread = __kmp_threads[ gtid ];
2675     KMP_DEBUG_ASSERT( thread->th.th_current_task );
2676     KF_TRACE( 10, ( "__kmp_get_max_active_levels: thread %d, curtask=%p, curtask_maxaclevel=%d\n",
2677         gtid, thread->th.th_current_task, thread->th.th_current_task->td_icvs.max_active_levels ) );
2678     return thread->th.th_current_task->td_icvs.max_active_levels;
2679 }
2680 
2681 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2682 void
2683 __kmp_set_schedule( int gtid, kmp_sched_t kind, int chunk )
2684 {
2685     kmp_info_t *thread;
2686 //    kmp_team_t *team;
2687 
2688     KF_TRACE( 10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n", gtid, (int)kind, chunk ));
2689     KMP_DEBUG_ASSERT( __kmp_init_serial );
2690 
2691     // Check if the kind parameter is valid, correct if needed.
2692     // Valid parameters should fit in one of two intervals - standard or extended:
2693     //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2694     // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
2695     if ( kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2696        ( kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std ) )
2697     {
2698         // TODO: Hint needs attention in case we change the default schedule.
2699         __kmp_msg(
2700             kmp_ms_warning,
2701             KMP_MSG( ScheduleKindOutOfRange, kind ),
2702             KMP_HNT( DefaultScheduleKindUsed, "static, no chunk" ),
2703             __kmp_msg_null
2704         );
2705         kind = kmp_sched_default;
2706         chunk = 0;         // ignore chunk value in case of bad kind
2707     }
2708 
2709     thread = __kmp_threads[ gtid ];
2710 
2711     __kmp_save_internal_controls( thread );
2712 
2713     if ( kind < kmp_sched_upper_std ) {
2714         if ( kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK ) {
            // differentiate static chunked vs. unchunked:
            // an invalid chunk value indicates the unchunked schedule (which is the default)
2717             thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2718         } else {
2719             thread->th.th_current_task->td_icvs.sched.r_sched_type = __kmp_sch_map[ kind - kmp_sched_lower - 1 ];
2720         }
2721     } else {
2722         //    __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - kmp_sched_lower - 2 ];
2723         thread->th.th_current_task->td_icvs.sched.r_sched_type =
2724             __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - kmp_sched_lower - 2 ];
2725     }
2726     if ( kind == kmp_sched_auto ) {
2727         // ignore parameter chunk for schedule auto
2728         thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2729     } else {
2730         thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2731     }
2732 }
2733 
2734 /* Gets def_sched_var ICV values */
2735 void
2736 __kmp_get_schedule( int gtid, kmp_sched_t * kind, int * chunk )
2737 {
2738     kmp_info_t     *thread;
2739     enum sched_type th_type;
2740 
2741     KF_TRACE( 10, ("__kmp_get_schedule: thread %d\n", gtid ));
2742     KMP_DEBUG_ASSERT( __kmp_init_serial );
2743 
2744     thread = __kmp_threads[ gtid ];
2745 
2746     th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2747 
2748     switch ( th_type ) {
2749     case kmp_sch_static:
2750     case kmp_sch_static_greedy:
2751     case kmp_sch_static_balanced:
2752         *kind = kmp_sched_static;
        *chunk = 0;   // chunk was not set; indicate this via a zero value
2754         return;
2755     case kmp_sch_static_chunked:
2756         *kind = kmp_sched_static;
2757         break;
2758     case kmp_sch_dynamic_chunked:
2759         *kind = kmp_sched_dynamic;
2760         break;
2761     case kmp_sch_guided_chunked:
2762     case kmp_sch_guided_iterative_chunked:
2763     case kmp_sch_guided_analytical_chunked:
2764         *kind = kmp_sched_guided;
2765         break;
2766     case kmp_sch_auto:
2767         *kind = kmp_sched_auto;
2768         break;
2769     case kmp_sch_trapezoidal:
2770         *kind = kmp_sched_trapezoidal;
2771         break;
2772 #if KMP_STATIC_STEAL_ENABLED
2773     case kmp_sch_static_steal:
2774         *kind = kmp_sched_static_steal;
2775         break;
2776 #endif
2777     default:
2778         KMP_FATAL( UnknownSchedulingType, th_type );
2779     }
2780 
2781     *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2782 }
2783 
2784 int
2785 __kmp_get_ancestor_thread_num( int gtid, int level ) {
2786 
2787     int ii, dd;
2788     kmp_team_t *team;
2789     kmp_info_t *thr;
2790 
2791     KF_TRACE( 10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level ));
2792     KMP_DEBUG_ASSERT( __kmp_init_serial );
2793 
2794     // validate level
2795     if( level == 0 ) return 0;
2796     if( level < 0 ) return -1;
2797     thr = __kmp_threads[ gtid ];
2798     team = thr->th.th_team;
2799     ii = team->t.t_level;
2800     if( level > ii ) return -1;
2801 
2802 #if OMP_40_ENABLED
2803     if( thr->th.th_teams_microtask ) {
2804         // AC: we are in teams region where multiple nested teams have same level
2805         int tlevel = thr->th.th_teams_level; // the level of the teams construct
2806         if( level <= tlevel ) { // otherwise usual algorithm works (will not touch the teams)
2807             KMP_DEBUG_ASSERT( ii >= tlevel );
2808             // AC: As we need to pass by the teams league, we need to artificially increase ii
2809             if ( ii == tlevel ) {
2810                 ii += 2; // three teams have same level
2811             } else {
2812                 ii ++;   // two teams have same level
2813             }
2814         }
2815     }
2816 #endif
2817 
2818     if( ii == level ) return __kmp_tid_from_gtid( gtid );
2819 
2820     dd = team->t.t_serialized;
2821     level++;
2822     while( ii > level )
2823     {
2824         for( dd = team->t.t_serialized; ( dd > 0 ) && ( ii > level ); dd--, ii-- )
2825         {
2826         }
2827         if( ( team->t.t_serialized ) && ( !dd ) ) {
2828             team = team->t.t_parent;
2829             continue;
2830         }
2831         if( ii > level ) {
2832             team = team->t.t_parent;
2833             dd = team->t.t_serialized;
2834             ii--;
2835         }
2836     }
2837 
2838     return ( dd > 1 ) ? ( 0 ) : ( team->t.t_master_tid );
2839 }
2840 
2841 int
2842 __kmp_get_team_size( int gtid, int level ) {
2843 
2844     int ii, dd;
2845     kmp_team_t *team;
2846     kmp_info_t *thr;
2847 
2848     KF_TRACE( 10, ("__kmp_get_team_size: thread %d %d\n", gtid, level ));
2849     KMP_DEBUG_ASSERT( __kmp_init_serial );
2850 
2851     // validate level
2852     if( level == 0 ) return 1;
2853     if( level < 0 ) return -1;
2854     thr = __kmp_threads[ gtid ];
2855     team = thr->th.th_team;
2856     ii = team->t.t_level;
2857     if( level > ii ) return -1;
2858 
2859 #if OMP_40_ENABLED
2860     if( thr->th.th_teams_microtask ) {
2861         // AC: we are in teams region where multiple nested teams have same level
2862         int tlevel = thr->th.th_teams_level; // the level of the teams construct
2863         if( level <= tlevel ) { // otherwise usual algorithm works (will not touch the teams)
2864             KMP_DEBUG_ASSERT( ii >= tlevel );
2865             // AC: As we need to pass by the teams league, we need to artificially increase ii
2866             if ( ii == tlevel ) {
2867                 ii += 2; // three teams have same level
2868             } else {
2869                 ii ++;   // two teams have same level
2870             }
2871         }
2872     }
2873 #endif
2874 
2875     while( ii > level )
2876     {
2877         for( dd = team->t.t_serialized; ( dd > 0 ) && ( ii > level ); dd--, ii-- )
2878         {
2879         }
2880         if( team->t.t_serialized && ( !dd ) ) {
2881             team = team->t.t_parent;
2882             continue;
2883         }
2884         if( ii > level ) {
2885             team = team->t.t_parent;
2886             ii--;
2887         }
2888     }
2889 
2890     return team->t.t_nproc;
2891 }
2892 
2893 kmp_r_sched_t
2894 __kmp_get_schedule_global() {
// This routine exists because the pairs (__kmp_sched, __kmp_chunk) and (__kmp_static, __kmp_guided)
// may be changed by kmp_set_defaults independently, so the updated schedule can be obtained here.
2897 
2898     kmp_r_sched_t r_sched;
2899 
2900     // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static, __kmp_guided
2901     // __kmp_sched should keep original value, so that user can set KMP_SCHEDULE multiple times,
2902     // and thus have different run-time schedules in different roots (even in OMP 2.5)
2903     if ( __kmp_sched == kmp_sch_static ) {
2904         r_sched.r_sched_type = __kmp_static; // replace STATIC with more detailed schedule (balanced or greedy)
2905     } else if ( __kmp_sched == kmp_sch_guided_chunked ) {
2906         r_sched.r_sched_type = __kmp_guided; // replace GUIDED with more detailed schedule (iterative or analytical)
2907     } else {
2908         r_sched.r_sched_type = __kmp_sched;  // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2909     }
2910 
    if ( __kmp_chunk < KMP_DEFAULT_CHUNK ) { // __kmp_chunk may be wrong here (if it was never set)
2912         r_sched.chunk = KMP_DEFAULT_CHUNK;
2913     } else {
2914         r_sched.chunk = __kmp_chunk;
2915     }
2916 
2917     return r_sched;
2918 }
2919 
2920 /* ------------------------------------------------------------------------ */
2921 /* ------------------------------------------------------------------------ */
2922 
2923 
2924 /*
 * Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
2926  * at least argc number of *t_argv entries for the requested team.
2927  */
2928 static void
2929 __kmp_alloc_argv_entries( int argc, kmp_team_t *team, int realloc )
2930 {
2931 
2932     KMP_DEBUG_ASSERT( team );
2933     if( !realloc || argc > team->t.t_max_argc ) {
2934 
2935         KA_TRACE( 100, ( "__kmp_alloc_argv_entries: team %d: needed entries=%d, current entries=%d\n",
2936                          team->t.t_id, argc, ( realloc ) ? team->t.t_max_argc : 0 ));
2937         /* if previously allocated heap space for args, free them */
2938         if ( realloc && team->t.t_argv != &team->t.t_inline_argv[0] )
2939             __kmp_free( (void *) team->t.t_argv );
2940 
2941         if ( argc <= KMP_INLINE_ARGV_ENTRIES ) {
2942             /* use unused space in the cache line for arguments */
2943             team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
2944             KA_TRACE( 100, ( "__kmp_alloc_argv_entries: team %d: inline allocate %d argv entries\n",
2945                              team->t.t_id, team->t.t_max_argc ));
2946             team->t.t_argv = &team->t.t_inline_argv[0];
2947             if ( __kmp_storage_map ) {
2948                 __kmp_print_storage_map_gtid( -1, &team->t.t_inline_argv[0],
2949                                          &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
2950                                          (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES),
2951                                          "team_%d.t_inline_argv",
2952                                          team->t.t_id );
2953             }
2954         } else {
2955             /* allocate space for arguments in the heap */
2956             team->t.t_max_argc = ( argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1 )) ?
2957                                      KMP_MIN_MALLOC_ARGV_ENTRIES : 2 * argc;
2958             KA_TRACE( 100, ( "__kmp_alloc_argv_entries: team %d: dynamic allocate %d argv entries\n",
2959                              team->t.t_id, team->t.t_max_argc ));
2960             team->t.t_argv     = (void**) __kmp_page_allocate( sizeof(void*) * team->t.t_max_argc );
2961             if ( __kmp_storage_map ) {
2962                 __kmp_print_storage_map_gtid( -1, &team->t.t_argv[0], &team->t.t_argv[team->t.t_max_argc],
2963                                          sizeof(void *) * team->t.t_max_argc, "team_%d.t_argv",
2964                                          team->t.t_id );
2965             }
2966         }
2967     }
2968 }
2969 
2970 static void
2971 __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth)
2972 {
2973     int i;
2974     int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
2975     team->t.t_threads = (kmp_info_t**) __kmp_allocate( sizeof(kmp_info_t*) * max_nth );
2976     team->t.t_disp_buffer = (dispatch_shared_info_t*)
2977         __kmp_allocate( sizeof(dispatch_shared_info_t) * num_disp_buff );
2978     team->t.t_dispatch = (kmp_disp_t*) __kmp_allocate( sizeof(kmp_disp_t) * max_nth );
2979     team->t.t_implicit_task_taskdata = (kmp_taskdata_t*) __kmp_allocate( sizeof(kmp_taskdata_t) * max_nth );
2980     team->t.t_max_nproc = max_nth;
2981 
2982     /* setup dispatch buffers */
2983     for(i = 0 ; i < num_disp_buff; ++i) {
2984         team->t.t_disp_buffer[i].buffer_index = i;
2985 #if OMP_45_ENABLED
2986         team->t.t_disp_buffer[i].doacross_buf_idx = i;
2987 #endif
2988     }
2989 }
2990 
2991 static void
2992 __kmp_free_team_arrays(kmp_team_t *team) {
2993     /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
2994     int i;
2995     for ( i = 0; i < team->t.t_max_nproc; ++ i ) {
2996         if ( team->t.t_dispatch[ i ].th_disp_buffer != NULL ) {
2997             __kmp_free( team->t.t_dispatch[ i ].th_disp_buffer );
2998             team->t.t_dispatch[ i ].th_disp_buffer = NULL;
2999         }; // if
3000     }; // for
3001     __kmp_free(team->t.t_threads);
3002     __kmp_free(team->t.t_disp_buffer);
3003     __kmp_free(team->t.t_dispatch);
3004     __kmp_free(team->t.t_implicit_task_taskdata);
3005     team->t.t_threads     = NULL;
3006     team->t.t_disp_buffer = NULL;
3007     team->t.t_dispatch    = NULL;
    team->t.t_implicit_task_taskdata = NULL;
3009 }
3010 
3011 static void
3012 __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3013     kmp_info_t **oldThreads = team->t.t_threads;
3014 
3015     __kmp_free(team->t.t_disp_buffer);
3016     __kmp_free(team->t.t_dispatch);
3017     __kmp_free(team->t.t_implicit_task_taskdata);
3018     __kmp_allocate_team_arrays(team, max_nth);
3019 
3020     KMP_MEMCPY(team->t.t_threads, oldThreads, team->t.t_nproc * sizeof (kmp_info_t*));
3021 
3022     __kmp_free(oldThreads);
3023 }
3024 
3025 static kmp_internal_control_t
3026 __kmp_get_global_icvs( void ) {
3027 
3028     kmp_r_sched_t r_sched = __kmp_get_schedule_global(); // get current state of scheduling globals
3029 
3030 #if OMP_40_ENABLED
3031     KMP_DEBUG_ASSERT( __kmp_nested_proc_bind.used > 0 );
3032 #endif /* OMP_40_ENABLED */
3033 
3034     kmp_internal_control_t g_icvs = {
3035       0,                            //int serial_nesting_level; //corresponds to the value of the th_team_serialized field
3036       (kmp_int8)__kmp_dflt_nested,            //int nested;               //internal control for nested parallelism (per thread)
3037       (kmp_int8)__kmp_global.g.g_dynamic,                                 //internal control for dynamic adjustment of threads (per thread)
3038       (kmp_int8)__kmp_env_blocktime,          //int bt_set;               //internal control for whether blocktime is explicitly set
3039       __kmp_dflt_blocktime,         //int blocktime;            //internal control for blocktime
3040 #if KMP_USE_MONITOR
3041       __kmp_bt_intervals,           //int bt_intervals;         //internal control for blocktime intervals
3042 #endif
3043       __kmp_dflt_team_nth,          //int nproc;                //internal control for # of threads for next parallel region (per thread)
3044                                     // (use a max ub on value if __kmp_parallel_initialize not called yet)
3045       __kmp_dflt_max_active_levels, //int max_active_levels;    //internal control for max_active_levels
3046       r_sched,                      //kmp_r_sched_t sched;      //internal control for runtime schedule {sched,chunk} pair
3047 #if OMP_40_ENABLED
3048       __kmp_nested_proc_bind.bind_types[0],
3049       __kmp_default_device,
3050 #endif /* OMP_40_ENABLED */
3051       NULL                          //struct kmp_internal_control *next;
3052     };
3053 
3054     return g_icvs;
3055 }
3056 
3057 static kmp_internal_control_t
3058 __kmp_get_x_global_icvs( const kmp_team_t *team ) {
3059 
3060     kmp_internal_control_t gx_icvs;
3061     gx_icvs.serial_nesting_level = 0; // probably =team->t.t_serial like in save_inter_controls
3062     copy_icvs( & gx_icvs, & team->t.t_threads[0]->th.th_current_task->td_icvs );
3063     gx_icvs.next = NULL;
3064 
3065     return gx_icvs;
3066 }
3067 
3068 static void
3069 __kmp_initialize_root( kmp_root_t *root )
3070 {
3071     int           f;
3072     kmp_team_t   *root_team;
3073     kmp_team_t   *hot_team;
3074     int           hot_team_max_nth;
3075     kmp_r_sched_t r_sched = __kmp_get_schedule_global(); // get current state of scheduling globals
3076     kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3077     KMP_DEBUG_ASSERT( root );
3078     KMP_ASSERT( ! root->r.r_begin );
3079 
3080     /* setup the root state structure */
3081     __kmp_init_lock( &root->r.r_begin_lock );
3082     root->r.r_begin        = FALSE;
3083     root->r.r_active       = FALSE;
3084     root->r.r_in_parallel  = 0;
3085     root->r.r_blocktime    = __kmp_dflt_blocktime;
3086     root->r.r_nested       = __kmp_dflt_nested;
3087 
3088     /* setup the root team for this task */
3089     /* allocate the root team structure */
3090     KF_TRACE( 10, ( "__kmp_initialize_root: before root_team\n" ) );
3091 
3092     root_team =
3093         __kmp_allocate_team(
3094             root,
3095             1,                                                         // new_nproc
3096             1,                                                         // max_nproc
3097 #if OMPT_SUPPORT
3098             0, // root parallel id
3099 #endif
3100 #if OMP_40_ENABLED
3101             __kmp_nested_proc_bind.bind_types[0],
3102 #endif
3103             &r_icvs,
3104             0                                                          // argc
3105             USE_NESTED_HOT_ARG(NULL)                                   // master thread is unknown
3106         );
3107 #if USE_DEBUGGER
3108     // Non-NULL value should be assigned to make the debugger display the root team.
3109     TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)( ~ 0 ));
3110 #endif
3111 
3112     KF_TRACE( 10, ( "__kmp_initialize_root: after root_team = %p\n", root_team ) );
3113 
3114     root->r.r_root_team = root_team;
3115     root_team->t.t_control_stack_top = NULL;
3116 
3117     /* initialize root team */
3118     root_team->t.t_threads[0] = NULL;
3119     root_team->t.t_nproc      = 1;
3120     root_team->t.t_serialized = 1;
3121     // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3122     root_team->t.t_sched.r_sched_type = r_sched.r_sched_type;
3123     root_team->t.t_sched.chunk        = r_sched.chunk;
3124     KA_TRACE( 20, ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3125                     root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
3126 
3127     /* setup the  hot team for this task */
3128     /* allocate the hot team structure */
3129     KF_TRACE( 10, ( "__kmp_initialize_root: before hot_team\n" ) );
3130 
3131     hot_team =
3132         __kmp_allocate_team(
3133             root,
3134             1,                                                         // new_nproc
3135             __kmp_dflt_team_nth_ub * 2,                                // max_nproc
3136 #if OMPT_SUPPORT
3137             0, // root parallel id
3138 #endif
3139 #if OMP_40_ENABLED
3140             __kmp_nested_proc_bind.bind_types[0],
3141 #endif
3142             &r_icvs,
3143             0                                                          // argc
3144             USE_NESTED_HOT_ARG(NULL)                                   // master thread is unknown
3145         );
3146     KF_TRACE( 10, ( "__kmp_initialize_root: after hot_team = %p\n", hot_team ) );
3147 
3148     root->r.r_hot_team = hot_team;
3149     root_team->t.t_control_stack_top = NULL;
3150 
3151     /* first-time initialization */
3152     hot_team->t.t_parent = root_team;
3153 
3154     /* initialize hot team */
3155     hot_team_max_nth = hot_team->t.t_max_nproc;
3156     for ( f = 0; f < hot_team_max_nth; ++ f ) {
3157         hot_team->t.t_threads[ f ] = NULL;
3158     }; // for
3159     hot_team->t.t_nproc = 1;
3160     // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3161     hot_team->t.t_sched.r_sched_type = r_sched.r_sched_type;
3162     hot_team->t.t_sched.chunk        = r_sched.chunk;
3163     hot_team->t.t_size_changed = 0;
3164 }
3165 
3166 #ifdef KMP_DEBUG
3167 
3168 
3169 typedef struct kmp_team_list_item {
3170     kmp_team_p const *           entry;
3171     struct kmp_team_list_item *  next;
3172 } kmp_team_list_item_t;
3173 typedef kmp_team_list_item_t * kmp_team_list_t;
3174 
3175 
3176 static void
3177 __kmp_print_structure_team_accum(    // Add team to list of teams.
3178     kmp_team_list_t     list,        // List of teams.
3179     kmp_team_p const *  team         // Team to add.
3180 ) {
3181 
3182     // List must terminate with item where both entry and next are NULL.
3183     // Team is added to the list only once.
3184     // List is sorted in ascending order by team id.
3185     // Team id is *not* a key.
3186 
3187     kmp_team_list_t l;
3188 
3189     KMP_DEBUG_ASSERT( list != NULL );
3190     if ( team == NULL ) {
3191         return;
3192     }; // if
3193 
3194     __kmp_print_structure_team_accum( list, team->t.t_parent );
3195     __kmp_print_structure_team_accum( list, team->t.t_next_pool );
3196 
3197     // Search list for the team.
3198     l = list;
3199     while ( l->next != NULL && l->entry != team ) {
3200         l = l->next;
3201     }; // while
3202     if ( l->next != NULL ) {
3203         return;  // Team has been added before, exit.
3204     }; // if
3205 
3206     // Team is not found. Search list again for insertion point.
3207     l = list;
3208     while ( l->next != NULL && l->entry->t.t_id <= team->t.t_id ) {
3209         l = l->next;
3210     }; // while
3211 
3212     // Insert team.
3213     {
3214         kmp_team_list_item_t * item =
3215             (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC( sizeof(  kmp_team_list_item_t ) );
3216         * item = * l;
3217         l->entry = team;
3218         l->next  = item;
3219     }
3220 
3221 }
3222 
3223 static void
3224 __kmp_print_structure_team(
3225     char const *       title,
3226     kmp_team_p const * team
3227 
3228 ) {
3229     __kmp_printf( "%s", title );
3230     if ( team != NULL ) {
3231         __kmp_printf( "%2x %p\n", team->t.t_id, team );
3232     } else {
3233         __kmp_printf( " - (nil)\n" );
3234     }; // if
3235 }
3236 
3237 static void
3238 __kmp_print_structure_thread(
3239     char const *       title,
3240     kmp_info_p const * thread
3241 
3242 ) {
3243     __kmp_printf( "%s", title );
3244     if ( thread != NULL ) {
3245         __kmp_printf( "%2d %p\n", thread->th.th_info.ds.ds_gtid, thread );
3246     } else {
3247         __kmp_printf( " - (nil)\n" );
3248     }; // if
3249 }
3250 
3251 void
3252 __kmp_print_structure(
3253     void
3254 ) {
3255 
3256     kmp_team_list_t list;
3257 
3258     // Initialize list of teams.
3259     list = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC( sizeof( kmp_team_list_item_t ) );
3260     list->entry = NULL;
3261     list->next  = NULL;
3262 
3263     __kmp_printf( "\n------------------------------\nGlobal Thread Table\n------------------------------\n" );
3264     {
3265         int gtid;
3266         for ( gtid = 0; gtid < __kmp_threads_capacity; ++ gtid ) {
3267             __kmp_printf( "%2d", gtid );
3268             if ( __kmp_threads != NULL ) {
3269                 __kmp_printf( " %p", __kmp_threads[ gtid ] );
3270             }; // if
3271             if ( __kmp_root != NULL ) {
3272                 __kmp_printf( " %p", __kmp_root[ gtid ] );
3273             }; // if
3274             __kmp_printf( "\n" );
3275         }; // for gtid
3276     }
3277 
3278     // Print out __kmp_threads array.
3279     __kmp_printf( "\n------------------------------\nThreads\n------------------------------\n" );
3280     if ( __kmp_threads != NULL ) {
3281         int gtid;
3282         for ( gtid = 0; gtid < __kmp_threads_capacity; ++ gtid ) {
3283             kmp_info_t const * thread = __kmp_threads[ gtid ];
3284             if ( thread != NULL ) {
3285                 __kmp_printf( "GTID %2d %p:\n", gtid, thread );
3286                 __kmp_printf(                 "    Our Root:        %p\n", thread->th.th_root );
3287                 __kmp_print_structure_team(   "    Our Team:     ",        thread->th.th_team );
3288                 __kmp_print_structure_team(   "    Serial Team:  ",        thread->th.th_serial_team );
3289                 __kmp_printf(                 "    Threads:      %2d\n",   thread->th.th_team_nproc );
3290                 __kmp_print_structure_thread( "    Master:       ",        thread->th.th_team_master );
3291                 __kmp_printf(                 "    Serialized?:  %2d\n",   thread->th.th_team_serialized );
3292                 __kmp_printf(                 "    Set NProc:    %2d\n",   thread->th.th_set_nproc );
3293 #if OMP_40_ENABLED
3294                 __kmp_printf(                 "    Set Proc Bind: %2d\n",  thread->th.th_set_proc_bind );
3295 #endif
3296                 __kmp_print_structure_thread( "    Next in pool: ",        thread->th.th_next_pool );
3297                 __kmp_printf( "\n" );
3298                 __kmp_print_structure_team_accum( list, thread->th.th_team );
3299                 __kmp_print_structure_team_accum( list, thread->th.th_serial_team );
3300             }; // if
3301         }; // for gtid
3302     } else {
3303         __kmp_printf( "Threads array is not allocated.\n" );
3304     }; // if
3305 
3306     // Print out __kmp_root array.
3307     __kmp_printf( "\n------------------------------\nUbers\n------------------------------\n" );
3308     if ( __kmp_root != NULL ) {
3309         int gtid;
3310         for ( gtid = 0; gtid < __kmp_threads_capacity; ++ gtid ) {
3311             kmp_root_t const * root = __kmp_root[ gtid ];
3312             if ( root != NULL ) {
3313                 __kmp_printf( "GTID %2d %p:\n", gtid, root );
3314                 __kmp_print_structure_team(   "    Root Team:    ",      root->r.r_root_team );
3315                 __kmp_print_structure_team(   "    Hot Team:     ",      root->r.r_hot_team );
3316                 __kmp_print_structure_thread( "    Uber Thread:  ",      root->r.r_uber_thread );
3317                 __kmp_printf(                 "    Active?:      %2d\n", root->r.r_active );
3318                 __kmp_printf(                 "    Nested?:      %2d\n", root->r.r_nested );
3319                 __kmp_printf(                 "    In Parallel:  %2d\n", root->r.r_in_parallel );
3320                 __kmp_printf( "\n" );
3321                 __kmp_print_structure_team_accum( list, root->r.r_root_team );
3322                 __kmp_print_structure_team_accum( list, root->r.r_hot_team );
3323             }; // if
3324         }; // for gtid
3325     } else {
3326         __kmp_printf( "Ubers array is not allocated.\n" );
3327     }; // if
3328 
3329     __kmp_printf( "\n------------------------------\nTeams\n------------------------------\n" );
3330     while ( list->next != NULL ) {
3331         kmp_team_p const * team = list->entry;
3332         int i;
3333         __kmp_printf( "Team %2x %p:\n", team->t.t_id, team );
3334         __kmp_print_structure_team( "    Parent Team:      ",      team->t.t_parent );
3335         __kmp_printf(               "    Master TID:       %2d\n", team->t.t_master_tid );
3336         __kmp_printf(               "    Max threads:      %2d\n", team->t.t_max_nproc );
3337         __kmp_printf(               "    Levels of serial: %2d\n", team->t.t_serialized );
3338         __kmp_printf(               "    Number threads:   %2d\n", team->t.t_nproc );
3339         for ( i = 0; i < team->t.t_nproc; ++ i ) {
3340             __kmp_printf(           "    Thread %2d:      ", i );
3341             __kmp_print_structure_thread( "", team->t.t_threads[ i ] );
3342         }; // for i
3343         __kmp_print_structure_team( "    Next in pool:     ",      team->t.t_next_pool );
3344         __kmp_printf( "\n" );
3345         list = list->next;
3346     }; // while
3347 
3348     // Print out __kmp_thread_pool and __kmp_team_pool.
3349     __kmp_printf( "\n------------------------------\nPools\n------------------------------\n" );
3350     __kmp_print_structure_thread(   "Thread pool:          ", (kmp_info_t *)__kmp_thread_pool );
3351     __kmp_print_structure_team(     "Team pool:            ", (kmp_team_t *)__kmp_team_pool );
3352     __kmp_printf( "\n" );
3353 
3354     // Free team list.
3355     while ( list != NULL ) {
3356         kmp_team_list_item_t * item = list;
3357         list = list->next;
3358         KMP_INTERNAL_FREE( item );
3359     }; // while
3360 
3361 }
3362 
3363 #endif
3364 
3365 
3366 //---------------------------------------------------------------------------
3367 //  Stuff for per-thread fast random number generator
3368 //  Table of primes
3369 
3370 static const unsigned __kmp_primes[] = {
3371   0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5,
3372   0xba5703f5, 0xb495a877, 0xe1626741, 0x79695e6b,
3373   0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3374   0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b,
3375   0xbe4d6fe9, 0x5f15e201, 0x99afc3fd, 0xf3f16801,
3376   0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3377   0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed,
3378   0x085a3d61, 0x46eb5ea7, 0x3d9910ed, 0x2e687b5b,
3379   0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3380   0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7,
3381   0x54581edb, 0xf2480f45, 0x0bb9288f, 0xef1affc7,
3382   0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3383   0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b,
3384   0xfc411073, 0xc3749363, 0xb892d829, 0x3549366b,
3385   0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3386   0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f
3387 };
3388 
3389 //---------------------------------------------------------------------------
3390 //  __kmp_get_random: Get a random number using a linear congruential method.
3391 
3392 unsigned short
3393 __kmp_get_random( kmp_info_t * thread )
3394 {
3395   unsigned x = thread->th.th_x;
3396   unsigned short r = x>>16;
3397 
3398   thread->th.th_x = x*thread->th.th_a+1;
3399 
3400   KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3401          thread->th.th_info.ds.ds_tid, r) );
3402 
3403   return r;
3404 }
3405 //--------------------------------------------------------
3406 // __kmp_init_random: Initialize a random number generator
3407 
3408 void
3409 __kmp_init_random( kmp_info_t * thread )
3410 {
3411   unsigned seed = thread->th.th_info.ds.ds_tid;
3412 
3413   thread->th.th_a = __kmp_primes[seed%(sizeof(__kmp_primes)/sizeof(__kmp_primes[0]))];
3414   thread->th.th_x = (seed+1)*thread->th.th_a+1;
3415   KA_TRACE(30, ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a) );
3416 }
3417 
3418 
3419 #if KMP_OS_WINDOWS
3420 /* reclaim array entries for root threads that are already dead, returns number reclaimed */
3421 static int
3422 __kmp_reclaim_dead_roots(void) {
3423     int i, r = 0;
3424 
3425     for(i = 0; i < __kmp_threads_capacity; ++i) {
3426         if( KMP_UBER_GTID( i ) &&
3427           !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3428           !__kmp_root[i]->r.r_active ) { // AC: reclaim only roots died in non-active state
3429             r += __kmp_unregister_root_other_thread(i);
3430         }
3431     }
3432     return r;
3433 }
3434 #endif
3435 
3436 /*
3437    This function attempts to create free entries in __kmp_threads and __kmp_root, and returns the number of
3438    free entries generated.
3439 
3440    For Windows* OS static library, the first mechanism used is to reclaim array entries for root threads that are
3441    already dead.
3442 
   On all platforms, expansion is attempted on the arrays __kmp_threads and __kmp_root, with appropriate
3444    update to __kmp_threads_capacity.  Array capacity is increased by doubling with clipping to
3445     __kmp_tp_capacity, if threadprivate cache array has been created.
3446    Synchronization with __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3447 
3448    After any dead root reclamation, if the clipping value allows array expansion to result in the generation
3449    of a total of nWish free slots, the function does that expansion.  If not, but the clipping value allows
3450    array expansion to result in the generation of a total of nNeed free slots, the function does that expansion.
3451    Otherwise, nothing is done beyond the possible initial root thread reclamation.  However, if nNeed is zero,
3452    a best-effort attempt is made to fulfil nWish as far as possible, i.e. the function will attempt to create
3453    as many free slots as possible up to nWish.
3454 
3455    If any argument is negative, the behavior is undefined.
3456 */
3457 static int
3458 __kmp_expand_threads(int nWish, int nNeed) {
3459     int added = 0;
3460     int old_tp_cached;
3461     int __kmp_actual_max_nth;
3462 
3463     if(nNeed > nWish) /* normalize the arguments */
3464         nWish = nNeed;
3465 #if KMP_OS_WINDOWS && !defined KMP_DYNAMIC_LIB
3466 /* only for Windows static library */
3467     /* reclaim array entries for root threads that are already dead */
3468     added = __kmp_reclaim_dead_roots();
3469 
3470     if(nNeed) {
3471         nNeed -= added;
3472         if(nNeed < 0)
3473             nNeed = 0;
3474     }
3475     if(nWish) {
3476         nWish -= added;
3477         if(nWish < 0)
3478             nWish = 0;
3479     }
3480 #endif
3481     if(nWish <= 0)
3482         return added;
3483 
3484     while(1) {
3485         int nTarget;
3486         int minimumRequiredCapacity;
3487         int newCapacity;
3488         kmp_info_t **newThreads;
3489         kmp_root_t **newRoot;
3490 
3491         //
3492         // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth.
3493         // If __kmp_max_nth is set to some value less than __kmp_sys_max_nth
3494         // by the user via OMP_THREAD_LIMIT, then __kmp_threads_capacity may
3495         // become > __kmp_max_nth in one of two ways:
3496         //
        // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
        //    may not be reused by another thread, so we may need to increase
        //    __kmp_threads_capacity to __kmp_max_nth + 1.
3500         //
3501         // 2) New foreign root(s) are encountered.  We always register new
3502         //    foreign roots.  This may cause a smaller # of threads to be
3503         //    allocated at subsequent parallel regions, but the worker threads
3504         //    hang around (and eventually go to sleep) and need slots in the
3505         //    __kmp_threads[] array.
3506         //
        // Anyway, that is the reason for moving the check to see if
        // __kmp_max_nth was exceeded into __kmp_reserve_threads()
        // instead of having it performed here. -BB
3510         //
3511         old_tp_cached = __kmp_tp_cached;
3512         __kmp_actual_max_nth = old_tp_cached ? __kmp_tp_capacity : __kmp_sys_max_nth;
3513         KMP_DEBUG_ASSERT(__kmp_actual_max_nth >= __kmp_threads_capacity);
3514 
3515         /* compute expansion headroom to check if we can expand and whether to aim for nWish or nNeed */
3516         nTarget = nWish;
3517         if(__kmp_actual_max_nth - __kmp_threads_capacity < nTarget) {
3518             /* can't fulfil nWish, so try nNeed */
3519             if(nNeed) {
3520                 nTarget = nNeed;
3521                 if(__kmp_actual_max_nth - __kmp_threads_capacity < nTarget) {
3522                     /* possible expansion too small -- give up */
3523                     break;
3524                 }
3525             } else {
3526                 /* best-effort */
3527                 nTarget = __kmp_actual_max_nth - __kmp_threads_capacity;
3528                 if(!nTarget) {
                    /* can't expand at all -- give up */
3530                     break;
3531                 }
3532             }
3533         }
3534         minimumRequiredCapacity = __kmp_threads_capacity + nTarget;
3535 
3536         newCapacity = __kmp_threads_capacity;
3537         do{
3538             newCapacity =
3539                 newCapacity <= (__kmp_actual_max_nth >> 1) ?
3540                 (newCapacity << 1) :
3541                 __kmp_actual_max_nth;
3542         } while(newCapacity < minimumRequiredCapacity);
3543         newThreads = (kmp_info_t**) __kmp_allocate((sizeof(kmp_info_t*) + sizeof(kmp_root_t*)) * newCapacity + CACHE_LINE);
3544         newRoot = (kmp_root_t**) ((char*)newThreads + sizeof(kmp_info_t*) * newCapacity );
3545         KMP_MEMCPY(newThreads, __kmp_threads, __kmp_threads_capacity * sizeof(kmp_info_t*));
3546         KMP_MEMCPY(newRoot, __kmp_root, __kmp_threads_capacity * sizeof(kmp_root_t*));
3547         memset(newThreads + __kmp_threads_capacity, 0,
3548                (newCapacity - __kmp_threads_capacity) * sizeof(kmp_info_t*));
3549         memset(newRoot + __kmp_threads_capacity, 0,
3550                (newCapacity - __kmp_threads_capacity) * sizeof(kmp_root_t*));
3551 
3552         if(!old_tp_cached && __kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3553             /* __kmp_tp_cached has changed, i.e. __kmpc_threadprivate_cached has allocated a threadprivate cache
3554                while we were allocating the expanded array, and our new capacity is larger than the threadprivate
3555                cache capacity, so we should deallocate the expanded arrays and try again.  This is the first check
3556                of a double-check pair.
3557             */
3558             __kmp_free(newThreads);
3559             continue; /* start over and try again */
3560         }
3561         __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3562         if(!old_tp_cached && __kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3563             /* Same check as above, but this time with the lock so we can be sure if we can succeed. */
3564             __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3565             __kmp_free(newThreads);
3566             continue; /* start over and try again */
3567         } else {
3568             /* success */
3569             // __kmp_free( __kmp_threads ); // ATT: It leads to crash. Need to be investigated.
3570             //
3571             *(kmp_info_t**volatile*)&__kmp_threads = newThreads;
3572             *(kmp_root_t**volatile*)&__kmp_root = newRoot;
3573             added += newCapacity - __kmp_threads_capacity;
3574             *(volatile int*)&__kmp_threads_capacity = newCapacity;
3575             __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3576             break; /* succeeded, so we can exit the loop */
3577         }
3578     }
3579     return added;
3580 }
3581 
3582 /* register the current thread as a root thread and obtain our gtid */
3583 /* we must have the __kmp_initz_lock held at this point */
3584 /* Argument TRUE only if are the thread that calls from __kmp_do_serial_initialize() */
3585 int
3586 __kmp_register_root( int initial_thread )
3587 {
3588     kmp_info_t *root_thread;
3589     kmp_root_t *root;
3590     int         gtid;
3591     int         capacity;
3592     __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
3593     KA_TRACE( 20, ("__kmp_register_root: entered\n"));
3594     KMP_MB();
3595 
3596 
3597     /*
3598         2007-03-02:
3599 
        If the initial thread has not invoked the OpenMP RTL yet, and this thread is not an
        initial one, the "__kmp_all_nth >= __kmp_threads_capacity" condition does not work as
        expected -- it may return false (meaning there is at least one empty slot in the
        __kmp_threads array), but it is possible that the only free slot is #0, which is
        reserved for the initial thread and so cannot be used for this one. The following code
        works around this bug.

        However, the right solution seems to be not to reserve slot #0 for the initial thread,
        because:
            (1) there is no magic in slot #0,
            (2) we cannot detect the initial thread reliably (the first thread which does
                serial initialization may not be a real initial thread).
3610     */
3611     capacity = __kmp_threads_capacity;
3612     if ( ! initial_thread && TCR_PTR(__kmp_threads[0]) == NULL ) {
3613         -- capacity;
3614     }; // if
3615 
3616     /* see if there are too many threads */
3617     if ( __kmp_all_nth >= capacity && !__kmp_expand_threads( 1, 1 ) ) {
3618         if ( __kmp_tp_cached ) {
3619             __kmp_msg(
3620                 kmp_ms_fatal,
3621                 KMP_MSG( CantRegisterNewThread ),
3622                 KMP_HNT( Set_ALL_THREADPRIVATE, __kmp_tp_capacity ),
3623                 KMP_HNT( PossibleSystemLimitOnThreads ),
3624                 __kmp_msg_null
3625             );
3626         }
3627         else {
3628             __kmp_msg(
3629                 kmp_ms_fatal,
3630                 KMP_MSG( CantRegisterNewThread ),
3631                 KMP_HNT( SystemLimitOnThreads ),
3632                 __kmp_msg_null
3633             );
3634         }
3635     }; // if
3636 
3637     /* find an available thread slot */
3638     /* Don't reassign the zero slot since we need that to only be used by initial
3639        thread */
3640     for( gtid=(initial_thread ? 0 : 1) ; TCR_PTR(__kmp_threads[gtid]) != NULL ; gtid++ )
3641         ;
3642     KA_TRACE( 1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid ));
3643     KMP_ASSERT( gtid < __kmp_threads_capacity );
3644 
3645     /* update global accounting */
3646     __kmp_all_nth ++;
3647     TCW_4(__kmp_nth, __kmp_nth + 1);
3648 
3649     //
3650     // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search)
3651     // for low numbers of procs, and method #2 (keyed API call) for higher
3652     // numbers of procs.
3653     //
3654     if ( __kmp_adjust_gtid_mode ) {
3655         if ( __kmp_all_nth >= __kmp_tls_gtid_min ) {
3656             if ( TCR_4(__kmp_gtid_mode) != 2) {
3657                 TCW_4(__kmp_gtid_mode, 2);
3658             }
3659         }
3660         else {
3661             if (TCR_4(__kmp_gtid_mode) != 1 ) {
3662                 TCW_4(__kmp_gtid_mode, 1);
3663             }
3664         }
3665     }
3666 
3667 #ifdef KMP_ADJUST_BLOCKTIME
3668     /* Adjust blocktime to zero if necessary            */
3669     /* Middle initialization might not have occurred yet */
3670     if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
3671         if ( __kmp_nth > __kmp_avail_proc ) {
3672             __kmp_zero_bt = TRUE;
3673         }
3674     }
3675 #endif /* KMP_ADJUST_BLOCKTIME */
3676 
3677     /* setup this new hierarchy */
3678     if( ! ( root = __kmp_root[gtid] )) {
3679         root = __kmp_root[gtid] = (kmp_root_t*) __kmp_allocate( sizeof(kmp_root_t) );
3680         KMP_DEBUG_ASSERT( ! root->r.r_root_team );
3681     }
3682 
3683 #if KMP_STATS_ENABLED
3684     // Initialize stats as soon as possible (right after gtid assignment).
3685     __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3686     KMP_START_EXPLICIT_TIMER(OMP_worker_thread_life);
3687     KMP_SET_THREAD_STATE(SERIAL_REGION);
3688     KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3689 #endif
3690     __kmp_initialize_root( root );
3691 
3692     /* setup new root thread structure */
3693     if( root->r.r_uber_thread ) {
3694         root_thread = root->r.r_uber_thread;
3695     } else {
3696         root_thread = (kmp_info_t*) __kmp_allocate( sizeof(kmp_info_t) );
3697         if ( __kmp_storage_map ) {
3698             __kmp_print_thread_storage_map( root_thread, gtid );
3699         }
3700         root_thread->th.th_info .ds.ds_gtid = gtid;
3701         root_thread->th.th_root =  root;
3702         if( __kmp_env_consistency_check ) {
3703             root_thread->th.th_cons = __kmp_allocate_cons_stack( gtid );
3704         }
3705         #if USE_FAST_MEMORY
3706             __kmp_initialize_fast_memory( root_thread );
3707         #endif /* USE_FAST_MEMORY */
3708 
3709         #if KMP_USE_BGET
3710             KMP_DEBUG_ASSERT( root_thread->th.th_local.bget_data == NULL );
3711             __kmp_initialize_bget( root_thread );
3712         #endif
3713         __kmp_init_random( root_thread );  // Initialize random number generator
3714     }
3715 
3716     /* setup the serial team held in reserve by the root thread */
3717     if( ! root_thread->th.th_serial_team ) {
3718         kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3719         KF_TRACE( 10, ( "__kmp_register_root: before serial_team\n" ) );
3720 
3721         root_thread->th.th_serial_team = __kmp_allocate_team( root, 1, 1,
3722 #if OMPT_SUPPORT
3723           0, // root parallel id
3724 #endif
3725 #if OMP_40_ENABLED
3726           proc_bind_default,
3727 #endif
3728           &r_icvs,
3729           0 USE_NESTED_HOT_ARG(NULL) );
3730     }
3731     KMP_ASSERT( root_thread->th.th_serial_team );
3732     KF_TRACE( 10, ( "__kmp_register_root: after serial_team = %p\n",
3733       root_thread->th.th_serial_team ) );
3734 
3735     /* drop root_thread into place */
3736     TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3737 
3738     root->r.r_root_team->t.t_threads[0] = root_thread;
3739     root->r.r_hot_team ->t.t_threads[0] = root_thread;
3740     root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3741     root_thread->th.th_serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for execution (it is unused for now).
3742     root->r.r_uber_thread = root_thread;
3743 
3744     /* initialize the thread, get it ready to go */
3745     __kmp_initialize_info( root_thread, root->r.r_root_team, 0, gtid );
3746     TCW_4(__kmp_init_gtid, TRUE);
3747 
3748     /* prepare the master thread for get_gtid() */
3749     __kmp_gtid_set_specific( gtid );
3750 
3751 #if USE_ITT_BUILD
3752     __kmp_itt_thread_name( gtid );
3753 #endif /* USE_ITT_BUILD */
3754 
3755     #ifdef KMP_TDATA_GTID
3756         __kmp_gtid = gtid;
3757     #endif
3758     __kmp_create_worker( gtid, root_thread, __kmp_stksize );
3759     KMP_DEBUG_ASSERT( __kmp_gtid_get_specific() == gtid );
3760 
3761     KA_TRACE( 20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, plain=%u\n",
3762                     gtid, __kmp_gtid_from_tid( 0, root->r.r_hot_team ),
3763                     root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3764                     KMP_INIT_BARRIER_STATE ) );
3765     { // Initialize barrier data.
3766         int b;
3767         for ( b = 0; b < bs_last_barrier; ++ b ) {
3768             root_thread->th.th_bar[ b ].bb.b_arrived        = KMP_INIT_BARRIER_STATE;
3769 #if USE_DEBUGGER
3770             root_thread->th.th_bar[ b ].bb.b_worker_arrived = 0;
3771 #endif
3772         }; // for
3773     }
3774     KMP_DEBUG_ASSERT( root->r.r_hot_team->t.t_bar[ bs_forkjoin_barrier ].b_arrived == KMP_INIT_BARRIER_STATE );
3775 
3776 #if KMP_AFFINITY_SUPPORTED
3777 # if OMP_40_ENABLED
3778     root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3779     root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3780     root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3781     root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3782 # endif
3783 
3784     if ( TCR_4(__kmp_init_middle) ) {
3785         __kmp_affinity_set_init_mask( gtid, TRUE );
3786     }
3787 #endif /* KMP_AFFINITY_SUPPORTED */
3788 
3789     __kmp_root_counter ++;
3790 
3791     KMP_MB();
3792     __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
3793 
3794     return gtid;
3795 }
3796 
3797 #if KMP_NESTED_HOT_TEAMS
3798 static int
3799 __kmp_free_hot_teams( kmp_root_t *root, kmp_info_t *thr, int level, const int max_level )
3800 {
3801     int i, n, nth;
3802     kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3803     if( !hot_teams || !hot_teams[level].hot_team ) {
3804         return 0;
3805     }
3806     KMP_DEBUG_ASSERT( level < max_level );
3807     kmp_team_t *team = hot_teams[level].hot_team;
3808     nth = hot_teams[level].hot_team_nth;
3809     n = nth - 1;                   // master is not freed
3810     if( level < max_level - 1 ) {
3811         for( i = 0; i < nth; ++i ) {
3812             kmp_info_t *th = team->t.t_threads[i];
3813             n += __kmp_free_hot_teams( root, th, level + 1, max_level );
3814             if( i > 0 && th->th.th_hot_teams ) {
3815                 __kmp_free( th->th.th_hot_teams );
3816                 th->th.th_hot_teams = NULL;
3817             }
3818         }
3819     }
3820     __kmp_free_team( root, team, NULL );
3821     return n;
3822 }
3823 #endif
3824 
/* Resets a root thread and clears its root and hot teams.
3826    Returns the number of __kmp_threads entries directly and indirectly freed.
3827 */
3828 static int
3829 __kmp_reset_root(int gtid, kmp_root_t *root)
3830 {
3831     kmp_team_t * root_team = root->r.r_root_team;
3832     kmp_team_t * hot_team  = root->r.r_hot_team;
3833     int          n         = hot_team->t.t_nproc;
3834     int i;
3835 
3836     KMP_DEBUG_ASSERT( ! root->r.r_active );
3837 
3838     root->r.r_root_team = NULL;
3839     root->r.r_hot_team  = NULL;
3840         // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team before call
3841         // to __kmp_free_team().
3842     __kmp_free_team( root, root_team USE_NESTED_HOT_ARG(NULL) );
3843 #if KMP_NESTED_HOT_TEAMS
3844     if( __kmp_hot_teams_max_level > 0 ) {  // need to free nested hot teams and their threads if any
3845         for( i = 0; i < hot_team->t.t_nproc; ++i ) {
3846             kmp_info_t *th = hot_team->t.t_threads[i];
3847             if( __kmp_hot_teams_max_level > 1 ) {
3848                 n += __kmp_free_hot_teams( root, th, 1, __kmp_hot_teams_max_level );
3849             }
3850             if( th->th.th_hot_teams ) {
3851                 __kmp_free( th->th.th_hot_teams );
3852                 th->th.th_hot_teams = NULL;
3853             }
3854         }
3855     }
3856 #endif
3857     __kmp_free_team( root, hot_team USE_NESTED_HOT_ARG(NULL) );
3858 
3859     //
3860     // Before we can reap the thread, we need to make certain that all
3861     // other threads in the teams that had this root as ancestor have stopped trying to steal tasks.
3862     //
3863     if ( __kmp_tasking_mode != tskm_immediate_exec ) {
3864         __kmp_wait_to_unref_task_teams();
3865     }
3866 
3867     #if KMP_OS_WINDOWS
3868         /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3869         KA_TRACE( 10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC "\n",
3870             (LPVOID)&(root->r.r_uber_thread->th),
3871             root->r.r_uber_thread->th.th_info.ds.ds_thread ) );
3872         __kmp_free_handle( root->r.r_uber_thread->th.th_info.ds.ds_thread );
3873     #endif /* KMP_OS_WINDOWS */
3874 
3875 #if OMPT_SUPPORT
3876     if (ompt_enabled &&
3877         ompt_callbacks.ompt_callback(ompt_event_thread_end)) {
3878         int gtid = __kmp_get_gtid();
3879         __ompt_thread_end(ompt_thread_initial, gtid);
3880     }
3881 #endif
3882 
3883     TCW_4(__kmp_nth, __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3884     __kmp_reap_thread( root->r.r_uber_thread, 1 );
3885 
        // We cannot put the root thread into __kmp_thread_pool, so we have to reap it instead of freeing it.
3887     root->r.r_uber_thread = NULL;
3888     /* mark root as no longer in use */
3889     root->r.r_begin = FALSE;
3890 
3891     return n;
3892 }
3893 
3894 void
3895 __kmp_unregister_root_current_thread( int gtid )
3896 {
3897     KA_TRACE( 1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid ));
    /* this lock should be ok, since unregister_root_current_thread is never called during
     * an abort, only during a normal close.  Furthermore, if you have the
     * forkjoin lock, you should never try to get the initz lock */
3901 
3902     __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
3903     if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
3904         KC_TRACE( 10, ("__kmp_unregister_root_current_thread: already finished, exiting T#%d\n", gtid ));
3905         __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
3906         return;
3907     }
3908     kmp_root_t *root = __kmp_root[gtid];
3909 
3910     KMP_DEBUG_ASSERT( __kmp_threads && __kmp_threads[gtid] );
3911     KMP_ASSERT( KMP_UBER_GTID( gtid ));
3912     KMP_ASSERT( root == __kmp_threads[gtid]->th.th_root );
3913     KMP_ASSERT( root->r.r_active == FALSE );
3914 
3915 
3916     KMP_MB();
3917 
3918 #if OMP_45_ENABLED
3919    kmp_info_t * thread = __kmp_threads[gtid];
3920    kmp_team_t * team = thread->th.th_team;
3921    kmp_task_team_t *   task_team = thread->th.th_task_team;
3922 
3923    // we need to wait for the proxy tasks before finishing the thread
3924    if ( task_team != NULL && task_team->tt.tt_found_proxy_tasks ) {
3925 #if OMPT_SUPPORT
3926         // the runtime is shutting down so we won't report any events
3927         thread->th.ompt_thread_info.state = ompt_state_undefined;
3928 #endif
3929         __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
3930    }
3931 #endif
3932 
3933     __kmp_reset_root(gtid, root);
3934 
3935     /* free up this thread slot */
3936     __kmp_gtid_set_specific( KMP_GTID_DNE );
3937 #ifdef KMP_TDATA_GTID
3938     __kmp_gtid = KMP_GTID_DNE;
3939 #endif
3940 
3941     KMP_MB();
3942     KC_TRACE( 10, ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid ));
3943 
3944     __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
3945 }
3946 
3947 #if KMP_OS_WINDOWS
3948 /* __kmp_forkjoin_lock must be already held
3949    Unregisters a root thread that is not the current thread.  Returns the number of
3950    __kmp_threads entries freed as a result.
3951  */
3952 static int
3953 __kmp_unregister_root_other_thread( int gtid )
3954 {
3955     kmp_root_t *root = __kmp_root[gtid];
3956     int r;
3957 
3958     KA_TRACE( 1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid ));
3959     KMP_DEBUG_ASSERT( __kmp_threads && __kmp_threads[gtid] );
3960     KMP_ASSERT( KMP_UBER_GTID( gtid ));
3961     KMP_ASSERT( root == __kmp_threads[gtid]->th.th_root );
3962     KMP_ASSERT( root->r.r_active == FALSE );
3963 
3964     r = __kmp_reset_root(gtid, root);
3965     KC_TRACE( 10, ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid ));
3966     return r;
3967 }
3968 #endif
3969 
3970 #if KMP_DEBUG
3971 void __kmp_task_info() {
3972 
3973     kmp_int32 gtid       = __kmp_entry_gtid();
3974     kmp_int32 tid        = __kmp_tid_from_gtid( gtid );
3975     kmp_info_t *this_thr = __kmp_threads[ gtid ];
3976     kmp_team_t *steam    = this_thr->th.th_serial_team;
3977     kmp_team_t *team     = this_thr->th.th_team;
3978 
3979     __kmp_printf( "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p curtask=%p ptask=%p\n",
3980         gtid, tid, this_thr, team, this_thr->th.th_current_task, team->t.t_implicit_task_taskdata[tid].td_parent );
3981 }
3982 #endif // KMP_DEBUG
3983 
3984 /* TODO optimize with one big memclr, take out what isn't needed,
3985  * split responsibility to workers as much as possible, and delay
3986  * initialization of features as much as possible  */
3987 static void
3988 __kmp_initialize_info( kmp_info_t *this_thr, kmp_team_t *team, int tid, int gtid )
3989 {
    /* this_thr->th.th_info.ds.ds_gtid is set up in kmp_allocate_thread/create_worker
     * this_thr->th.th_serial_team is set up in __kmp_allocate_thread */
3992     kmp_info_t *master = team->t.t_threads[0];
3993     KMP_DEBUG_ASSERT( this_thr != NULL );
3994     KMP_DEBUG_ASSERT( this_thr->th.th_serial_team );
3995     KMP_DEBUG_ASSERT( team );
3996     KMP_DEBUG_ASSERT( team->t.t_threads  );
3997     KMP_DEBUG_ASSERT( team->t.t_dispatch );
3998     KMP_DEBUG_ASSERT( master );
3999     KMP_DEBUG_ASSERT( master->th.th_root );
4000 
4001     KMP_MB();
4002 
4003     TCW_SYNC_PTR(this_thr->th.th_team, team);
4004 
4005     this_thr->th.th_info.ds.ds_tid  = tid;
4006     this_thr->th.th_set_nproc       = 0;
4007     if (__kmp_tasking_mode != tskm_immediate_exec)
4008         // When tasking is possible, threads are not safe to reap until they are
4009         // done tasking; this will be set when tasking code is exited in wait
4010         this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4011     else  // no tasking --> always safe to reap
4012         this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4013 #if OMP_40_ENABLED
4014     this_thr->th.th_set_proc_bind   = proc_bind_default;
4015 # if KMP_AFFINITY_SUPPORTED
4016     this_thr->th.th_new_place       = this_thr->th.th_current_place;
4017 # endif
4018 #endif
4019     this_thr->th.th_root            = master->th.th_root;
4020 
4021     /* setup the thread's cache of the team structure */
4022     this_thr->th.th_team_nproc      = team->t.t_nproc;
4023     this_thr->th.th_team_master     = master;
4024     this_thr->th.th_team_serialized = team->t.t_serialized;
4025     TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4026 
4027     KMP_DEBUG_ASSERT( team->t.t_implicit_task_taskdata );
4028 
4029     KF_TRACE( 10, ( "__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4030                     tid, gtid, this_thr, this_thr->th.th_current_task ) );
4031 
4032     __kmp_init_implicit_task( this_thr->th.th_team_master->th.th_ident, this_thr, team, tid, TRUE );
4033 
4034     KF_TRACE( 10, ( "__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4035                     tid, gtid, this_thr, this_thr->th.th_current_task ) );
4036     // TODO: Initialize ICVs from parent; GEH - isn't that already done in __kmp_initialize_team()?
4037 
4038     /* TODO no worksharing in speculative threads */
4039     this_thr->th.th_dispatch      = &team->t.t_dispatch[ tid ];
4040 
4041     this_thr->th.th_local.this_construct = 0;
4042 
4043 #ifdef BUILD_TV
4044     this_thr->th.th_local.tv_data = 0;
4045 #endif
4046 
4047     if ( ! this_thr->th.th_pri_common ) {
4048         this_thr->th.th_pri_common = (struct common_table *) __kmp_allocate( sizeof(struct common_table) );
4049         if ( __kmp_storage_map ) {
4050             __kmp_print_storage_map_gtid(
4051                 gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4052                 sizeof( struct common_table ), "th_%d.th_pri_common\n", gtid
4053             );
4054         }; // if
4055         this_thr->th.th_pri_head = NULL;
4056     }; // if
4057 
4058     /* Initialize dynamic dispatch */
4059     {
4060         volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4061         /*
4062          * Use team max_nproc since this will never change for the team.
4063          */
4064         size_t disp_size = sizeof( dispatch_private_info_t ) *
4065             ( team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers );
4066         KD_TRACE( 10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid, team->t.t_max_nproc ) );
4067         KMP_ASSERT( dispatch );
4068         KMP_DEBUG_ASSERT( team->t.t_dispatch );
4069         KMP_DEBUG_ASSERT( dispatch == &team->t.t_dispatch[ tid ] );
4070 
4071         dispatch->th_disp_index = 0;
4072 #if OMP_45_ENABLED
4073         dispatch->th_doacross_buf_idx = 0;
4074 #endif
4075         if( ! dispatch->th_disp_buffer )  {
4076             dispatch->th_disp_buffer = (dispatch_private_info_t *) __kmp_allocate( disp_size );
4077 
4078             if ( __kmp_storage_map ) {
4079                 __kmp_print_storage_map_gtid( gtid, &dispatch->th_disp_buffer[ 0 ],
4080                                          &dispatch->th_disp_buffer[ team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers ],
4081                                          disp_size, "th_%d.th_dispatch.th_disp_buffer "
4082                                          "(team_%d.t_dispatch[%d].th_disp_buffer)",
4083                                          gtid, team->t.t_id, gtid );
4084             }
4085         } else {
4086             memset( & dispatch->th_disp_buffer[0], '\0', disp_size );
4087         }
4088 
4089         dispatch->th_dispatch_pr_current = 0;
4090         dispatch->th_dispatch_sh_current = 0;
4091 
4092         dispatch->th_deo_fcn = 0;             /* ORDERED     */
4093         dispatch->th_dxo_fcn = 0;             /* END ORDERED */
4094     }
4095 
4096     this_thr->th.th_next_pool = NULL;
4097 
4098     if (!this_thr->th.th_task_state_memo_stack) {
4099         size_t i;
4100         this_thr->th.th_task_state_memo_stack = (kmp_uint8 *) __kmp_allocate( 4*sizeof(kmp_uint8) );
4101         this_thr->th.th_task_state_top = 0;
4102         this_thr->th.th_task_state_stack_sz = 4;
4103         for (i=0; i<this_thr->th.th_task_state_stack_sz; ++i) // zero init the stack
4104             this_thr->th.th_task_state_memo_stack[i] = 0;
4105     }
4106 
4107     KMP_DEBUG_ASSERT( !this_thr->th.th_spin_here );
4108     KMP_DEBUG_ASSERT( this_thr->th.th_next_waiting == 0 );
4109 
4110     KMP_MB();
4111 }
4112 
4113 
4114 /* allocate a new thread for the requesting team.  this is only called from within a
4115  * forkjoin critical section.  we will first try to get an available thread from the
4116  * thread pool.  if none is available, we will fork a new one assuming we are able
4117  * to create a new one.  this should be assured, as the caller should check on this
4118  * first.
4119  */
4120 kmp_info_t *
4121 __kmp_allocate_thread( kmp_root_t *root, kmp_team_t *team, int new_tid )
4122 {
4123     kmp_team_t  *serial_team;
4124     kmp_info_t  *new_thr;
4125     int          new_gtid;
4126 
4127     KA_TRACE( 20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid() ));
4128     KMP_DEBUG_ASSERT( root && team );
4129 #if !KMP_NESTED_HOT_TEAMS
4130     KMP_DEBUG_ASSERT( KMP_MASTER_GTID( __kmp_get_gtid() ));
4131 #endif
4132     KMP_MB();
4133 
4134     /* first, try to get one from the thread pool */
4135     if ( __kmp_thread_pool ) {
4136 
4137         new_thr = (kmp_info_t*)__kmp_thread_pool;
4138         __kmp_thread_pool = (volatile kmp_info_t *) new_thr->th.th_next_pool;
4139         if ( new_thr == __kmp_thread_pool_insert_pt ) {
4140             __kmp_thread_pool_insert_pt = NULL;
4141         }
4142         TCW_4(new_thr->th.th_in_pool, FALSE);
4143         //
4144         // Don't touch th_active_in_pool or th_active.
4145         // The worker thread adjusts those flags as it sleeps/awakens.
4146         //
4147         __kmp_thread_pool_nth--;
4148 
4149         KA_TRACE( 20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4150                     __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid ));
4151         KMP_ASSERT(       ! new_thr->th.th_team );
4152         KMP_DEBUG_ASSERT( __kmp_nth < __kmp_threads_capacity );
4153         KMP_DEBUG_ASSERT( __kmp_thread_pool_nth >= 0 );
4154 
4155         /* setup the thread structure */
4156         __kmp_initialize_info( new_thr, team, new_tid, new_thr->th.th_info.ds.ds_gtid );
4157         KMP_DEBUG_ASSERT( new_thr->th.th_serial_team );
4158 
4159         TCW_4(__kmp_nth, __kmp_nth + 1);
4160 
4161         new_thr->th.th_task_state = 0;
4162         new_thr->th.th_task_state_top = 0;
4163         new_thr->th.th_task_state_stack_sz = 4;
4164 
4165 #ifdef KMP_ADJUST_BLOCKTIME
        /* Adjust blocktime back to zero if necessary       */
4167         /* Middle initialization might not have occurred yet */
4168         if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
4169             if ( __kmp_nth > __kmp_avail_proc ) {
4170                 __kmp_zero_bt = TRUE;
4171             }
4172         }
4173 #endif /* KMP_ADJUST_BLOCKTIME */
4174 
4175 #if KMP_DEBUG
        // If the thread entered the pool via __kmp_free_thread, wait_flag should not be KMP_BARRIER_PARENT_FLAG.
4177         int b;
4178         kmp_balign_t * balign = new_thr->th.th_bar;
4179         for( b = 0; b < bs_last_barrier; ++ b )
4180             KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4181 #endif
4182 
4183         KF_TRACE( 10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4184                     __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid ));
4185 
4186         KMP_MB();
4187         return new_thr;
4188     }
4189 
4190 
    /* no, we'll fork a new one */
4192     KMP_ASSERT( __kmp_nth    == __kmp_all_nth );
4193     KMP_ASSERT( __kmp_all_nth < __kmp_threads_capacity );
4194 
4195 #if KMP_USE_MONITOR
4196     //
4197     // If this is the first worker thread the RTL is creating, then also
4198     // launch the monitor thread.  We try to do this as early as possible.
4199     //
4200     if ( ! TCR_4( __kmp_init_monitor ) ) {
4201         __kmp_acquire_bootstrap_lock( & __kmp_monitor_lock );
4202         if ( ! TCR_4( __kmp_init_monitor ) ) {
4203             KF_TRACE( 10, ( "before __kmp_create_monitor\n" ) );
4204             TCW_4( __kmp_init_monitor, 1 );
4205             __kmp_create_monitor( & __kmp_monitor );
4206             KF_TRACE( 10, ( "after __kmp_create_monitor\n" ) );
4207             #if KMP_OS_WINDOWS
                // AC: wait until the monitor has started. This is a fix for CQ232808.
                //     The reason is that if the library is loaded/unloaded in a loop with small (parallel)
                //     work in between, then there is a high probability that the monitor thread only starts
                //     after the library has shut down. At shutdown it is too late to cope with the problem,
                //     because when the master is in DllMain (process detach) the monitor has no chance to
                //     start (it is blocked), and the master has no means to inform the monitor that the
                //     library has gone, because all the memory the monitor can access is going to be
                //     released/reset.
4215                 while ( TCR_4(__kmp_init_monitor) < 2 ) {
4216                     KMP_YIELD( TRUE );
4217                 }
4218                 KF_TRACE( 10, ( "after monitor thread has started\n" ) );
4219             #endif
4220         }
4221         __kmp_release_bootstrap_lock( & __kmp_monitor_lock );
4222     }
4223 #endif
4224 
4225     KMP_MB();
4226     for( new_gtid=1 ; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid ) {
4227         KMP_DEBUG_ASSERT( new_gtid < __kmp_threads_capacity );
4228     }
4229 
4230     /* allocate space for it. */
4231     new_thr = (kmp_info_t*) __kmp_allocate( sizeof(kmp_info_t) );
4232 
4233     TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4234 
4235     if ( __kmp_storage_map ) {
4236         __kmp_print_thread_storage_map( new_thr, new_gtid );
4237     }
4238 
4239     /* add the reserve serialized team, initialized from the team's master thread */
4240     {
4241     kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs( team );
4242     KF_TRACE( 10, ( "__kmp_allocate_thread: before th_serial/serial_team\n" ) );
4243 
4244     new_thr->th.th_serial_team = serial_team =
4245         (kmp_team_t*) __kmp_allocate_team( root, 1, 1,
4246 #if OMPT_SUPPORT
4247                                            0, // root parallel id
4248 #endif
4249 #if OMP_40_ENABLED
4250                                            proc_bind_default,
4251 #endif
4252                                            &r_icvs,
4253                                            0 USE_NESTED_HOT_ARG(NULL) );
4254     }
4255     KMP_ASSERT ( serial_team );
    serial_team->t.t_serialized = 0;   // AC: the team is created in reserve, not for execution (it is unused for now).
4257     serial_team->t.t_threads[0] = new_thr;
4258     KF_TRACE( 10, ( "__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4259       new_thr ) );
4260 
4261     /* setup the thread structures */
4262     __kmp_initialize_info( new_thr, team, new_tid, new_gtid );
4263 
4264     #if USE_FAST_MEMORY
4265         __kmp_initialize_fast_memory( new_thr );
4266     #endif /* USE_FAST_MEMORY */
4267 
4268     #if KMP_USE_BGET
4269         KMP_DEBUG_ASSERT( new_thr->th.th_local.bget_data == NULL );
4270         __kmp_initialize_bget( new_thr );
4271     #endif
4272 
4273     __kmp_init_random( new_thr );  // Initialize random number generator
4274 
4275     /* Initialize these only once when thread is grabbed for a team allocation */
4276     KA_TRACE( 20, ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4277                     __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
4278 
4279     int b;
4280     kmp_balign_t * balign = new_thr->th.th_bar;
4281     for(b=0; b<bs_last_barrier; ++b) {
4282         balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4283         balign[b].bb.team = NULL;
4284         balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4285         balign[b].bb.use_oncore_barrier = 0;
4286     }
4287 
4288     new_thr->th.th_spin_here = FALSE;
4289     new_thr->th.th_next_waiting = 0;
4290 
4291 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4292     new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4293     new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4294     new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4295     new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4296 #endif
4297 
4298     TCW_4(new_thr->th.th_in_pool, FALSE);
4299     new_thr->th.th_active_in_pool = FALSE;
4300     TCW_4(new_thr->th.th_active, TRUE);
4301 
4302     /* adjust the global counters */
4303     __kmp_all_nth ++;
4304     __kmp_nth ++;
4305 
4306     //
4307     // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search)
4308     // for low numbers of procs, and method #2 (keyed API call) for higher
4309     // numbers of procs.
4310     //
4311     if ( __kmp_adjust_gtid_mode ) {
4312         if ( __kmp_all_nth >= __kmp_tls_gtid_min ) {
4313             if ( TCR_4(__kmp_gtid_mode) != 2) {
4314                 TCW_4(__kmp_gtid_mode, 2);
4315             }
4316         }
4317         else {
4318             if (TCR_4(__kmp_gtid_mode) != 1 ) {
4319                 TCW_4(__kmp_gtid_mode, 1);
4320             }
4321         }
4322     }
4323 
4324 #ifdef KMP_ADJUST_BLOCKTIME
4325     /* Adjust blocktime back to zero if necessary       */
4326     /* Middle initialization might not have occurred yet */
4327     if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
4328         if ( __kmp_nth > __kmp_avail_proc ) {
4329             __kmp_zero_bt = TRUE;
4330         }
4331     }
4332 #endif /* KMP_ADJUST_BLOCKTIME */
4333 
4334     /* actually fork it and create the new worker thread */
4335     KF_TRACE( 10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr ));
4336     __kmp_create_worker( new_gtid, new_thr, __kmp_stksize );
4337     KF_TRACE( 10, ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr ));
4338 
4339     KA_TRACE( 20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(), new_gtid ));
4340     KMP_MB();
4341     return new_thr;
4342 }
4343 
4344 /*
4345  * reinitialize team for reuse.
4346  *
 * The hot team code calls this routine at every fork barrier, so the EPCC
 * barrier tests are extremely sensitive to changes in it, esp. writes to the team
4349  * struct, which cause a cache invalidation in all threads.
4350  *
4351  * IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!!
4352  */
4353 static void
4354 __kmp_reinitialize_team( kmp_team_t *team, kmp_internal_control_t *new_icvs, ident_t *loc ) {
4355     KF_TRACE( 10, ( "__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4356                     team->t.t_threads[0], team ) );
4357     KMP_DEBUG_ASSERT( team && new_icvs);
4358     KMP_DEBUG_ASSERT( ( ! TCR_4(__kmp_init_parallel) ) || new_icvs->nproc );
4359     KMP_CHECK_UPDATE(team->t.t_ident, loc);
4360 
4361     KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4362 
4363     // Copy ICVs to the master thread's implicit taskdata
4364     __kmp_init_implicit_task( loc, team->t.t_threads[0], team, 0, FALSE );
4365     copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4366 
4367     KF_TRACE( 10, ( "__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4368                     team->t.t_threads[0], team ) );
4369 }
4370 
4371 
4372 /* initialize the team data structure
4373  * this assumes the t_threads and t_max_nproc are already set
4374  * also, we don't touch the arguments */
4375 static void
4376 __kmp_initialize_team(
4377     kmp_team_t * team,
4378     int          new_nproc,
4379     kmp_internal_control_t * new_icvs,
4380     ident_t *                loc
4381 ) {
4382     KF_TRACE( 10, ( "__kmp_initialize_team: enter: team=%p\n", team ) );
4383 
4384     /* verify */
4385     KMP_DEBUG_ASSERT( team );
4386     KMP_DEBUG_ASSERT( new_nproc <= team->t.t_max_nproc );
4387     KMP_DEBUG_ASSERT( team->t.t_threads );
4388     KMP_MB();
4389 
4390     team->t.t_master_tid  = 0;    /* not needed */
4391     /* team->t.t_master_bar;        not needed */
4392     team->t.t_serialized  = new_nproc > 1 ? 0 : 1;
4393     team->t.t_nproc       = new_nproc;
4394 
4395     /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
4396     team->t.t_next_pool   = NULL;
4397     /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess up hot team */
4398 
4399     TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4400     team->t.t_invoke      = NULL; /* not needed */
4401 
4402     // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
4403     team->t.t_sched       = new_icvs->sched;
4404 
4405 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4406     team->t.t_fp_control_saved = FALSE; /* not needed */
4407     team->t.t_x87_fpu_control_word = 0; /* not needed */
4408     team->t.t_mxcsr = 0;                /* not needed */
4409 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4410 
4411     team->t.t_construct   = 0;
4412     __kmp_init_lock( & team->t.t_single_lock );
4413 
4414     team->t.t_ordered .dt.t_value = 0;
4415     team->t.t_master_active = FALSE;
4416 
4417     memset( & team->t.t_taskq, '\0', sizeof( kmp_taskq_t ));
4418 
4419 #ifdef KMP_DEBUG
4420     team->t.t_copypriv_data = NULL;  /* not necessary, but nice for debugging */
4421 #endif
4422     team->t.t_copyin_counter = 0;    /* for barrier-free copyin implementation */
4423 
4424     team->t.t_control_stack_top = NULL;
4425 
4426     __kmp_reinitialize_team( team, new_icvs, loc );
4427 
4428     KMP_MB();
4429     KF_TRACE( 10, ( "__kmp_initialize_team: exit: team=%p\n", team ) );
4430 }
4431 
4432 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4433 /* Sets full mask for thread and returns old mask, no changes to structures. */
4434 static void
4435 __kmp_set_thread_affinity_mask_full_tmp( kmp_affin_mask_t *old_mask )
4436 {
4437     if ( KMP_AFFINITY_CAPABLE() ) {
4438         int status;
4439         if ( old_mask != NULL ) {
4440             status = __kmp_get_system_affinity( old_mask, TRUE );
4441             int error = errno;
4442             if ( status != 0 ) {
4443                 __kmp_msg(
4444                     kmp_ms_fatal,
4445                     KMP_MSG( ChangeThreadAffMaskError ),
4446                     KMP_ERR( error ),
4447                     __kmp_msg_null
4448                 );
4449             }
4450         }
4451         __kmp_set_system_affinity( __kmp_affin_fullMask, TRUE );
4452     }
4453 }
4454 #endif
4455 
4456 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4457 
4458 //
4459 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
// It calculates the worker + master thread's partition based upon the parent
// thread's partition, and binds each worker to a place in its partition.
4462 // The master thread's partition should already include its current binding.
4463 //
4464 static void
4465 __kmp_partition_places( kmp_team_t *team, int update_master_only )
4466 {
4467     //
    // Copy the master thread's place partition to the team struct
4469     //
4470     kmp_info_t *master_th = team->t.t_threads[0];
4471     KMP_DEBUG_ASSERT( master_th != NULL );
4472     kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4473     int first_place = master_th->th.th_first_place;
4474     int last_place = master_th->th.th_last_place;
4475     int masters_place = master_th->th.th_current_place;
4476     team->t.t_first_place = first_place;
4477     team->t.t_last_place = last_place;
4478 
4479     KA_TRACE( 20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) bound to place %d partition = [%d,%d]\n",
4480        proc_bind, __kmp_gtid_from_thread( team->t.t_threads[0] ), team->t.t_id,
4481        masters_place, first_place, last_place ) );
4482 
4483     switch ( proc_bind ) {
4484 
4485         case proc_bind_default:
4486         //
4487         // serial teams might have the proc_bind policy set to
4488         // proc_bind_default.  It doesn't matter, as we don't
4489         // rebind the master thread for any proc_bind policy.
4490         //
4491         KMP_DEBUG_ASSERT( team->t.t_nproc == 1 );
4492         break;
4493 
4494         case proc_bind_master:
4495         {
4496             int f;
4497             int n_th = team->t.t_nproc;
4498             for ( f = 1; f < n_th; f++ ) {
4499                 kmp_info_t *th = team->t.t_threads[f];
4500                 KMP_DEBUG_ASSERT( th != NULL );
4501                 th->th.th_first_place = first_place;
4502                 th->th.th_last_place = last_place;
4503                 th->th.th_new_place = masters_place;
4504 
4505                 KA_TRACE( 100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4506                   __kmp_gtid_from_thread( team->t.t_threads[f] ),
4507                   team->t.t_id, f, masters_place, first_place, last_place ) );
4508             }
4509         }
4510         break;
4511 
4512         case proc_bind_close:
4513         {
4514             int f;
4515             int n_th = team->t.t_nproc;
4516             int n_places;
4517             if ( first_place <= last_place ) {
4518                 n_places = last_place - first_place + 1;
4519             }
4520             else {
4521                 n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4522             }
4523             if ( n_th <= n_places ) {
4524                 int place = masters_place;
4525                 for ( f = 1; f < n_th; f++ ) {
4526                     kmp_info_t *th = team->t.t_threads[f];
4527                     KMP_DEBUG_ASSERT( th != NULL );
4528 
4529                     if ( place == last_place ) {
4530                         place = first_place;
4531                     }
4532                     else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4533                         place = 0;
4534                     }
4535                     else {
4536                         place++;
4537                     }
4538                     th->th.th_first_place = first_place;
4539                     th->th.th_last_place = last_place;
4540                     th->th.th_new_place = place;
4541 
4542                     KA_TRACE( 100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4543                        __kmp_gtid_from_thread( team->t.t_threads[f] ),
4544                        team->t.t_id, f, place, first_place, last_place ) );
4545                 }
4546             }
4547             else {
4548                 int S, rem, gap, s_count;
4549                 S = n_th / n_places;
4550                 s_count = 0;
4551                 rem = n_th - ( S * n_places );
4552                 gap = rem > 0 ? n_places/rem : n_places;
4553                 int place = masters_place;
4554                 int gap_ct = gap;
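                // Distribution sketch (illustrative numbers only): with
                // n_th = 10 threads and n_places = 4 places we get S = 2,
                // rem = 2 and gap = 2, so the loop below packs 3,2,3,2
                // threads onto the four places, starting at the master's
                // place and wrapping back to it at the end.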
4555                 for ( f = 0; f < n_th; f++ ) {
4556                     kmp_info_t *th = team->t.t_threads[f];
4557                     KMP_DEBUG_ASSERT( th != NULL );
4558 
4559                     th->th.th_first_place = first_place;
4560                     th->th.th_last_place = last_place;
4561                     th->th.th_new_place = place;
4562                     s_count++;
4563 
4564                     if ( (s_count == S) && rem && (gap_ct == gap) ) {
4565                         // do nothing, add an extra thread to place on next iteration
4566                     }
4567                     else if ( (s_count == S+1) && rem && (gap_ct == gap) ) {
4568                         // we added an extra thread to this place; move to next place
4569                         if ( place == last_place ) {
4570                             place = first_place;
4571                         }
4572                         else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4573                             place = 0;
4574                         }
4575                         else {
4576                             place++;
4577                         }
4578                         s_count = 0;
4579                         gap_ct = 1;
4580                         rem--;
4581                     }
4582                     else if (s_count == S) { // place full; don't add extra
4583                         if ( place == last_place ) {
4584                             place = first_place;
4585                         }
4586                         else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4587                             place = 0;
4588                         }
4589                         else {
4590                             place++;
4591                         }
4592                         gap_ct++;
4593                         s_count = 0;
4594                     }
4595 
4596                     KA_TRACE( 100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4597                       __kmp_gtid_from_thread( team->t.t_threads[f] ),
4598                       team->t.t_id, f, th->th.th_new_place, first_place,
4599                       last_place ) );
4600                 }
4601                 KMP_DEBUG_ASSERT( place == masters_place );
4602             }
4603         }
4604         break;
4605 
4606         case proc_bind_spread:
4607         {
4608             int f;
4609             int n_th = team->t.t_nproc;
4610             int n_places;
4611             int thidx;
4612             if ( first_place <= last_place ) {
4613                 n_places = last_place - first_place + 1;
4614             }
4615             else {
4616                 n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4617             }
4618             if ( n_th <= n_places ) {
4619                 int place = masters_place;
4620                 int S = n_places/n_th;
4621                 int s_count, rem, gap, gap_ct;
4622                 rem = n_places - n_th*S;
4623                 gap = rem ? n_th/rem : 1;
4624                 gap_ct = gap;
4625                 thidx = n_th;
4626                 if (update_master_only == 1)
4627                     thidx = 1;
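                // Partition sketch (illustrative numbers only): with n_th = 3
                // threads and n_places = 8 places we get S = 2, rem = 2 and
                // gap = 1, so the loop below gives the threads contiguous
                // sub-partitions of 3, 3 and 2 places respectively, the first
                // one starting at the master's place.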
4628                 for ( f = 0; f < thidx; f++ ) {
4629                     kmp_info_t *th = team->t.t_threads[f];
4630                     KMP_DEBUG_ASSERT( th != NULL );
4631 
4632                     th->th.th_first_place = place;
4633                     th->th.th_new_place = place;
4634                     s_count = 1;
4635                     while (s_count < S) {
4636                         if ( place == last_place ) {
4637                             place = first_place;
4638                         }
4639                         else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4640                             place = 0;
4641                         }
4642                         else {
4643                             place++;
4644                         }
4645                         s_count++;
4646                     }
4647                     if (rem && (gap_ct == gap)) {
4648                         if ( place == last_place ) {
4649                             place = first_place;
4650                         }
4651                         else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4652                             place = 0;
4653                         }
4654                         else {
4655                             place++;
4656                         }
4657                         rem--;
4658                         gap_ct = 0;
4659                     }
4660                     th->th.th_last_place = place;
4661                     gap_ct++;
4662 
4663                     if ( place == last_place ) {
4664                         place = first_place;
4665                     }
4666                     else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4667                         place = 0;
4668                     }
4669                     else {
4670                         place++;
4671                     }
4672 
4673                     KA_TRACE( 100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4674                       __kmp_gtid_from_thread( team->t.t_threads[f] ),
4675                       team->t.t_id, f, th->th.th_new_place,
4676                       th->th.th_first_place, th->th.th_last_place ) );
4677                 }
4678                 KMP_DEBUG_ASSERT( update_master_only || place == masters_place );
4679             }
4680             else {
4681                 int S, rem, gap, s_count;
4682                 S = n_th / n_places;
4683                 s_count = 0;
4684                 rem = n_th - ( S * n_places );
4685                 gap = rem > 0 ? n_places/rem : n_places;
4686                 int place = masters_place;
4687                 int gap_ct = gap;
4688                 thidx = n_th;
4689                 if (update_master_only == 1)
4690                     thidx = 1;
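                // Illustrative note: this is the same "more threads than
                // places" distribution as in proc_bind_close above (S threads
                // per place, with rem places taking one extra, spaced gap
                // apart), except that each thread's partition is narrowed to
                // the single place it is bound to.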
4691                 for ( f = 0; f < thidx; f++ ) {
4692                     kmp_info_t *th = team->t.t_threads[f];
4693                     KMP_DEBUG_ASSERT( th != NULL );
4694 
4695                     th->th.th_first_place = place;
4696                     th->th.th_last_place = place;
4697                     th->th.th_new_place = place;
4698                     s_count++;
4699 
4700                     if ( (s_count == S) && rem && (gap_ct == gap) ) {
4701                         // do nothing, add an extra thread to place on next iteration
4702                     }
4703                     else if ( (s_count == S+1) && rem && (gap_ct == gap) ) {
4704                         // we added an extra thread to this place; move on to next place
4705                         if ( place == last_place ) {
4706                             place = first_place;
4707                         }
4708                         else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4709                             place = 0;
4710                         }
4711                         else {
4712                             place++;
4713                         }
4714                         s_count = 0;
4715                         gap_ct = 1;
4716                         rem--;
4717                     }
4718                     else if (s_count == S) { // place is full; don't add extra thread
4719                         if ( place == last_place ) {
4720                             place = first_place;
4721                         }
4722                         else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4723                             place = 0;
4724                         }
4725                         else {
4726                             place++;
4727                         }
4728                         gap_ct++;
4729                         s_count = 0;
4730                     }
4731 
4732                     KA_TRACE( 100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4733                        __kmp_gtid_from_thread( team->t.t_threads[f] ),
4734                        team->t.t_id, f, th->th.th_new_place,
4735                        th->th.th_first_place, th->th.th_last_place) );
4736                 }
4737                 KMP_DEBUG_ASSERT( update_master_only || place == masters_place );
4738             }
4739         }
4740         break;
4741 
4742         default:
4743         break;
4744     }
4745 
4746     KA_TRACE( 20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id ) );
4747 }
4748 
4749 #endif /* OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED */
4750 
4751 /* allocate a new team data structure to use.  take one off of the free pool if available */
4752 kmp_team_t *
4753 __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc,
4754 #if OMPT_SUPPORT
4755     ompt_parallel_id_t ompt_parallel_id,
4756 #endif
4757 #if OMP_40_ENABLED
4758     kmp_proc_bind_t new_proc_bind,
4759 #endif
4760     kmp_internal_control_t *new_icvs,
4761     int argc USE_NESTED_HOT_ARG(kmp_info_t *master) )
4762 {
4763     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4764     int f;
4765     kmp_team_t *team;
4766     int use_hot_team = ! root->r.r_active;
4767     int level = 0;
4768 
4769     KA_TRACE( 20, ("__kmp_allocate_team: called\n"));
4770     KMP_DEBUG_ASSERT( new_nproc >=1 && argc >=0 );
4771     KMP_DEBUG_ASSERT( max_nproc >= new_nproc );
4772     KMP_MB();
4773 
4774 #if KMP_NESTED_HOT_TEAMS
4775     kmp_hot_team_ptr_t *hot_teams;
4776     if( master ) {
4777         team = master->th.th_team;
4778         level = team->t.t_active_level;
4779         if( master->th.th_teams_microtask ) {                         // in teams construct?
4780             if( master->th.th_teams_size.nteams > 1 && (             // #teams > 1
4781                 team->t.t_pkfn == (microtask_t)__kmp_teams_master || // inner fork of the teams
4782                 master->th.th_teams_level < team->t.t_level ) ) {    // or nested parallel inside the teams
                ++level; // do not increment if #teams==1 or for the outer fork of the teams; increment otherwise
4784             }
4785         }
4786         hot_teams = master->th.th_hot_teams;
4787         if( level < __kmp_hot_teams_max_level && hot_teams && hot_teams[level].hot_team )
4788         {   // hot team has already been allocated for given level
4789             use_hot_team = 1;
4790         } else {
4791             use_hot_team = 0;
4792         }
4793     }
4794 #endif
4795     // Optimization to use a "hot" team
4796     if( use_hot_team && new_nproc > 1 ) {
4797         KMP_DEBUG_ASSERT( new_nproc == max_nproc );
4798 #if KMP_NESTED_HOT_TEAMS
4799         team = hot_teams[level].hot_team;
4800 #else
4801         team =  root->r.r_hot_team;
4802 #endif
4803 #if KMP_DEBUG
4804         if ( __kmp_tasking_mode != tskm_immediate_exec ) {
4805             KA_TRACE( 20, ("__kmp_allocate_team: hot team task_team[0] = %p task_team[1] = %p before reinit\n",
4806                            team->t.t_task_team[0], team->t.t_task_team[1] ));
4807         }
4808 #endif
4809 
4810         // Has the number of threads changed?
4811         /* Let's assume the most common case is that the number of threads is unchanged, and
4812            put that case first. */
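        // Three cases follow: the team size is unchanged (fast path), the hot
        // team is larger than requested (release or park the extra threads),
        // or it is smaller than requested (pull in any reserved threads and/or
        // allocate new workers, then reinitialize the whole team).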
4813         if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
4814             KA_TRACE( 20, ("__kmp_allocate_team: reusing hot team\n" ));
4815             // This case can mean that omp_set_num_threads() was called and the hot team size
4816             // was already reduced, so we check the special flag
4817             if ( team->t.t_size_changed == -1 ) {
4818                 team->t.t_size_changed = 1;
4819             } else {
4820                 KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
4821             }
4822 
4823             // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4824             kmp_r_sched_t new_sched = new_icvs->sched;
4825             if (team->t.t_sched.r_sched_type != new_sched.r_sched_type ||
4826                 team->t.t_sched.chunk != new_sched.chunk)
4827                 team->t.t_sched = new_sched; // set master's schedule as new run-time schedule
4828 
4829             __kmp_reinitialize_team( team, new_icvs, root->r.r_uber_thread->th.th_ident );
4830 
4831             KF_TRACE( 10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n",
4832                            0, team->t.t_threads[0], team ) );
4833             __kmp_push_current_task_to_thread( team->t.t_threads[ 0 ], team, 0 );
4834 
4835 #if OMP_40_ENABLED
4836 # if KMP_AFFINITY_SUPPORTED
4837             if ( ( team->t.t_size_changed == 0 )
4838               && ( team->t.t_proc_bind == new_proc_bind ) ) {
4839                 if (new_proc_bind == proc_bind_spread) {
4840                     __kmp_partition_places(team, 1); // add flag to update only master for spread
4841                 }
4842                 KA_TRACE( 200, ("__kmp_allocate_team: reusing hot team #%d bindings: proc_bind = %d, partition = [%d,%d]\n",
4843                   team->t.t_id, new_proc_bind, team->t.t_first_place,
4844                   team->t.t_last_place ) );
4845             }
4846             else {
4847                 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4848                 __kmp_partition_places( team );
4849             }
4850 # else
4851             KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4852 # endif /* KMP_AFFINITY_SUPPORTED */
4853 #endif /* OMP_40_ENABLED */
4854         }
4855         else if( team->t.t_nproc > new_nproc ) {
4856             KA_TRACE( 20, ("__kmp_allocate_team: decreasing hot team thread count to %d\n", new_nproc ));
4857 
4858             team->t.t_size_changed = 1;
4859 #if KMP_NESTED_HOT_TEAMS
4860             if( __kmp_hot_teams_mode == 0 ) {
                // AC: the saved number of threads should correspond to the team's value in this mode;
                // it can be bigger in mode 1, when the hot team has some threads in reserve.
4863                 KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
4864                 hot_teams[level].hot_team_nth = new_nproc;
4865 #endif // KMP_NESTED_HOT_TEAMS
4866                 /* release the extra threads we don't need any more */
4867                 for( f = new_nproc  ;  f < team->t.t_nproc  ;  f++ ) {
4868                     KMP_DEBUG_ASSERT( team->t.t_threads[ f ] );
4869                     if ( __kmp_tasking_mode != tskm_immediate_exec) {
4870                         // When decreasing team size, threads no longer in the team should unref task team.
4871                         team->t.t_threads[f]->th.th_task_team = NULL;
4872                     }
4873                     __kmp_free_thread( team->t.t_threads[ f ] );
4874                     team->t.t_threads[ f ] = NULL;
4875                 }
4876 #if KMP_NESTED_HOT_TEAMS
4877             } // (__kmp_hot_teams_mode == 0)
4878             else {
4879                 // When keeping extra threads in team, switch threads to wait on own b_go flag
4880                 for (f=new_nproc; f<team->t.t_nproc; ++f) {
4881                     KMP_DEBUG_ASSERT(team->t.t_threads[f]);
4882                     kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
4883                     for (int b=0; b<bs_last_barrier; ++b) {
4884                         if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
4885                             balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
4886                         }
4887                         KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
4888                     }
4889                 }
4890             }
4891 #endif // KMP_NESTED_HOT_TEAMS
4892             team->t.t_nproc =  new_nproc;
4893             // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4894             if (team->t.t_sched.r_sched_type != new_icvs->sched.r_sched_type ||
4895                 team->t.t_sched.chunk != new_icvs->sched.chunk)
4896                 team->t.t_sched = new_icvs->sched;
4897             __kmp_reinitialize_team( team, new_icvs, root->r.r_uber_thread->th.th_ident );
4898 
4899             /* update the remaining threads */
4900             for(f = 0; f < new_nproc; ++f) {
4901                 team->t.t_threads[f]->th.th_team_nproc = new_nproc;
4902             }
4903             // restore the current task state of the master thread: should be the implicit task
4904             KF_TRACE( 10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n",
4905                        0, team->t.t_threads[0], team ) );
4906 
4907             __kmp_push_current_task_to_thread( team->t.t_threads[ 0 ], team, 0 );
4908 
4909 #ifdef KMP_DEBUG
4910             for ( f = 0; f < team->t.t_nproc; f++ ) {
4911                 KMP_DEBUG_ASSERT( team->t.t_threads[f] &&
4912                     team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc );
4913             }
4914 #endif
4915 
4916 #if OMP_40_ENABLED
4917             KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4918 # if KMP_AFFINITY_SUPPORTED
4919             __kmp_partition_places( team );
4920 # endif
4921 #endif
4922         }
4923         else { // team->t.t_nproc < new_nproc
4924 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4925             kmp_affin_mask_t *old_mask;
4926             if ( KMP_AFFINITY_CAPABLE() ) {
4927                 KMP_CPU_ALLOC(old_mask);
4928             }
4929 #endif
4930 
4931             KA_TRACE( 20, ("__kmp_allocate_team: increasing hot team thread count to %d\n", new_nproc ));
4932 
4933             team->t.t_size_changed = 1;
4934 
4935 #if KMP_NESTED_HOT_TEAMS
4936             int avail_threads = hot_teams[level].hot_team_nth;
4937             if( new_nproc < avail_threads )
4938                 avail_threads = new_nproc;
4939             kmp_info_t **other_threads = team->t.t_threads;
4940             for ( f = team->t.t_nproc; f < avail_threads; ++f ) {
4941                 // Adjust barrier data of reserved threads (if any) of the team
4942                 // Other data will be set in __kmp_initialize_info() below.
4943                 int b;
4944                 kmp_balign_t * balign = other_threads[f]->th.th_bar;
4945                 for ( b = 0; b < bs_last_barrier; ++ b ) {
4946                     balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
4947                     KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4948 #if USE_DEBUGGER
4949                     balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
4950 #endif
4951                 }
4952             }
4953             if( hot_teams[level].hot_team_nth >= new_nproc ) {
                // we have all the needed threads in reserve, no need to allocate any
                // this is only possible in mode 1; we cannot have reserved threads in mode 0
4956                 KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
4957                 team->t.t_nproc = new_nproc;                     // just get reserved threads involved
4958             } else {
4959                 // we may have some threads in reserve, but not enough
4960                 team->t.t_nproc = hot_teams[level].hot_team_nth; // get reserved threads involved if any
4961                 hot_teams[level].hot_team_nth = new_nproc;       // adjust hot team max size
4962 #endif // KMP_NESTED_HOT_TEAMS
4963             if(team->t.t_max_nproc < new_nproc) {
4964                 /* reallocate larger arrays */
4965                 __kmp_reallocate_team_arrays(team, new_nproc);
4966                 __kmp_reinitialize_team( team, new_icvs, NULL );
4967             }
4968 
4969 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4970             /* Temporarily set full mask for master thread before
4971                creation of workers. The reason is that workers inherit
4972                the affinity from master, so if a lot of workers are
4973                created on the single core quickly, they don't get
4974                a chance to set their own affinity for a long time.
4975             */
4976             __kmp_set_thread_affinity_mask_full_tmp( old_mask );
4977 #endif
4978 
4979             /* allocate new threads for the hot team */
4980             for( f = team->t.t_nproc  ;  f < new_nproc  ;  f++ ) {
4981                 kmp_info_t * new_worker = __kmp_allocate_thread( root, team, f );
4982                 KMP_DEBUG_ASSERT( new_worker );
4983                 team->t.t_threads[ f ] = new_worker;
4984 
4985                 KA_TRACE( 20, ("__kmp_allocate_team: team %d init T#%d arrived: join=%llu, plain=%llu\n",
4986                                 team->t.t_id, __kmp_gtid_from_tid( f, team ), team->t.t_id, f,
4987                                 team->t.t_bar[bs_forkjoin_barrier].b_arrived,
4988                                 team->t.t_bar[bs_plain_barrier].b_arrived ) );
4989 
4990                 { // Initialize barrier data for new threads.
4991                     int b;
4992                     kmp_balign_t * balign = new_worker->th.th_bar;
4993                     for( b = 0; b < bs_last_barrier; ++ b ) {
4994                         balign[ b ].bb.b_arrived        = team->t.t_bar[ b ].b_arrived;
4995                         KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4996 #if USE_DEBUGGER
4997                         balign[ b ].bb.b_worker_arrived = team->t.t_bar[ b ].b_team_arrived;
4998 #endif
4999                     }
5000                 }
5001             }
5002 
5003 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5004             if ( KMP_AFFINITY_CAPABLE() ) {
5005                 /* Restore initial master thread's affinity mask */
5006                 __kmp_set_system_affinity( old_mask, TRUE );
5007                 KMP_CPU_FREE(old_mask);
5008             }
5009 #endif
5010 #if KMP_NESTED_HOT_TEAMS
5011             } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5012 #endif // KMP_NESTED_HOT_TEAMS
            /* make sure everyone is synchronized */
5014             int old_nproc = team->t.t_nproc; // save old value and use to update only new threads below
5015             __kmp_initialize_team( team, new_nproc, new_icvs, root->r.r_uber_thread->th.th_ident );
5016 
5017             /* reinitialize the threads */
5018             KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5019             for (f=0;  f < team->t.t_nproc; ++f)
5020                 __kmp_initialize_info( team->t.t_threads[ f ], team, f, __kmp_gtid_from_tid( f, team ) );
5021             if (level) { // set th_task_state for new threads in nested hot team
5022                 // __kmp_initialize_info() no longer zeroes th_task_state, so we should only need to set the
5023                 // th_task_state for the new threads. th_task_state for master thread will not be accurate until
5024                 // after this in __kmp_fork_call(), so we look to the master's memo_stack to get the correct value.
5025                 for (f=old_nproc; f < team->t.t_nproc; ++f)
5026                     team->t.t_threads[f]->th.th_task_state = team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5027             }
5028             else { // set th_task_state for new threads in non-nested hot team
5029                 int old_state = team->t.t_threads[0]->th.th_task_state; // copy master's state
5030                 for (f=old_nproc; f < team->t.t_nproc; ++f)
5031                     team->t.t_threads[f]->th.th_task_state = old_state;
5032             }
5033 
5034 #ifdef KMP_DEBUG
5035             for ( f = 0; f < team->t.t_nproc; ++ f ) {
5036                 KMP_DEBUG_ASSERT( team->t.t_threads[f] &&
5037                     team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc );
5038             }
5039 #endif
5040 
5041 #if OMP_40_ENABLED
5042             KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5043 # if KMP_AFFINITY_SUPPORTED
5044             __kmp_partition_places( team );
5045 # endif
5046 #endif
5047         } // Check changes in number of threads
5048 
5049 #if OMP_40_ENABLED
5050         kmp_info_t *master = team->t.t_threads[0];
5051         if( master->th.th_teams_microtask ) {
5052             for( f = 1; f < new_nproc; ++f ) {
5053                 // propagate teams construct specific info to workers
5054                 kmp_info_t *thr = team->t.t_threads[f];
5055                 thr->th.th_teams_microtask = master->th.th_teams_microtask;
5056                 thr->th.th_teams_level     = master->th.th_teams_level;
5057                 thr->th.th_teams_size      = master->th.th_teams_size;
5058             }
5059         }
5060 #endif /* OMP_40_ENABLED */
5061 #if KMP_NESTED_HOT_TEAMS
5062         if( level ) {
5063             // Sync barrier state for nested hot teams, not needed for outermost hot team.
5064             for( f = 1; f < new_nproc; ++f ) {
5065                 kmp_info_t *thr = team->t.t_threads[f];
5066                 int b;
5067                 kmp_balign_t * balign = thr->th.th_bar;
5068                 for( b = 0; b < bs_last_barrier; ++ b ) {
5069                     balign[ b ].bb.b_arrived        = team->t.t_bar[ b ].b_arrived;
5070                     KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5071 #if USE_DEBUGGER
5072                     balign[ b ].bb.b_worker_arrived = team->t.t_bar[ b ].b_team_arrived;
5073 #endif
5074                 }
5075             }
5076         }
5077 #endif // KMP_NESTED_HOT_TEAMS
5078 
5079         /* reallocate space for arguments if necessary */
5080         __kmp_alloc_argv_entries( argc, team, TRUE );
5081         KMP_CHECK_UPDATE(team->t.t_argc, argc);
5082         //
5083         // The hot team re-uses the previous task team,
5084         // if untouched during the previous release->gather phase.
5085         //
5086 
5087         KF_TRACE( 10, ( " hot_team = %p\n", team ) );
5088 
5089 #if KMP_DEBUG
5090         if ( __kmp_tasking_mode != tskm_immediate_exec ) {
5091             KA_TRACE( 20, ("__kmp_allocate_team: hot team task_team[0] = %p task_team[1] = %p after reinit\n",
5092                            team->t.t_task_team[0], team->t.t_task_team[1] ));
5093         }
5094 #endif
5095 
5096 #if OMPT_SUPPORT
5097         __ompt_team_assign_id(team, ompt_parallel_id);
5098 #endif
5099 
5100         KMP_MB();
5101 
5102         return team;
5103     }
5104 
5105     /* next, let's try to take one from the team pool */
5106     KMP_MB();
5107     for( team = (kmp_team_t*) __kmp_team_pool ; (team) ; )
5108     {
5109         /* TODO: consider resizing undersized teams instead of reaping them, now that we have a resizing mechanism */
5110         if ( team->t.t_max_nproc >= max_nproc ) {
5111             /* take this team from the team pool */
5112             __kmp_team_pool = team->t.t_next_pool;
5113 
5114             /* setup the team for fresh use */
5115             __kmp_initialize_team( team, new_nproc, new_icvs, NULL );
5116 
5117             KA_TRACE( 20, ( "__kmp_allocate_team: setting task_team[0] %p and task_team[1] %p to NULL\n",
5118                             &team->t.t_task_team[0], &team->t.t_task_team[1]) );
5119             team->t.t_task_team[0] = NULL;
5120             team->t.t_task_team[1] = NULL;
5121 
5122             /* reallocate space for arguments if necessary */
5123             __kmp_alloc_argv_entries( argc, team, TRUE );
5124             KMP_CHECK_UPDATE(team->t.t_argc, argc);
5125 
5126             KA_TRACE( 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5127                             team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
5128             { // Initialize barrier data.
5129                 int b;
5130                 for ( b = 0; b < bs_last_barrier; ++ b) {
5131                     team->t.t_bar[ b ].b_arrived        = KMP_INIT_BARRIER_STATE;
5132 #if USE_DEBUGGER
5133                     team->t.t_bar[ b ].b_master_arrived = 0;
5134                     team->t.t_bar[ b ].b_team_arrived   = 0;
5135 #endif
5136                 }
5137             }
5138 
5139 #if OMP_40_ENABLED
5140             team->t.t_proc_bind = new_proc_bind;
5141 #endif
5142 
5143             KA_TRACE( 20, ("__kmp_allocate_team: using team from pool %d.\n", team->t.t_id ));
5144 
5145 #if OMPT_SUPPORT
5146             __ompt_team_assign_id(team, ompt_parallel_id);
5147 #endif
5148 
5149             KMP_MB();
5150 
5151             return team;
5152         }
5153 
5154         /* reap team if it is too small, then loop back and check the next one */
        /* not sure if this is wise, but it will be redone during the hot-teams rewrite. */
5156         /* TODO: Use technique to find the right size hot-team, don't reap them */
5157         team =  __kmp_reap_team( team );
5158         __kmp_team_pool = team;
5159     }
5160 
5161     /* nothing available in the pool, no matter, make a new team! */
5162     KMP_MB();
5163     team = (kmp_team_t*) __kmp_allocate( sizeof( kmp_team_t ) );
5164 
5165     /* and set it up */
5166     team->t.t_max_nproc   = max_nproc;
    /* NOTE well: for some reason, allocating one big buffer and dividing it
     * up seems to really hurt performance a lot on the P4, so let's not use
     * this... */
5170     __kmp_allocate_team_arrays( team, max_nproc );
5171 
5172     KA_TRACE( 20, ( "__kmp_allocate_team: making a new team\n" ) );
5173     __kmp_initialize_team( team, new_nproc, new_icvs, NULL );
5174 
5175     KA_TRACE( 20, ( "__kmp_allocate_team: setting task_team[0] %p and task_team[1] %p to NULL\n",
5176                     &team->t.t_task_team[0], &team->t.t_task_team[1] ) );
5177     team->t.t_task_team[0] = NULL;    // to be removed, as __kmp_allocate zeroes memory, no need to duplicate
5178     team->t.t_task_team[1] = NULL;    // to be removed, as __kmp_allocate zeroes memory, no need to duplicate
5179 
5180     if ( __kmp_storage_map ) {
5181         __kmp_print_team_storage_map( "team", team, team->t.t_id, new_nproc );
5182     }
5183 
5184     /* allocate space for arguments */
5185     __kmp_alloc_argv_entries( argc, team, FALSE );
5186     team->t.t_argc        = argc;
5187 
5188     KA_TRACE( 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5189                     team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
5190     { // Initialize barrier data.
5191         int b;
5192         for ( b = 0; b < bs_last_barrier; ++ b ) {
5193             team->t.t_bar[ b ].b_arrived        = KMP_INIT_BARRIER_STATE;
5194 #if USE_DEBUGGER
5195             team->t.t_bar[ b ].b_master_arrived = 0;
5196             team->t.t_bar[ b ].b_team_arrived   = 0;
5197 #endif
5198         }
5199     }
5200 
5201 #if OMP_40_ENABLED
5202     team->t.t_proc_bind = new_proc_bind;
5203 #endif
5204 
5205 #if OMPT_SUPPORT
5206     __ompt_team_assign_id(team, ompt_parallel_id);
5207     team->t.ompt_serialized_team_info = NULL;
5208 #endif
5209 
5210     KMP_MB();
5211 
5212     KA_TRACE( 20, ("__kmp_allocate_team: done creating a new team %d.\n", team->t.t_id ));
5213 
5214     return team;
5215 }
5216 
5217 /* TODO implement hot-teams at all levels */
5218 /* TODO implement lazy thread release on demand (disband request) */
5219 
5220 /* free the team.  return it to the team pool.  release all the threads
5221  * associated with it */
5222 void
5223 __kmp_free_team( kmp_root_t *root, kmp_team_t *team  USE_NESTED_HOT_ARG(kmp_info_t *master) )
5224 {
5225     int f;
5226     KA_TRACE( 20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(), team->t.t_id ));
5227 
5228     /* verify state */
5229     KMP_DEBUG_ASSERT( root );
5230     KMP_DEBUG_ASSERT( team );
5231     KMP_DEBUG_ASSERT( team->t.t_nproc <= team->t.t_max_nproc );
5232     KMP_DEBUG_ASSERT( team->t.t_threads );
5233 
5234     int use_hot_team = team == root->r.r_hot_team;
5235 #if KMP_NESTED_HOT_TEAMS
5236     int level;
5237     kmp_hot_team_ptr_t *hot_teams;
5238     if( master ) {
5239         level = team->t.t_active_level - 1;
5240         if( master->th.th_teams_microtask ) {                         // in teams construct?
5241             if( master->th.th_teams_size.nteams > 1 ) {
5242                ++level; // level was not increased in teams construct for team_of_masters
5243             }
5244             if( team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5245                 master->th.th_teams_level == team->t.t_level ) {
5246                 ++level; // level was not increased in teams construct for team_of_workers before the parallel
5247             }            // team->t.t_level will be increased inside parallel
5248         }
5249         hot_teams = master->th.th_hot_teams;
5250         if( level < __kmp_hot_teams_max_level ) {
5251             KMP_DEBUG_ASSERT( team == hot_teams[level].hot_team );
5252             use_hot_team = 1;
5253         }
5254     }
5255 #endif // KMP_NESTED_HOT_TEAMS
5256 
5257     /* team is done working */
5258     TCW_SYNC_PTR(team->t.t_pkfn, NULL); // Important for Debugging Support Library.
5259     team->t.t_copyin_counter = 0; // init counter for possible reuse
5260     // Do not reset pointer to parent team to NULL for hot teams.
5261 
5262     /* if we are a non-hot team, release our threads */
5263     if( ! use_hot_team ) {
5264         if (__kmp_tasking_mode != tskm_immediate_exec) {
5265             // Wait for threads to reach reapable state
5266             for (f = 1; f < team->t.t_nproc; ++f) {
5267                 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5268                 volatile kmp_uint32 *state = &team->t.t_threads[f]->th.th_reap_state;
5269                 while (*state != KMP_SAFE_TO_REAP) {
5270 #if KMP_OS_WINDOWS
5271                     // On Windows a thread can be killed at any time, check this
5272                     DWORD ecode;
5273                     if (__kmp_is_thread_alive(team->t.t_threads[f], &ecode))
5274                         KMP_CPU_PAUSE();
5275                     else
5276                         *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5277 #else
5278                     KMP_CPU_PAUSE();
5279 #endif
5280                 }
5281             }
5282 
5283             // Delete task teams
5284             int tt_idx;
5285             for (tt_idx=0; tt_idx<2; ++tt_idx) {
5286                 kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5287                 if ( task_team != NULL ) {
5288                     for (f=0; f<team->t.t_nproc; ++f) { // Have all threads unref task teams
5289                         team->t.t_threads[f]->th.th_task_team = NULL;
5290                     }
5291                     KA_TRACE( 20, ( "__kmp_free_team: T#%d deactivating task_team %p on team %d\n", __kmp_get_gtid(), task_team, team->t.t_id ) );
5292 #if KMP_NESTED_HOT_TEAMS
5293                     __kmp_free_task_team( master, task_team );
5294 #endif
5295                     team->t.t_task_team[tt_idx] = NULL;
5296                 }
5297             }
5298         }
5299 
5300         // Reset pointer to parent team only for non-hot teams.
5301         team->t.t_parent = NULL;
5302         team->t.t_level = 0;
5303         team->t.t_active_level = 0;
5304 
5305         /* free the worker threads */
5306         for ( f = 1; f < team->t.t_nproc; ++ f ) {
5307             KMP_DEBUG_ASSERT( team->t.t_threads[ f ] );
5308             __kmp_free_thread( team->t.t_threads[ f ] );
5309             team->t.t_threads[ f ] = NULL;
5310         }
5311 
5312         /* put the team back in the team pool */
5313         /* TODO limit size of team pool, call reap_team if pool too large */
5314         team->t.t_next_pool  = (kmp_team_t*) __kmp_team_pool;
5315         __kmp_team_pool        = (volatile kmp_team_t*) team;
5316     }
5317 
5318     KMP_MB();
5319 }
5320 
5321 
5322 /* reap the team.  destroy it, reclaim all its resources and free its memory */
5323 kmp_team_t *
5324 __kmp_reap_team( kmp_team_t *team )
5325 {
5326     kmp_team_t *next_pool = team->t.t_next_pool;
5327 
5328     KMP_DEBUG_ASSERT( team );
5329     KMP_DEBUG_ASSERT( team->t.t_dispatch    );
5330     KMP_DEBUG_ASSERT( team->t.t_disp_buffer );
5331     KMP_DEBUG_ASSERT( team->t.t_threads     );
5332     KMP_DEBUG_ASSERT( team->t.t_argv        );
5333 
5334     /* TODO clean the threads that are a part of this? */
5335 
5336     /* free stuff */
5337 
5338     __kmp_free_team_arrays( team );
5339     if ( team->t.t_argv != &team->t.t_inline_argv[0] )
5340         __kmp_free( (void*) team->t.t_argv );
5341     __kmp_free( team );
5342 
5343     KMP_MB();
5344     return next_pool;
5345 }
5346 
5347 //
5348 // Free the thread.  Don't reap it, just place it on the pool of available
5349 // threads.
5350 //
5351 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5352 // binding for the affinity mechanism to be useful.
5353 //
5354 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5355 // However, we want to avoid a potential performance problem by always
5356 // scanning through the list to find the correct point at which to insert
5357 // the thread (potential N**2 behavior).  To do this we keep track of the
5358 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5359 // With single-level parallelism, threads will always be added to the tail
5360 // of the list, kept track of by __kmp_thread_pool_insert_pt.  With nested
5361 // parallelism, all bets are off and we may need to scan through the entire
5362 // free list.
5363 //
5364 // This change also has a potentially large performance benefit, for some
5365 // applications.  Previously, as threads were freed from the hot team, they
5366 // would be placed back on the free list in inverse order.  If the hot team
5367 // grew back to its original size, then the freed thread would be placed
5368 // back on the hot team in reverse order.  This could cause bad cache
5369 // locality problems on programs where the size of the hot team regularly
5370 // grew and shrunk.
5371 //
5372 // Now, for single-level parallelism, the OMP tid is always == gtid.
5373 //
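// As a minimal, hypothetical sketch (all names below are invented for
// illustration only), the "sorted insert with a cached hint" technique used
// here boils down to:
//
//     struct example_node { int gtid; struct example_node *next; };
//     static struct example_node *example_pool      = NULL;  // kept sorted by gtid
//     static struct example_node *example_insert_pt = NULL;  // last insertion point
//
//     static void example_sorted_insert( struct example_node *n ) {
//         struct example_node **scan;
//         if ( example_insert_pt != NULL && example_insert_pt->gtid < n->gtid )
//             scan = &example_insert_pt->next;   // resume the scan from the cached hint
//         else
//             scan = &example_pool;              // hint is past n->gtid; rescan from the head
//         for ( ; *scan != NULL && (*scan)->gtid < n->gtid; scan = &(*scan)->next )
//             ;
//         n->next = *scan;                       // splice in, keeping the list sorted
//         *scan = n;
//         example_insert_pt = n;                 // remember where we inserted
//     }
//
// The real code below does the same thing with kmp_info_t and the globals
// __kmp_thread_pool / __kmp_thread_pool_insert_pt.
//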
5374 void
5375 __kmp_free_thread( kmp_info_t *this_th )
5376 {
5377     int gtid;
5378     kmp_info_t **scan;
5379 
5380     KA_TRACE( 20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5381                 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid ));
5382 
5383     KMP_DEBUG_ASSERT( this_th );
5384 
5385     // When moving a thread to the pool, switch it to wait on its own b_go flag, and reset its barrier team pointers to NULL (uninitialized team).
5386     int b;
5387     kmp_balign_t *balign = this_th->th.th_bar;
5388     for (b=0; b<bs_last_barrier; ++b) {
5389         if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5390             balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5391         balign[b].bb.team = NULL;
5392         balign[b].bb.leaf_kids = 0;
5393     }
5394     this_th->th.th_task_state = 0;
5395 
5396     /* put thread back on the free pool */
5397     TCW_PTR(this_th->th.th_team, NULL);
5398     TCW_PTR(this_th->th.th_root, NULL);
5399     TCW_PTR(this_th->th.th_dispatch, NULL);               /* NOT NEEDED */
5400 
5401     //
5402     // If the __kmp_thread_pool_insert_pt is already past the new insert
5403     // point, then we need to re-scan the entire list.
5404     //
5405     gtid = this_th->th.th_info.ds.ds_gtid;
5406     if ( __kmp_thread_pool_insert_pt != NULL ) {
5407         KMP_DEBUG_ASSERT( __kmp_thread_pool != NULL );
5408         if ( __kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid ) {
5409              __kmp_thread_pool_insert_pt = NULL;
5410         }
5411     }
5412 
5413     //
5414     // Scan down the list to find the place to insert the thread.
5415     // scan is the address of a link in the list, possibly the address of
5416     // __kmp_thread_pool itself.
5417     //
5418     // In the absence of nested parallelism, the for loop will have 0 iterations.
5419     //
5420     if ( __kmp_thread_pool_insert_pt != NULL ) {
5421         scan = &( __kmp_thread_pool_insert_pt->th.th_next_pool );
5422     }
5423     else {
5424         scan = (kmp_info_t **)&__kmp_thread_pool;
5425     }
5426     for (; ( *scan != NULL ) && ( (*scan)->th.th_info.ds.ds_gtid < gtid );
5427       scan = &( (*scan)->th.th_next_pool ) );
5428 
5429     //
5430     // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5431     // to its address.
5432     //
5433     TCW_PTR(this_th->th.th_next_pool, *scan);
5434     __kmp_thread_pool_insert_pt = *scan = this_th;
5435     KMP_DEBUG_ASSERT( ( this_th->th.th_next_pool == NULL )
5436       || ( this_th->th.th_info.ds.ds_gtid
5437       < this_th->th.th_next_pool->th.th_info.ds.ds_gtid ) );
5438     TCW_4(this_th->th.th_in_pool, TRUE);
5439     __kmp_thread_pool_nth++;
5440 
5441     TCW_4(__kmp_nth, __kmp_nth - 1);
5442 
5443 #ifdef KMP_ADJUST_BLOCKTIME
5444     /* Adjust blocktime back to user setting or default if necessary */
5445     /* Middle initialization might never have occurred                */
5446     if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
5447         KMP_DEBUG_ASSERT( __kmp_avail_proc > 0 );
5448         if ( __kmp_nth <= __kmp_avail_proc ) {
5449             __kmp_zero_bt = FALSE;
5450         }
5451     }
5452 #endif /* KMP_ADJUST_BLOCKTIME */
5453 
5454     KMP_MB();
5455 }
5456 
5457 
5458 /* ------------------------------------------------------------------------ */
5459 
5460 void *
5461 __kmp_launch_thread( kmp_info_t *this_thr )
5462 {
5463     int                   gtid = this_thr->th.th_info.ds.ds_gtid;
5464 /*    void                 *stack_data;*/
5465     kmp_team_t *(*volatile pteam);
5466 
5467     KMP_MB();
5468     KA_TRACE( 10, ("__kmp_launch_thread: T#%d start\n", gtid ) );
5469 
5470     if( __kmp_env_consistency_check ) {
5471         this_thr->th.th_cons = __kmp_allocate_cons_stack( gtid );  // ATT: Memory leak?
5472     }
5473 
5474 #if OMPT_SUPPORT
5475     if (ompt_enabled) {
5476         this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5477         this_thr->th.ompt_thread_info.wait_id = 0;
5478         this_thr->th.ompt_thread_info.idle_frame = __builtin_frame_address(0);
5479         if (ompt_callbacks.ompt_callback(ompt_event_thread_begin)) {
5480             __ompt_thread_begin(ompt_thread_worker, gtid);
5481         }
5482     }
5483 #endif
5484 
5485     /* This is the place where threads wait for work */
5486     while( ! TCR_4(__kmp_global.g.g_done) ) {
5487         KMP_DEBUG_ASSERT( this_thr == __kmp_threads[ gtid ] );
5488         KMP_MB();
5489 
5490         /* wait for work to do */
5491         KA_TRACE( 20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid ));
5492 
5493 #if OMPT_SUPPORT
5494         if (ompt_enabled) {
5495             this_thr->th.ompt_thread_info.state = ompt_state_idle;
5496         }
5497 #endif
5498 
5499         /* No tid yet since not part of a team */
5500         __kmp_fork_barrier( gtid, KMP_GTID_DNE );
5501 
5502 #if OMPT_SUPPORT
5503         if (ompt_enabled) {
5504             this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5505         }
5506 #endif
5507 
5508         pteam = (kmp_team_t *(*))(& this_thr->th.th_team);
5509 
5510         /* have we been allocated? */
5511         if ( TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done) ) {
5512 #if OMPT_SUPPORT
5513             ompt_task_info_t *task_info;
5514             ompt_parallel_id_t my_parallel_id;
5515             if (ompt_enabled) {
5516                 task_info = __ompt_get_taskinfo(0);
5517                 my_parallel_id = (*pteam)->t.ompt_team_info.parallel_id;
5518             }
5519 #endif
5520             /* we were just woken up, so run our new task */
5521             if ( TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL ) {
5522                 int rc;
5523                 KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5524                               gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), (*pteam)->t.t_pkfn));
5525 
5526                 updateHWFPControl (*pteam);
5527 
5528 #if OMPT_SUPPORT
5529                 if (ompt_enabled) {
5530                     this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5531                     // Initialize OMPT task id for implicit task.
5532                     int tid = __kmp_tid_from_gtid(gtid);
5533                     task_info->task_id = __ompt_task_id_new(tid);
5534                 }
5535 #endif
5536 
5537                 {
5538                     KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
5539                     KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
5540                     rc = (*pteam)->t.t_invoke( gtid );
5541                 }
5542                 KMP_ASSERT( rc );
5543 
5544 #if OMPT_SUPPORT
5545                 if (ompt_enabled) {
5546                     /* no frame set while outside task */
5547                     task_info->frame.exit_runtime_frame = NULL;
5548 
5549                     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5550                 }
5551 #endif
5552                 KMP_MB();
5553                 KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5554                               gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), (*pteam)->t.t_pkfn));
5555             }
5556             /* join barrier after parallel region */
5557             __kmp_join_barrier( gtid );
5558 #if OMPT_SUPPORT && OMPT_TRACE
5559             if (ompt_enabled) {
5560                 if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
5561                     // don't access *pteam here: it may have already been freed
5562                     // by the master thread behind the barrier (possible race)
5563                     ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
5564                         my_parallel_id, task_info->task_id);
5565                 }
5566                 task_info->frame.exit_runtime_frame = NULL;
5567                 task_info->task_id = 0;
5568             }
5569 #endif
5570         }
5571     }
5572     TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5573 
5574 #if OMPT_SUPPORT
5575     if (ompt_enabled &&
5576         ompt_callbacks.ompt_callback(ompt_event_thread_end)) {
5577         __ompt_thread_end(ompt_thread_worker, gtid);
5578     }
5579 #endif
5580 
5581     this_thr->th.th_task_team = NULL;
5582     /* run the destructors for the threadprivate data for this thread */
5583     __kmp_common_destroy_gtid( gtid );
5584 
5585     KA_TRACE( 10, ("__kmp_launch_thread: T#%d done\n", gtid ) );
5586     KMP_MB();
5587     return this_thr;
5588 }
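
// In outline, the worker loop above reduces to the following simplified,
// hypothetical sketch (OMPT hooks, stats/timing blocks and consistency checks
// omitted; this is only the shape of the control flow, not the literal code):
//
//     while ( ! __kmp_global.g.g_done ) {
//         __kmp_fork_barrier( gtid, KMP_GTID_DNE );          // sleep until handed a team
//         if ( this_thr->th.th_team != NULL && ! __kmp_global.g.g_done ) {
//             if ( this_thr->th.th_team->t.t_pkfn != NULL )
//                 this_thr->th.th_team->t.t_invoke( gtid );  // run our implicit task
//             __kmp_join_barrier( gtid );                    // rejoin the team after the region
//         }
//     }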
5589 
5590 /* ------------------------------------------------------------------------ */
5591 /* ------------------------------------------------------------------------ */
5592 
5593 void
5594 __kmp_internal_end_dest( void *specific_gtid )
5595 {
5596     #if KMP_COMPILER_ICC
5597         #pragma warning( push )
5598         #pragma warning( disable:  810 ) // conversion from "void *" to "int" may lose significant bits
5599     #endif
5600     // Make sure no significant bits are lost
5601     int gtid = (kmp_intptr_t)specific_gtid - 1;
5602     #if KMP_COMPILER_ICC
5603         #pragma warning( pop )
5604     #endif
5605 
5606     KA_TRACE( 30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5607     /* NOTE: the gtid is stored as gtid+1 in the thread-local storage;
5608      * this is because 0 is reserved for the nothing-stored case */
5609 
5610     /* josh: One reason for setting the gtid specific data even when it is being
5611        destroyed by pthread is to allow gtid lookup through thread specific data
5612        (__kmp_gtid_get_specific).  Some of the code, especially stat code,
5613        that gets executed in the call to __kmp_internal_end_thread, actually
5614        gets the gtid through the thread specific data.  Setting it here seems
5615        rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread
5616        to run smoothly.
5617        todo: get rid of this after we remove the dependence on
5618        __kmp_gtid_get_specific
5619     */
5620     if(gtid >= 0 && KMP_UBER_GTID(gtid))
5621         __kmp_gtid_set_specific( gtid );
5622     #ifdef KMP_TDATA_GTID
5623         __kmp_gtid = gtid;
5624     #endif
5625     __kmp_internal_end_thread( gtid );
5626 }
5627 
5628 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5629 
5630 // 2009-09-08 (lev): It looks like the destructor does not work. In simple test cases destructors work
5631 // perfectly, but in real libomp.so I have no evidence it is ever called. However, -fini linker
5632 // option in makefile.mk works fine.
5633 
5634 __attribute__(( destructor ))
5635 void
5636 __kmp_internal_end_dtor( void )
5637 {
5638     __kmp_internal_end_atexit();
5639 }
5640 
5641 void
5642 __kmp_internal_end_fini( void )
5643 {
5644     __kmp_internal_end_atexit();
5645 }
5646 
5647 #endif
5648 
5649 /* [Windows] josh: when the atexit handler is called, there may still be more than one thread alive */
5650 void
5651 __kmp_internal_end_atexit( void )
5652 {
5653     KA_TRACE( 30, ( "__kmp_internal_end_atexit\n" ) );
5654     /* [Windows]
5655        josh: ideally, we want to completely shut down the library in this atexit handler, but
5656        stat code that depends on thread specific data for gtid fails because that data becomes
5657        unavailable at some point during the shutdown, so we call __kmp_internal_end_thread
5658        instead.  We should eventually remove the dependency on __kmp_get_specific_gtid in the
5659        stat code and use __kmp_internal_end_library to cleanly shut down the library.
5660 
5661 // TODO: Can some of this comment about GVS be removed?
5662        I suspect that the offending stat code is executed when the calling thread tries to
5663        clean up a dead root thread's data structures, resulting in GVS code trying to close
5664        the GVS structures for that thread, but since the stat code uses
5665        __kmp_get_specific_gtid to get the gtid with the assumption that the calling thread is
5666        cleaning up itself instead of another thread, it gets confused.  This happens because
5667        allowing a thread to unregister and cleanup another thread is a recent modification for
5668        addressing an issue with Maxon Cinema4D.  Based on the current design (20050722), a
5669        thread may end up trying to unregister another thread only if thread death does not
5670        trigger the calling of __kmp_internal_end_thread.  For Linux* OS, there is the thread
5671        specific data destructor function to detect thread death.  For Windows dynamic, there
5672        is DllMain(THREAD_DETACH).  For Windows static, there is nothing.  Thus, the
5673        workaround is applicable only for Windows static stat library.
5674     */
5675     __kmp_internal_end_library( -1 );
5676     #if KMP_OS_WINDOWS
5677         __kmp_close_console();
5678     #endif
5679 }
5680 
5681 static void
5682 __kmp_reap_thread(
5683     kmp_info_t * thread,
5684     int is_root
5685 ) {
5686 
5687     // It is assumed __kmp_forkjoin_lock is acquired.
5688 
5689     int gtid;
5690 
5691     KMP_DEBUG_ASSERT( thread != NULL );
5692 
5693     gtid = thread->th.th_info.ds.ds_gtid;
5694 
5695     if ( ! is_root ) {
5696 
5697         if ( __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ) {
5698             /* Assume the threads are at the fork barrier here */
5699             KA_TRACE( 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n", gtid ) );
5700             /* Need release fence here to prevent seg faults for tree forkjoin barrier (GEH) */
5701             ANNOTATE_HAPPENS_BEFORE(thread);
5702             kmp_flag_64 flag(&thread->th.th_bar[ bs_forkjoin_barrier ].bb.b_go, thread);
5703             __kmp_release_64(&flag);
5704         }; // if
5705 
5706         // Terminate OS thread.
5707         __kmp_reap_worker( thread );
5708 
5709         //
5710         // The thread was killed asynchronously.  If it was actively
5711         // spinning in the thread pool, decrement the global count.
5712         //
5713         // There is a small timing hole here - if the worker thread was
5714         // just waking up after sleeping in the pool, had reset its
5715         // th_active_in_pool flag but not decremented the global counter
5716         // __kmp_thread_pool_active_nth yet, then the global counter
5717         // might not get updated.
5718         //
5719         // Currently, this can only happen as the library is unloaded,
5720         // so there are no harmful side effects.
5721         //
5722         if ( thread->th.th_active_in_pool ) {
5723             thread->th.th_active_in_pool = FALSE;
5724             KMP_TEST_THEN_DEC32(
5725               (kmp_int32 *) &__kmp_thread_pool_active_nth );
5726             KMP_DEBUG_ASSERT( TCR_4(__kmp_thread_pool_active_nth) >= 0 );
5727         }
5728 
5729         // Decrement # of [worker] threads in the pool.
5730         KMP_DEBUG_ASSERT( __kmp_thread_pool_nth > 0 );
5731         --__kmp_thread_pool_nth;
5732     }; // if
5733 
5734     __kmp_free_implicit_task(thread);
5735 
5736     // Free the fast memory for tasking
5737     #if USE_FAST_MEMORY
5738         __kmp_free_fast_memory( thread );
5739     #endif /* USE_FAST_MEMORY */
5740 
5741     __kmp_suspend_uninitialize_thread( thread );
5742 
5743     KMP_DEBUG_ASSERT( __kmp_threads[ gtid ] == thread );
5744     TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5745 
5746     -- __kmp_all_nth;
5747     // __kmp_nth was decremented when the thread was added to the pool.
5748 
5749 #ifdef KMP_ADJUST_BLOCKTIME
5750     /* Adjust blocktime back to user setting or default if necessary */
5751     /* Middle initialization might never have occurred                */
5752     if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
5753         KMP_DEBUG_ASSERT( __kmp_avail_proc > 0 );
5754         if ( __kmp_nth <= __kmp_avail_proc ) {
5755             __kmp_zero_bt = FALSE;
5756         }
5757     }
5758 #endif /* KMP_ADJUST_BLOCKTIME */
5759 
5760     /* free the memory being used */
5761     if( __kmp_env_consistency_check ) {
5762         if ( thread->th.th_cons ) {
5763             __kmp_free_cons_stack( thread->th.th_cons );
5764             thread->th.th_cons = NULL;
5765         }; // if
5766     }
5767 
5768     if ( thread->th.th_pri_common != NULL ) {
5769         __kmp_free( thread->th.th_pri_common );
5770         thread->th.th_pri_common = NULL;
5771     }; // if
5772 
5773     if (thread->th.th_task_state_memo_stack != NULL) {
5774         __kmp_free(thread->th.th_task_state_memo_stack);
5775         thread->th.th_task_state_memo_stack = NULL;
5776     }
5777 
5778     #if KMP_USE_BGET
5779         if ( thread->th.th_local.bget_data != NULL ) {
5780             __kmp_finalize_bget( thread );
5781         }; // if
5782     #endif
5783 
5784 #if KMP_AFFINITY_SUPPORTED
5785     if ( thread->th.th_affin_mask != NULL ) {
5786         KMP_CPU_FREE( thread->th.th_affin_mask );
5787         thread->th.th_affin_mask = NULL;
5788     }; // if
5789 #endif /* KMP_AFFINITY_SUPPORTED */
5790 
5791     __kmp_reap_team( thread->th.th_serial_team );
5792     thread->th.th_serial_team = NULL;
5793     __kmp_free( thread );
5794 
5795     KMP_MB();
5796 
5797 } // __kmp_reap_thread
5798 
5799 static void
5800 __kmp_internal_end(void)
5801 {
5802     int i;
5803 
5804     /* First, unregister the library */
5805     __kmp_unregister_library();
5806 
5807     #if KMP_OS_WINDOWS
5808         /* In Win static library, we can't tell when a root actually dies, so we
5809            reclaim the data structures for any root threads that have died but not
5810            unregistered themselves, in order to shut down cleanly.
5811            In Win dynamic library we also can't tell when a thread dies.
5812         */
5813         __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of dead roots
5814     #endif
5815 
5816     for( i=0 ; i<__kmp_threads_capacity ; i++ )
5817         if( __kmp_root[i] )
5818             if( __kmp_root[i]->r.r_active )
5819                 break;
5820     KMP_MB();       /* Flush all pending memory write invalidates.  */
5821     TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
5822 
5823     if ( i < __kmp_threads_capacity ) {
5824 #if KMP_USE_MONITOR
5825         // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
5826         KMP_MB();       /* Flush all pending memory write invalidates.  */
5827 
5828         //
5829         // Need to check that monitor was initialized before reaping it.
5830         // If we are called from __kmp_atfork_child (which sets
5831         // __kmp_init_parallel = 0), then __kmp_monitor will appear to
5832         // contain valid data, but it is only valid in the parent process,
5833         // not the child.
5834         //
5835         // New behavior (201008): instead of keying off of the flag
5836         // __kmp_init_parallel, the monitor thread creation is keyed off
5837         // of the new flag __kmp_init_monitor.
5838         //
5839         __kmp_acquire_bootstrap_lock( & __kmp_monitor_lock );
5840         if ( TCR_4( __kmp_init_monitor ) ) {
5841             __kmp_reap_monitor( & __kmp_monitor );
5842             TCW_4( __kmp_init_monitor, 0 );
5843         }
5844         __kmp_release_bootstrap_lock( & __kmp_monitor_lock );
5845         KA_TRACE( 10, ("__kmp_internal_end: monitor reaped\n" ) );
5846 #endif // KMP_USE_MONITOR
5847     } else {
5848         /* TODO move this to cleanup code */
5849         #ifdef KMP_DEBUG
5850             /* make sure that everything has properly ended */
5851             for ( i = 0; i < __kmp_threads_capacity; i++ ) {
5852                 if( __kmp_root[i] ) {
5853 //                    KMP_ASSERT( ! KMP_UBER_GTID( i ) );         // AC: there can be uber threads alive here
5854                     KMP_ASSERT( ! __kmp_root[i]->r.r_active );  // TODO: can they be active?
5855                 }
5856             }
5857         #endif
5858 
5859         KMP_MB();
5860 
5861         // Reap the worker threads.
5862         // This is valid for now, but be careful if threads are reaped sooner.
5863         while ( __kmp_thread_pool != NULL ) {    // Loop through all the threads in the pool.
5864             // Get the next thread from the pool.
5865             kmp_info_t * thread = (kmp_info_t *) __kmp_thread_pool;
5866             __kmp_thread_pool = thread->th.th_next_pool;
5867             // Reap it.
5868             KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
5869             thread->th.th_next_pool = NULL;
5870             thread->th.th_in_pool = FALSE;
5871             __kmp_reap_thread( thread, 0 );
5872         }; // while
5873         __kmp_thread_pool_insert_pt = NULL;
5874 
5875         // Reap teams.
5876         while ( __kmp_team_pool != NULL ) {     // Loop through all the teams in the pool.
5877             // Get the next team from the pool.
5878             kmp_team_t * team = (kmp_team_t *) __kmp_team_pool;
5879             __kmp_team_pool = team->t.t_next_pool;
5880             // Reap it.
5881             team->t.t_next_pool = NULL;
5882             __kmp_reap_team( team );
5883         }; // while
5884 
5885         __kmp_reap_task_teams( );
5886 
5887         for ( i = 0; i < __kmp_threads_capacity; ++ i ) {
5888             // TBD: Add some checking...
5889             // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
5890         }
5891 
5892         /* Make sure all threadprivate destructors get run by joining with all worker
5893            threads before resetting this flag */
5894         TCW_SYNC_4(__kmp_init_common, FALSE);
5895 
5896         KA_TRACE( 10, ("__kmp_internal_end: all workers reaped\n" ) );
5897         KMP_MB();
5898 
5899 #if KMP_USE_MONITOR
5900         //
5901         // See note above: One of the possible fixes for CQ138434 / CQ140126
5902         //
5903         // FIXME: push both code fragments down and CSE them?
5904         // push them into __kmp_cleanup() ?
5905         //
5906         __kmp_acquire_bootstrap_lock( & __kmp_monitor_lock );
5907         if ( TCR_4( __kmp_init_monitor ) ) {
5908             __kmp_reap_monitor( & __kmp_monitor );
5909             TCW_4( __kmp_init_monitor, 0 );
5910         }
5911         __kmp_release_bootstrap_lock( & __kmp_monitor_lock );
5912         KA_TRACE( 10, ("__kmp_internal_end: monitor reaped\n" ) );
5913 #endif
5914     } /* else !__kmp_global.t_active */
5915     TCW_4(__kmp_init_gtid, FALSE);
5916     KMP_MB();       /* Flush all pending memory write invalidates.  */
5917 
5918     __kmp_cleanup();
5919 #if OMPT_SUPPORT
5920     ompt_fini();
5921 #endif
5922 }
5923 
5924 void
5925 __kmp_internal_end_library( int gtid_req )
5926 {
5927     /* if we have already cleaned up, don't try again, it wouldn't be pretty */
5928     /* this shouldn't be a race condition because __kmp_internal_end() is the
5929      * only place to clear __kmp_serial_init */
5930     /* we'll check this later too, after we get the lock */
5931     // 2009-09-06: We do not set g_abort without setting g_done. This check looks redundant,
5932     // because the next check will work in any case.
5933     if( __kmp_global.g.g_abort ) {
5934         KA_TRACE( 11, ("__kmp_internal_end_library: abort, exiting\n" ));
5935         /* TODO abort? */
5936         return;
5937     }
5938     if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
5939         KA_TRACE( 10, ("__kmp_internal_end_library: already finished\n" ));
5940         return;
5941     }
5942 
5943 
5944     KMP_MB();       /* Flush all pending memory write invalidates.  */
5945 
5946     /* find out who we are and what we should do */
5947     {
5948         int gtid = (gtid_req>=0) ? gtid_req : __kmp_gtid_get_specific();
5949         KA_TRACE( 10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req ));
5950         if( gtid == KMP_GTID_SHUTDOWN ) {
5951             KA_TRACE( 10, ("__kmp_internal_end_library: !__kmp_init_runtime, system already shutdown\n" ));
5952             return;
5953         } else if( gtid == KMP_GTID_MONITOR ) {
5954             KA_TRACE( 10, ("__kmp_internal_end_library: monitor thread, gtid not registered, or system shutdown\n" ));
5955             return;
5956         } else if( gtid == KMP_GTID_DNE ) {
5957             KA_TRACE( 10, ("__kmp_internal_end_library: gtid not registered or system shutdown\n" ));
5958             /* we don't know who we are, but we may still shut down the library */
5959         } else if( KMP_UBER_GTID( gtid )) {
5960             /* unregister ourselves as an uber thread.  gtid is no longer valid */
5961             if( __kmp_root[gtid]->r.r_active ) {
5962                 __kmp_global.g.g_abort = -1;
5963                 TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
5964                 KA_TRACE( 10, ("__kmp_internal_end_library: root still active, abort T#%d\n", gtid ));
5965                 return;
5966             } else {
5967                 KA_TRACE( 10, ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid ));
5968                 __kmp_unregister_root_current_thread( gtid );
5969             }
5970         } else {
5971             /* worker threads may call this function through the atexit handler, if they call exit() */
5972             /* For now, skip the usual subsequent processing and just dump the debug buffer.
5973                TODO: do a thorough shutdown instead
5974             */
5975             #ifdef DUMP_DEBUG_ON_EXIT
5976                 if ( __kmp_debug_buf )
5977                     __kmp_dump_debug_buffer( );
5978             #endif
5979             return;
5980         }
5981     }
5982     /* synchronize the termination process */
5983     __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
5984 
5985     /* have we already finished */
5986     if( __kmp_global.g.g_abort ) {
5987         KA_TRACE( 10, ("__kmp_internal_end_library: abort, exiting\n" ));
5988         /* TODO abort? */
5989         __kmp_release_bootstrap_lock( &__kmp_initz_lock );
5990         return;
5991     }
5992     if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
5993         __kmp_release_bootstrap_lock( &__kmp_initz_lock );
5994         return;
5995     }
5996 
5997     /* We need this lock to enforce mutual exclusion between this reading of
5998        __kmp_threads_capacity and the writing by __kmp_register_root.
5999        Alternatively, we can use a counter of roots that is
6000        atomically updated by __kmp_get_global_thread_id_reg,
6001        __kmp_do_serial_initialize and __kmp_internal_end_*.
6002     */
6003     __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
6004 
6005     /* now we can safely conduct the actual termination */
6006     __kmp_internal_end();
6007 
6008     __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
6009     __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6010 
6011     KA_TRACE( 10, ("__kmp_internal_end_library: exit\n" ) );
6012 
6013     #ifdef DUMP_DEBUG_ON_EXIT
6014         if ( __kmp_debug_buf )
6015             __kmp_dump_debug_buffer();
6016     #endif
6017 
6018     #if KMP_OS_WINDOWS
6019         __kmp_close_console();
6020     #endif
6021 
6022     __kmp_fini_allocator();
6023 
6024 } // __kmp_internal_end_library
6025 
6026 void
6027 __kmp_internal_end_thread( int gtid_req )
6028 {
6029     int i;
6030 
6031     /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6032     /* this shouldn't be a race condition because __kmp_internal_end() is the
6033      * only place to clear __kmp_serial_init */
6034     /* we'll check this later too, after we get the lock */
6035     // 2009-09-06: We do not set g_abort without setting g_done. This check looks redundant,
6036     // because the next check will work in any case.
6037     if( __kmp_global.g.g_abort ) {
6038         KA_TRACE( 11, ("__kmp_internal_end_thread: abort, exiting\n" ));
6039         /* TODO abort? */
6040         return;
6041     }
6042     if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
6043         KA_TRACE( 10, ("__kmp_internal_end_thread: already finished\n" ));
6044         return;
6045     }
6046 
6047     KMP_MB();       /* Flush all pending memory write invalidates.  */
6048 
6049     /* find out who we are and what we should do */
6050     {
6051         int gtid = (gtid_req>=0) ? gtid_req : __kmp_gtid_get_specific();
6052         KA_TRACE( 10, ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req ));
6053         if( gtid == KMP_GTID_SHUTDOWN ) {
6054             KA_TRACE( 10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system already shutdown\n" ));
6055             return;
6056         } else if( gtid == KMP_GTID_MONITOR ) {
6057             KA_TRACE( 10, ("__kmp_internal_end_thread: monitor thread, gtid not registered, or system shutdown\n" ));
6058             return;
6059         } else if( gtid == KMP_GTID_DNE ) {
6060             KA_TRACE( 10, ("__kmp_internal_end_thread: gtid not registered or system shutdown\n" ));
6061             return;
6062             /* we don't know who we are */
6063         } else if( KMP_UBER_GTID( gtid )) {
6064         /* unregister ourselves as an uber thread.  gtid is no longer valid */
6065             if( __kmp_root[gtid]->r.r_active ) {
6066                 __kmp_global.g.g_abort = -1;
6067                 TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6068                 KA_TRACE( 10, ("__kmp_internal_end_thread: root still active, abort T#%d\n", gtid ));
6069                 return;
6070             } else {
6071                 KA_TRACE( 10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n", gtid ));
6072                 __kmp_unregister_root_current_thread( gtid );
6073             }
6074         } else {
6075             /* just a worker thread, let's leave */
6076             KA_TRACE( 10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid ));
6077 
6078             if ( gtid >= 0 ) {
6079                 __kmp_threads[gtid]->th.th_task_team = NULL;
6080             }
6081 
6082             KA_TRACE( 10, ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n", gtid ));
6083             return;
6084         }
6085     }
6086     #if defined KMP_DYNAMIC_LIB
6087     // AC: let's not shut down the Linux* OS dynamic library at the exit of an uber thread,
6088     //     because it is better to shut it down later, in the library destructor.
6089     //     The reason for this change is a performance problem seen when a non-OpenMP thread
6090     //     in a loop forks and joins many OpenMP threads. We can save a lot of time by
6091     //     keeping the worker threads alive until the program shuts down.
6092     // OM: Removed the Linux* OS restriction to fix the crash on OS X* (DPD200239966) and
6093     //     Windows (DPD200287443) that occurs when using critical sections from foreign threads.
6094         KA_TRACE( 10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req) );
6095         return;
6096     #endif
6097     /* synchronize the termination process */
6098     __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
6099 
6100     /* have we already finished */
6101     if( __kmp_global.g.g_abort ) {
6102         KA_TRACE( 10, ("__kmp_internal_end_thread: abort, exiting\n" ));
6103         /* TODO abort? */
6104         __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6105         return;
6106     }
6107     if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
6108         __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6109         return;
6110     }
6111 
6112     /* We need this lock to enforce mutual exclusion between this reading of
6113        __kmp_threads_capacity and the writing by __kmp_register_root.
6114        Alternatively, we can use a counter of roots that is
6115        atomically updated by __kmp_get_global_thread_id_reg,
6116        __kmp_do_serial_initialize and __kmp_internal_end_*.
6117     */
6118 
6119     /* should we finish the run-time?  are all siblings done? */
6120     __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
6121 
6122     for ( i = 0; i < __kmp_threads_capacity; ++ i ) {
6123         if ( KMP_UBER_GTID( i ) ) {
6124             KA_TRACE( 10, ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i ));
6125             __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
6126             __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6127             return;
6128         };
6129     }
6130 
6131     /* now we can safely conduct the actual termination */
6132 
6133     __kmp_internal_end();
6134 
6135     __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
6136     __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6137 
6138     KA_TRACE( 10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req ) );
6139 
6140     #ifdef DUMP_DEBUG_ON_EXIT
6141         if ( __kmp_debug_buf )
6142             __kmp_dump_debug_buffer();
6143     #endif
6144 } // __kmp_internal_end_thread
6145 
6146 // -------------------------------------------------------------------------------------------------
6147 // Library registration stuff.
6148 
6149 static long   __kmp_registration_flag = 0;
6150     // Random value used to indicate library initialization.
6151 static char * __kmp_registration_str  = NULL;
6152     // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6153 
6154 
6155 static inline
6156 char *
6157 __kmp_reg_status_name() {
6158     /*
6159         On RHEL 3u5, if linked statically, getpid() returns different values in each thread.
6160         If registration and unregistration go in different threads (omp_misc_other_root_exit.cpp test case),
6161         the registered_lib_env env var cannot be found, because its name will contain a different pid.
6162     */
6163     return __kmp_str_format( "__KMP_REGISTERED_LIB_%d", (int) getpid() );
6164 } // __kmp_reg_status_name
6165 
6166 
6167 void
6168 __kmp_register_library_startup(
6169     void
6170 ) {
6171 
6172     char * name   = __kmp_reg_status_name();  // Name of the environment variable.
6173     int    done   = 0;
6174     union {
6175         double dtime;
6176         long   ltime;
6177     } time;
6178     #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6179         __kmp_initialize_system_tick();
6180     #endif
6181     __kmp_read_system_time( & time.dtime );
6182     __kmp_registration_flag = 0xCAFE0000L | ( time.ltime & 0x0000FFFFL );
6183     __kmp_registration_str =
6184         __kmp_str_format(
6185             "%p-%lx-%s",
6186             & __kmp_registration_flag,
6187             __kmp_registration_flag,
6188             KMP_LIBRARY_FILE
6189         );
6190 
6191     KA_TRACE( 50, ( "__kmp_register_library_startup: %s=\"%s\"\n", name, __kmp_registration_str ) );
6192 
6193     while ( ! done ) {
6194 
6195         char * value  = NULL; // Actual value of the environment variable.
6196 
6197         // Set the environment variable, but do not overwrite it if it already exists.
6198         __kmp_env_set( name, __kmp_registration_str, 0 );
6199         // Check that the variable was actually written.
6200         value = __kmp_env_get( name );
6201         if ( value != NULL && strcmp( value, __kmp_registration_str ) == 0 ) {
6202 
6203             done = 1;    // Ok, environment variable set successfully, exit the loop.
6204 
6205         } else {
6206 
6207             // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6208             // Check whether it is alive or dead.
6209             int    neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6210             char * tail          = value;
6211             char * flag_addr_str = NULL;
6212             char * flag_val_str  = NULL;
6213             char const * file_name     = NULL;
6214             __kmp_str_split( tail, '-', & flag_addr_str, & tail );
6215             __kmp_str_split( tail, '-', & flag_val_str,  & tail );
6216             file_name = tail;
6217             if ( tail != NULL ) {
6218                 long * flag_addr = 0;
6219                 long   flag_val  = 0;
6220                 KMP_SSCANF( flag_addr_str, "%p",  & flag_addr );
6221                 KMP_SSCANF( flag_val_str,  "%lx", & flag_val  );
6222                 if ( flag_addr != 0 && flag_val != 0 && strcmp( file_name, "" ) != 0 ) {
6223                     // First, check whether the environment-encoded address is mapped into the address space.
6224                     // If so, dereference it to see if it still has the right value.
6225 
6226                     if ( __kmp_is_address_mapped( flag_addr ) && * flag_addr == flag_val ) {
6227                         neighbor = 1;
6228                     } else {
6229                         // If not, then we know the other copy of the library is no longer running.
6230                         neighbor = 2;
6231                     }; // if
6232                 }; // if
6233             }; // if
6234             switch ( neighbor ) {
6235                 case 0 :      // Cannot parse environment variable -- neighbor status unknown.
6236                     // Assume it is the incompatible format of a future version of the library.
6237                     // Assume the other library is alive.
6238                     // WARN( ... ); // TODO: Issue a warning.
6239                     file_name = "unknown library";
6240                     // Attention! Falling through to the next case is intentional.
6241                 case 1 : {    // Neighbor is alive.
6242                     // Check whether a duplicate library is allowed.
6243                     char * duplicate_ok = __kmp_env_get( "KMP_DUPLICATE_LIB_OK" );
6244                     if ( ! __kmp_str_match_true( duplicate_ok ) ) {
6245                         // That's not allowed. Issue fatal error.
6246                         __kmp_msg(
6247                             kmp_ms_fatal,
6248                             KMP_MSG( DuplicateLibrary, KMP_LIBRARY_FILE, file_name ),
6249                             KMP_HNT( DuplicateLibrary ),
6250                             __kmp_msg_null
6251                         );
6252                     }; // if
6253                     KMP_INTERNAL_FREE( duplicate_ok );
6254                     __kmp_duplicate_library_ok = 1;
6255                     done = 1;    // Exit the loop.
6256                 } break;
6257                 case 2 : {    // Neighbor is dead.
6258                     // Clear the variable and try to register library again.
6259                     __kmp_env_unset( name );
6260                 }  break;
6261                 default : {
6262                     KMP_DEBUG_ASSERT( 0 );
6263                 } break;
6264             }; // switch
6265 
6266         }; // if
6267         KMP_INTERNAL_FREE( (void *) value );
6268 
6269     }; // while
6270     KMP_INTERNAL_FREE( (void *) name );
6271 
6272 } // func __kmp_register_library_startup
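
// For illustration only (all concrete values below are made up): the handshake
// above leaves an environment variable of the form
//
//     __KMP_REGISTERED_LIB_12345 = "0x7f3a5c0e1d20-cafe8a31-libomp.so"
//
// i.e. "%p-%lx-%s" built from &__kmp_registration_flag, __kmp_registration_flag
// and KMP_LIBRARY_FILE.  On a collision, the stored address is checked with
// __kmp_is_address_mapped() and dereferenced; if it still holds the stored
// value, another copy of the runtime is alive (fatal error unless
// KMP_DUPLICATE_LIB_OK is set); otherwise the variable is considered stale,
// so it is cleared and registration is retried.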
6273 
6274 
6275 void
6276 __kmp_unregister_library( void ) {
6277 
6278     char * name  = __kmp_reg_status_name();
6279     char * value = __kmp_env_get( name );
6280 
6281     KMP_DEBUG_ASSERT( __kmp_registration_flag != 0 );
6282     KMP_DEBUG_ASSERT( __kmp_registration_str  != NULL );
6283     if ( value != NULL && strcmp( value, __kmp_registration_str ) == 0 ) {
6284         // Ok, this is our variable. Delete it.
6285         __kmp_env_unset( name );
6286     }; // if
6287 
6288     KMP_INTERNAL_FREE( __kmp_registration_str );
6289     KMP_INTERNAL_FREE( value );
6290     KMP_INTERNAL_FREE( name );
6291 
6292     __kmp_registration_flag = 0;
6293     __kmp_registration_str  = NULL;
6294 
6295 } // __kmp_unregister_library
6296 
6297 
6298 // End of Library registration stuff.
6299 // -------------------------------------------------------------------------------------------------
6300 
6301 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
6302 
6303 static void __kmp_check_mic_type()
6304 {
6305     kmp_cpuid_t cpuid_state = {0};
6306     kmp_cpuid_t * cs_p = &cpuid_state;
6307     __kmp_x86_cpuid(1, 0, cs_p);
6308     // We don't support mic1 at the moment
6309     if( (cs_p->eax & 0xff0) == 0xB10 ) {
6310         __kmp_mic_type = mic2;
6311     } else if( (cs_p->eax & 0xf0ff0) == 0x50670 ) {
6312         __kmp_mic_type = mic3;
6313     } else {
6314         __kmp_mic_type = non_mic;
6315     }
6316 }
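
// Informational note (not used by the code, stated here as the usual reading
// of CPUID): leaf 1 reports the processor signature in EAX with the standard
// x86 layout stepping[3:0], model[7:4], family[11:8], ext_model[19:16],
// ext_family[27:20], so the display model is conventionally assembled as
//
//     unsigned display_model = ((eax >> 12) & 0xf0) | ((eax >> 4) & 0x0f);
//
// Under that reading, the first mask above matches family 0x0B (KNC, i.e. mic2)
// and the second matches family 6 / display model 0x57 (assumed here to be
// KNL, i.e. mic3).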
6317 
6318 #endif /* KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS) */
6319 
6320 static void
6321 __kmp_do_serial_initialize( void )
6322 {
6323     int i, gtid;
6324     int size;
6325 
6326     KA_TRACE( 10, ("__kmp_do_serial_initialize: enter\n" ) );
6327 
6328     KMP_DEBUG_ASSERT( sizeof( kmp_int32 ) == 4 );
6329     KMP_DEBUG_ASSERT( sizeof( kmp_uint32 ) == 4 );
6330     KMP_DEBUG_ASSERT( sizeof( kmp_int64 ) == 8 );
6331     KMP_DEBUG_ASSERT( sizeof( kmp_uint64 ) == 8 );
6332     KMP_DEBUG_ASSERT( sizeof( kmp_intptr_t ) == sizeof( void * ) );
6333 
6334 #if OMPT_SUPPORT
6335     ompt_pre_init();
6336 #endif
6337 
6338     __kmp_validate_locks();
6339 
6340     /* Initialize internal memory allocator */
6341     __kmp_init_allocator();
6342 
6343     /* Register the library startup via an environment variable
6344        and check to see whether another copy of the library is already
6345        registered. */
6346 
6347     __kmp_register_library_startup( );
6348 
6349     /* TODO reinitialization of library */
6350     if( TCR_4(__kmp_global.g.g_done) ) {
6351        KA_TRACE( 10, ("__kmp_do_serial_initialize: reinitialization of library\n" ) );
6352     }
6353 
6354     __kmp_global.g.g_abort = 0;
6355     TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6356 
6357     /* initialize the locks */
6358 #if KMP_USE_ADAPTIVE_LOCKS
6359 #if KMP_DEBUG_ADAPTIVE_LOCKS
6360     __kmp_init_speculative_stats();
6361 #endif
6362 #endif
6363 #if KMP_STATS_ENABLED
6364     __kmp_stats_init();
6365 #endif
6366     __kmp_init_lock( & __kmp_global_lock     );
6367     __kmp_init_queuing_lock( & __kmp_dispatch_lock );
6368     __kmp_init_lock( & __kmp_debug_lock      );
6369     __kmp_init_atomic_lock( & __kmp_atomic_lock     );
6370     __kmp_init_atomic_lock( & __kmp_atomic_lock_1i  );
6371     __kmp_init_atomic_lock( & __kmp_atomic_lock_2i  );
6372     __kmp_init_atomic_lock( & __kmp_atomic_lock_4i  );
6373     __kmp_init_atomic_lock( & __kmp_atomic_lock_4r  );
6374     __kmp_init_atomic_lock( & __kmp_atomic_lock_8i  );
6375     __kmp_init_atomic_lock( & __kmp_atomic_lock_8r  );
6376     __kmp_init_atomic_lock( & __kmp_atomic_lock_8c  );
6377     __kmp_init_atomic_lock( & __kmp_atomic_lock_10r );
6378     __kmp_init_atomic_lock( & __kmp_atomic_lock_16r );
6379     __kmp_init_atomic_lock( & __kmp_atomic_lock_16c );
6380     __kmp_init_atomic_lock( & __kmp_atomic_lock_20c );
6381     __kmp_init_atomic_lock( & __kmp_atomic_lock_32c );
6382     __kmp_init_bootstrap_lock( & __kmp_forkjoin_lock  );
6383     __kmp_init_bootstrap_lock( & __kmp_exit_lock      );
6384 #if KMP_USE_MONITOR
6385     __kmp_init_bootstrap_lock( & __kmp_monitor_lock   );
6386 #endif
6387     __kmp_init_bootstrap_lock( & __kmp_tp_cached_lock );
6388 
6389     /* conduct initialization and initial setup of configuration */
6390 
6391     __kmp_runtime_initialize();
6392 
6393 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
6394     __kmp_check_mic_type();
6395 #endif
6396 
6397     // Some global variable initialization moved here from kmp_env_initialize()
6398 #ifdef KMP_DEBUG
6399     kmp_diag = 0;
6400 #endif
6401     __kmp_abort_delay = 0;
6402 
6403     // From __kmp_init_dflt_team_nth()
6404     /* assume the entire machine will be used */
6405     __kmp_dflt_team_nth_ub = __kmp_xproc;
6406     if( __kmp_dflt_team_nth_ub < KMP_MIN_NTH ) {
6407         __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6408     }
6409     if( __kmp_dflt_team_nth_ub > __kmp_sys_max_nth ) {
6410         __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6411     }
6412     __kmp_max_nth = __kmp_sys_max_nth;
6413 
6414     // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME" part
6415     __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6416 #if KMP_USE_MONITOR
6417     __kmp_monitor_wakeups = KMP_WAKEUPS_FROM_BLOCKTIME( __kmp_dflt_blocktime, __kmp_monitor_wakeups );
6418     __kmp_bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME( __kmp_dflt_blocktime, __kmp_monitor_wakeups );
6419 #endif
6420     // From "KMP_LIBRARY" part of __kmp_env_initialize()
6421     __kmp_library = library_throughput;
6422     // From KMP_SCHEDULE initialization
6423     __kmp_static = kmp_sch_static_balanced;
6424     // AC: do not use analytical here, because it is non-monotonic
6425     //__kmp_guided = kmp_sch_guided_iterative_chunked;
6426     //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no need to repeat the assignment
6427     // Barrier initialization. Moved here from the barrier branch bit control and barrier method
6428     // control parts of __kmp_env_initialize().
6429     #if KMP_FAST_REDUCTION_BARRIER
6430         #define kmp_reduction_barrier_gather_bb ((int)1)
6431         #define kmp_reduction_barrier_release_bb ((int)1)
6432         #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6433         #define kmp_reduction_barrier_release_pat bp_hyper_bar
6434     #endif // KMP_FAST_REDUCTION_BARRIER
6435     for ( i=bs_plain_barrier; i<bs_last_barrier; i++ ) {
6436         __kmp_barrier_gather_branch_bits [ i ] = __kmp_barrier_gather_bb_dflt;
6437         __kmp_barrier_release_branch_bits[ i ] = __kmp_barrier_release_bb_dflt;
6438         __kmp_barrier_gather_pattern [ i ] = __kmp_barrier_gather_pat_dflt;
6439         __kmp_barrier_release_pattern[ i ] = __kmp_barrier_release_pat_dflt;
6440         #if KMP_FAST_REDUCTION_BARRIER
6441         if( i == bs_reduction_barrier ) { // tested and confirmed on ALTIX only ( lin_64 ): hyper,1
6442             __kmp_barrier_gather_branch_bits [ i ] = kmp_reduction_barrier_gather_bb;
6443             __kmp_barrier_release_branch_bits[ i ] = kmp_reduction_barrier_release_bb;
6444             __kmp_barrier_gather_pattern [ i ] = kmp_reduction_barrier_gather_pat;
6445             __kmp_barrier_release_pattern[ i ] = kmp_reduction_barrier_release_pat;
6446         }
6447         #endif // KMP_FAST_REDUCTION_BARRIER
6448     }
6449     #if KMP_FAST_REDUCTION_BARRIER
6450         #undef kmp_reduction_barrier_release_pat
6451         #undef kmp_reduction_barrier_gather_pat
6452         #undef kmp_reduction_barrier_release_bb
6453         #undef kmp_reduction_barrier_gather_bb
6454     #endif // KMP_FAST_REDUCTION_BARRIER
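    // Note (an assumption stated for illustration, not asserted by this code):
    // a barrier branch-bits value b corresponds to a fan-out of 2^b in the
    // barrier tree, so e.g. the KNC tuning below of plain-barrier gather
    // branch bits = 3 means up to 2^3 = 8 children per node for that phase.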
6455 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
6456     if (__kmp_mic_type == mic2) { // KNC
6457         // AC: plain=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6458         __kmp_barrier_gather_branch_bits [ bs_plain_barrier ] = 3;  // plain gather
6459         __kmp_barrier_release_branch_bits[ bs_forkjoin_barrier ] = 1;  // forkjoin release
6460         __kmp_barrier_gather_pattern [ bs_forkjoin_barrier ] = bp_hierarchical_bar;
6461         __kmp_barrier_release_pattern[ bs_forkjoin_barrier ] = bp_hierarchical_bar;
6462     }
6463 #if KMP_FAST_REDUCTION_BARRIER
6464     if (__kmp_mic_type == mic2) { // KNC
6465         __kmp_barrier_gather_pattern [ bs_reduction_barrier ] = bp_hierarchical_bar;
6466         __kmp_barrier_release_pattern[ bs_reduction_barrier ] = bp_hierarchical_bar;
6467     }
6468 #endif
6469 #endif
6470 
6471     // From KMP_CHECKS initialization
6472 #ifdef KMP_DEBUG
6473     __kmp_env_checks = TRUE;   /* development versions have the extra checks */
6474 #else
6475     __kmp_env_checks = FALSE;  /* port versions do not have the extra checks */
6476 #endif
6477 
6478     // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6479     __kmp_foreign_tp = TRUE;
6480 
6481     __kmp_global.g.g_dynamic = FALSE;
6482     __kmp_global.g.g_dynamic_mode = dynamic_default;
6483 
6484     __kmp_env_initialize( NULL );
6485 
6486     // Print all messages in message catalog for testing purposes.
6487     #ifdef KMP_DEBUG
6488         char const * val = __kmp_env_get( "KMP_DUMP_CATALOG" );
6489         if ( __kmp_str_match_true( val ) ) {
6490             kmp_str_buf_t buffer;
6491             __kmp_str_buf_init( & buffer );
6492             __kmp_i18n_dump_catalog( & buffer );
6493             __kmp_printf( "%s", buffer.str );
6494             __kmp_str_buf_free( & buffer );
6495         }; // if
6496         __kmp_env_free( & val );
6497     #endif
6498 
6499     __kmp_threads_capacity = __kmp_initial_threads_capacity( __kmp_dflt_team_nth_ub );
6500     // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6501     __kmp_tp_capacity = __kmp_default_tp_capacity(__kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6502 
6503     // If the library is shut down properly, both pools must be NULL. Just in case, set them
6504     // to NULL -- some memory may leak, but subsequent code will work even if pools are not freed.
6505     KMP_DEBUG_ASSERT( __kmp_thread_pool == NULL );
6506     KMP_DEBUG_ASSERT( __kmp_thread_pool_insert_pt == NULL );
6507     KMP_DEBUG_ASSERT( __kmp_team_pool   == NULL );
6508     __kmp_thread_pool = NULL;
6509     __kmp_thread_pool_insert_pt = NULL;
6510     __kmp_team_pool   = NULL;
6511 
6512     /* Allocate all of the variable sized records */
6513     /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are expandable */
6514     /* Since allocation is cache-aligned, just add extra padding at the end */
6515     size = (sizeof(kmp_info_t*) + sizeof(kmp_root_t*))*__kmp_threads_capacity + CACHE_LINE;
6516     __kmp_threads = (kmp_info_t**) __kmp_allocate( size );
6517     __kmp_root    = (kmp_root_t**) ((char*)__kmp_threads + sizeof(kmp_info_t*) * __kmp_threads_capacity );
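    // Layout note: __kmp_threads and __kmp_root share this single cache-aligned block --
    // the first __kmp_threads_capacity pointer slots hold the kmp_info_t* entries and the
    // kmp_root_t* entries follow immediately after them. This is why __kmp_cleanup() frees
    // only __kmp_threads and never frees __kmp_root separately.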
6518 
6519     /* init thread counts */
6520     KMP_DEBUG_ASSERT( __kmp_all_nth == 0 ); // Asserts fail if the library is reinitializing and
6521     KMP_DEBUG_ASSERT( __kmp_nth == 0 );     // something was wrong in termination.
6522     __kmp_all_nth = 0;
6523     __kmp_nth     = 0;
6524 
6525     /* setup the uber master thread and hierarchy */
6526     gtid = __kmp_register_root( TRUE );
6527     KA_TRACE( 10, ("__kmp_do_serial_initialize  T#%d\n", gtid ));
6528     KMP_ASSERT( KMP_UBER_GTID( gtid ) );
6529     KMP_ASSERT( KMP_INITIAL_GTID( gtid ) );
6530 
6531     KMP_MB();       /* Flush all pending memory write invalidates.  */
6532 
6533     __kmp_common_initialize();
6534 
6535     #if KMP_OS_UNIX
6536         /* invoke the child fork handler */
6537         __kmp_register_atfork();
6538     #endif
6539 
6540     #if ! defined KMP_DYNAMIC_LIB
6541         {
6542             /* Invoke the exit handler when the program finishes, only for static library.
6543                For dynamic library, we already have _fini and DllMain.
6544              */
6545             int rc = atexit( __kmp_internal_end_atexit );
6546             if ( rc != 0 ) {
6547                 __kmp_msg( kmp_ms_fatal, KMP_MSG( FunctionError, "atexit()" ), KMP_ERR( rc ), __kmp_msg_null );
6548             }; // if
6549         }
6550     #endif
6551 
6552     #if KMP_HANDLE_SIGNALS
6553         #if KMP_OS_UNIX
            /* NOTE: make sure that this is called before the user installs
             *          their own signal handlers so that the user handlers
             *          are called first.  This way they can return false,
             *          not call our handler, avoid terminating the library,
             *          and continue execution where they left off. */
6559             __kmp_install_signals( FALSE );
6560         #endif /* KMP_OS_UNIX */
6561         #if KMP_OS_WINDOWS
6562             __kmp_install_signals( TRUE );
6563         #endif /* KMP_OS_WINDOWS */
6564     #endif
6565 
6566     /* we have finished the serial initialization */
6567     __kmp_init_counter ++;
6568 
6569     __kmp_init_serial = TRUE;
6570 
6571     if (__kmp_settings) {
6572         __kmp_env_print();
6573     }
6574 
6575 #if OMP_40_ENABLED
6576     if (__kmp_display_env || __kmp_display_env_verbose) {
6577         __kmp_env_print_2();
6578     }
6579 #endif // OMP_40_ENABLED
6580 
6581 #if OMPT_SUPPORT
6582     ompt_post_init();
6583 #endif
6584 
6585     KMP_MB();
6586 
6587     KA_TRACE( 10, ("__kmp_do_serial_initialize: exit\n" ) );
6588 }
6589 
6590 void
6591 __kmp_serial_initialize( void )
6592 {
6593     if ( __kmp_init_serial ) {
6594         return;
6595     }
6596     __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
6597     if ( __kmp_init_serial ) {
6598         __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6599         return;
6600     }
6601     __kmp_do_serial_initialize();
6602     __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6603 }
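// The wrapper above (and __kmp_middle_initialize / __kmp_parallel_initialize below) uses
// the same double-checked pattern around the bootstrap lock: an unlocked fast-path test of
// the init flag, then a re-test under __kmp_initz_lock before doing the real work.
// A minimal sketch of the pattern, for illustration only (not compiled here; "init_flag"
// and "do_init" are placeholder names):
//
//     if ( init_flag ) return;                          // fast path, no lock taken
//     __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
//     if ( ! init_flag ) do_init();                     // re-check under the lock
//     __kmp_release_bootstrap_lock( &__kmp_initz_lock );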
6604 
6605 static void
6606 __kmp_do_middle_initialize( void )
6607 {
6608     int i, j;
6609     int prev_dflt_team_nth;
6610 
6611     if( !__kmp_init_serial ) {
6612         __kmp_do_serial_initialize();
6613     }
6614 
    KA_TRACE( 10, ("__kmp_do_middle_initialize: enter\n" ) );
6616 
6617     //
6618     // Save the previous value for the __kmp_dflt_team_nth so that
6619     // we can avoid some reinitialization if it hasn't changed.
6620     //
6621     prev_dflt_team_nth = __kmp_dflt_team_nth;
6622 
6623 #if KMP_AFFINITY_SUPPORTED
6624     //
6625     // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6626     // number of cores on the machine.
6627     //
6628     __kmp_affinity_initialize();
6629 
6630     //
6631     // Run through the __kmp_threads array and set the affinity mask
6632     // for each root thread that is currently registered with the RTL.
6633     //
6634     for ( i = 0; i < __kmp_threads_capacity; i++ ) {
6635         if ( TCR_PTR( __kmp_threads[ i ] ) != NULL ) {
6636             __kmp_affinity_set_init_mask( i, TRUE );
6637         }
6638     }
6639 #endif /* KMP_AFFINITY_SUPPORTED */
6640 
6641     KMP_ASSERT( __kmp_xproc > 0 );
6642     if ( __kmp_avail_proc == 0 ) {
6643         __kmp_avail_proc = __kmp_xproc;
6644     }
6645 
6646     // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3), correct them now
6647     j = 0;
6648     while ( ( j < __kmp_nested_nth.used ) && ! __kmp_nested_nth.nth[ j ] ) {
6649         __kmp_nested_nth.nth[ j ] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub = __kmp_avail_proc;
6650         j++;
6651     }
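    // Example: OMP_NUM_THREADS=",,2,3" parses to nth = { 0, 0, 2, 3 }; the loop above fills
    // the leading empty slots with __kmp_avail_proc and, as a side effect, makes that value
    // the default team size (__kmp_dflt_team_nth / __kmp_dflt_team_nth_ub).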
6652 
6653     if ( __kmp_dflt_team_nth == 0 ) {
6654 #ifdef KMP_DFLT_NTH_CORES
6655         //
6656         // Default #threads = #cores
6657         //
6658         __kmp_dflt_team_nth = __kmp_ncores;
6659         KA_TRACE( 20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = __kmp_ncores (%d)\n",
6660           __kmp_dflt_team_nth ) );
6661 #else
6662         //
6663         // Default #threads = #available OS procs
6664         //
6665         __kmp_dflt_team_nth = __kmp_avail_proc;
6666         KA_TRACE( 20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = __kmp_avail_proc(%d)\n",
6667           __kmp_dflt_team_nth ) );
6668 #endif /* KMP_DFLT_NTH_CORES */
6669     }
6670 
6671     if ( __kmp_dflt_team_nth < KMP_MIN_NTH ) {
6672         __kmp_dflt_team_nth = KMP_MIN_NTH;
6673     }
6674     if( __kmp_dflt_team_nth > __kmp_sys_max_nth ) {
6675         __kmp_dflt_team_nth = __kmp_sys_max_nth;
6676     }
6677 
6678     //
6679     // There's no harm in continuing if the following check fails,
6680     // but it indicates an error in the previous logic.
6681     //
6682     KMP_DEBUG_ASSERT( __kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub );
6683 
6684     if ( __kmp_dflt_team_nth != prev_dflt_team_nth ) {
6685         //
6686         // Run through the __kmp_threads array and set the num threads icv
6687         // for each root thread that is currently registered with the RTL
6688         // (which has not already explicitly set its nthreads-var with a
6689         // call to omp_set_num_threads()).
6690         //
6691         for ( i = 0; i < __kmp_threads_capacity; i++ ) {
6692             kmp_info_t *thread = __kmp_threads[ i ];
6693             if ( thread == NULL ) continue;
6694             if ( thread->th.th_current_task->td_icvs.nproc != 0 ) continue;
6695 
6696             set__nproc( __kmp_threads[ i ], __kmp_dflt_team_nth );
6697         }
6698     }
6699     KA_TRACE( 20, ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
6700       __kmp_dflt_team_nth) );
6701 
6702 #ifdef KMP_ADJUST_BLOCKTIME
6703     /* Adjust blocktime to zero if necessary */
6704     /* now that __kmp_avail_proc is set      */
6705     if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
6706         KMP_DEBUG_ASSERT( __kmp_avail_proc > 0 );
6707         if ( __kmp_nth > __kmp_avail_proc ) {
6708             __kmp_zero_bt = TRUE;
6709         }
6710     }
6711 #endif /* KMP_ADJUST_BLOCKTIME */
6712 
6713     /* we have finished middle initialization */
6714     TCW_SYNC_4(__kmp_init_middle, TRUE);
6715 
6716     KA_TRACE( 10, ("__kmp_do_middle_initialize: exit\n" ) );
6717 }
6718 
6719 void
6720 __kmp_middle_initialize( void )
6721 {
6722     if ( __kmp_init_middle ) {
6723         return;
6724     }
6725     __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
6726     if ( __kmp_init_middle ) {
6727         __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6728         return;
6729     }
6730     __kmp_do_middle_initialize();
6731     __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6732 }
6733 
6734 void
6735 __kmp_parallel_initialize( void )
6736 {
6737     int gtid = __kmp_entry_gtid();      // this might be a new root
6738 
6739     /* synchronize parallel initialization (for sibling) */
6740     if( TCR_4(__kmp_init_parallel) ) return;
6741     __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
6742     if( TCR_4(__kmp_init_parallel) ) { __kmp_release_bootstrap_lock( &__kmp_initz_lock ); return; }
6743 
6744     /* TODO reinitialization after we have already shut down */
6745     if( TCR_4(__kmp_global.g.g_done) ) {
6746         KA_TRACE( 10, ("__kmp_parallel_initialize: attempt to init while shutting down\n" ) );
6747         __kmp_infinite_loop();
6748     }
6749 
    /* jc: The lock __kmp_initz_lock is already held, so calling __kmp_serial_initialize
           or __kmp_middle_initialize here would cause a deadlock.  So we call the
           __kmp_do_*_initialize routines directly.
    */
6753     if( !__kmp_init_middle ) {
6754         __kmp_do_middle_initialize();
6755     }
6756 
6757     /* begin initialization */
6758     KA_TRACE( 10, ("__kmp_parallel_initialize: enter\n" ) );
6759     KMP_ASSERT( KMP_UBER_GTID( gtid ) );
6760 
6761 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6762     //
6763     // Save the FP control regs.
6764     // Worker threads will set theirs to these values at thread startup.
6765     //
6766     __kmp_store_x87_fpu_control_word( &__kmp_init_x87_fpu_control_word );
6767     __kmp_store_mxcsr( &__kmp_init_mxcsr );
6768     __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
6769 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
6770 
6771 #if KMP_OS_UNIX
6772 # if KMP_HANDLE_SIGNALS
6773     /*  must be after __kmp_serial_initialize  */
6774     __kmp_install_signals( TRUE );
6775 # endif
6776 #endif
6777 
6778     __kmp_suspend_initialize();
6779 
6780 #if defined(USE_LOAD_BALANCE)
6781     if ( __kmp_global.g.g_dynamic_mode == dynamic_default ) {
6782         __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
6783     }
6784 #else
6785     if ( __kmp_global.g.g_dynamic_mode == dynamic_default ) {
6786         __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
6787     }
6788 #endif
6789 
6790     if ( __kmp_version ) {
6791         __kmp_print_version_2();
6792     }
6793 
6794     /* we have finished parallel initialization */
6795     TCW_SYNC_4(__kmp_init_parallel, TRUE);
6796 
6797     KMP_MB();
6798     KA_TRACE( 10, ("__kmp_parallel_initialize: exit\n" ) );
6799 
6800     __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6801 }
6802 
6803 
6804 /* ------------------------------------------------------------------------ */
6805 
6806 void
6807 __kmp_run_before_invoked_task( int gtid, int tid, kmp_info_t *this_thr,
6808   kmp_team_t *team )
6809 {
6810     kmp_disp_t *dispatch;
6811 
6812     KMP_MB();
6813 
6814     /* none of the threads have encountered any constructs, yet. */
6815     this_thr->th.th_local.this_construct = 0;
6816 #if KMP_CACHE_MANAGE
6817     KMP_CACHE_PREFETCH( &this_thr->th.th_bar[ bs_forkjoin_barrier ].bb.b_arrived );
6818 #endif /* KMP_CACHE_MANAGE */
6819     dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
6820     KMP_DEBUG_ASSERT( dispatch );
6821     KMP_DEBUG_ASSERT( team->t.t_dispatch );
6822     //KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[ this_thr->th.th_info.ds.ds_tid ] );
6823 
6824     dispatch->th_disp_index = 0;    /* reset the dispatch buffer counter */
6825 #if OMP_45_ENABLED
6826     dispatch->th_doacross_buf_idx = 0; /* reset the doacross dispatch buffer counter */
6827 #endif
6828     if( __kmp_env_consistency_check )
6829         __kmp_push_parallel( gtid, team->t.t_ident );
6830 
6831     KMP_MB();       /* Flush all pending memory write invalidates.  */
6832 }
6833 
6834 void
6835 __kmp_run_after_invoked_task( int gtid, int tid, kmp_info_t *this_thr,
6836   kmp_team_t *team )
6837 {
6838     if( __kmp_env_consistency_check )
6839         __kmp_pop_parallel( gtid, team->t.t_ident );
6840 
6841     __kmp_finish_implicit_task(this_thr);
6842 }
6843 
6844 int
6845 __kmp_invoke_task_func( int gtid )
6846 {
6847     int          rc;
6848     int          tid      = __kmp_tid_from_gtid( gtid );
6849     kmp_info_t  *this_thr = __kmp_threads[ gtid ];
6850     kmp_team_t  *team     = this_thr->th.th_team;
6851 
6852     __kmp_run_before_invoked_task( gtid, tid, this_thr, team );
6853 #if USE_ITT_BUILD
6854     if ( __itt_stack_caller_create_ptr ) {
6855         __kmp_itt_stack_callee_enter( (__itt_caller)team->t.t_stack_id ); // inform ittnotify about entering user's code
6856     }
6857 #endif /* USE_ITT_BUILD */
6858 #if INCLUDE_SSC_MARKS
6859     SSC_MARK_INVOKING();
6860 #endif
6861 
6862 #if OMPT_SUPPORT
6863     void *dummy;
6864     void **exit_runtime_p;
6865     ompt_task_id_t my_task_id;
6866     ompt_parallel_id_t my_parallel_id;
6867 
6868     if (ompt_enabled) {
6869         exit_runtime_p = &(team->t.t_implicit_task_taskdata[tid].
6870             ompt_task_info.frame.exit_runtime_frame);
6871     } else {
6872         exit_runtime_p = &dummy;
6873     }
6874 
6875 #if OMPT_TRACE
6876     my_task_id = team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id;
6877     my_parallel_id = team->t.ompt_team_info.parallel_id;
6878     if (ompt_enabled &&
6879         ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
6880         ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
6881             my_parallel_id, my_task_id);
6882     }
6883 #endif
6884 #endif
6885 
6886     {
6887         KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
6888         KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
6889         rc = __kmp_invoke_microtask( (microtask_t) TCR_SYNC_PTR(team->t.t_pkfn),
6890                                      gtid, tid, (int) team->t.t_argc, (void **) team->t.t_argv
6891 #if OMPT_SUPPORT
6892                                      , exit_runtime_p
6893 #endif
6894                                      );
6895 #if OMPT_SUPPORT
6896         *exit_runtime_p = NULL;
6897 #endif
6898     }
6899 
6900 #if USE_ITT_BUILD
6901     if ( __itt_stack_caller_create_ptr ) {
6902         __kmp_itt_stack_callee_leave( (__itt_caller)team->t.t_stack_id ); // inform ittnotify about leaving user's code
6903     }
6904 #endif /* USE_ITT_BUILD */
6905     __kmp_run_after_invoked_task( gtid, tid, this_thr, team );
6906 
6907     return rc;
6908 }
6909 
6910 #if OMP_40_ENABLED
6911 void
6912 __kmp_teams_master( int gtid )
6913 {
6914     // This routine is called by all master threads in teams construct
6915     kmp_info_t *thr = __kmp_threads[ gtid ];
6916     kmp_team_t *team = thr->th.th_team;
6917     ident_t     *loc =  team->t.t_ident;
6918     thr->th.th_set_nproc = thr->th.th_teams_size.nth;
6919     KMP_DEBUG_ASSERT( thr->th.th_teams_microtask );
6920     KMP_DEBUG_ASSERT( thr->th.th_set_nproc );
6921     KA_TRACE( 20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n",
6922                    gtid, __kmp_tid_from_gtid( gtid ), thr->th.th_teams_microtask ) );
    // Launch the league of teams now, but do not let workers execute
    // (they hang on the fork barrier until the next parallel region)
6925 #if INCLUDE_SSC_MARKS
6926     SSC_MARK_FORKING();
6927 #endif
6928     __kmp_fork_call( loc, gtid, fork_context_intel,
6929             team->t.t_argc,
6930 #if OMPT_SUPPORT
6931             (void *)thr->th.th_teams_microtask,      // "unwrapped" task
6932 #endif
6933             (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
6934             VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
6935             NULL );
6936 #if INCLUDE_SSC_MARKS
6937     SSC_MARK_JOINING();
6938 #endif
6939 
    // AC: last parameter "1" eliminates the join barrier, which won't work because
    // worker threads are in a fork barrier waiting for more parallel regions
6942     __kmp_join_call( loc, gtid
6943 #if OMPT_SUPPORT
6944         , fork_context_intel
6945 #endif
6946         , 1 );
6947 }
6948 
6949 int
6950 __kmp_invoke_teams_master( int gtid )
6951 {
6952     kmp_info_t  *this_thr = __kmp_threads[ gtid ];
6953     kmp_team_t  *team     = this_thr->th.th_team;
6954     #if KMP_DEBUG
    if ( !__kmp_threads[gtid]->th.th_team->t.t_serialized )
        KMP_DEBUG_ASSERT( (void*)__kmp_threads[gtid]->th.th_team->t.t_pkfn == (void*)__kmp_teams_master );
6957     #endif
6958     __kmp_run_before_invoked_task( gtid, 0, this_thr, team );
6959     __kmp_teams_master( gtid );
6960     __kmp_run_after_invoked_task( gtid, 0, this_thr, team );
6961     return 1;
6962 }
6963 #endif /* OMP_40_ENABLED */
6964 
/* this sets the requested number of threads for the next parallel region
 * encountered by this team */
/* since this should be enclosed in the forkjoin critical section, it
 * should avoid race conditions with asymmetrical nested parallelism */
6969 
6970 void
6971 __kmp_push_num_threads( ident_t *id, int gtid, int num_threads )
6972 {
6973     kmp_info_t *thr = __kmp_threads[gtid];
6974 
6975     if( num_threads > 0 )
6976         thr->th.th_set_nproc = num_threads;
6977 }
6978 
6979 #if OMP_40_ENABLED
6980 
6981 /* this sets the requested number of teams for the teams region and/or
6982  * the number of threads for the next parallel region encountered  */
6983 void
6984 __kmp_push_num_teams( ident_t *id, int gtid, int num_teams, int num_threads )
6985 {
6986     kmp_info_t *thr = __kmp_threads[gtid];
6987     KMP_DEBUG_ASSERT(num_teams >= 0);
6988     KMP_DEBUG_ASSERT(num_threads >= 0);
6989 
6990     if( num_teams == 0 )
6991         num_teams = 1;    // default number of teams is 1.
    if( num_teams > __kmp_max_nth ) { // were too many teams requested?
6993         if ( !__kmp_reserve_warn ) {
6994             __kmp_reserve_warn = 1;
6995             __kmp_msg(
6996                 kmp_ms_warning,
6997                 KMP_MSG( CantFormThrTeam, num_teams, __kmp_max_nth ),
6998                 KMP_HNT( Unset_ALL_THREADS ),
6999                 __kmp_msg_null
7000             );
7001         }
7002         num_teams = __kmp_max_nth;
7003     }
7004     // Set number of teams (number of threads in the outer "parallel" of the teams)
7005     thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7006 
7007     // Remember the number of threads for inner parallel regions
7008     if( num_threads == 0 ) {
7009         if( !TCR_4(__kmp_init_middle) )
7010             __kmp_middle_initialize();  // get __kmp_avail_proc calculated
7011         num_threads = __kmp_avail_proc / num_teams;
7012         if( num_teams * num_threads > __kmp_max_nth ) {
            // adjust num_threads w/o warning as it is not a user setting
7014             num_threads = __kmp_max_nth / num_teams;
7015         }
7016     } else {
7017         if( num_teams * num_threads > __kmp_max_nth ) {
7018             int new_threads = __kmp_max_nth / num_teams;
7019             if ( !__kmp_reserve_warn ) { // user asked for too many threads
7020                 __kmp_reserve_warn = 1;  // that conflicts with OMP_THREAD_LIMIT
7021                 __kmp_msg(
7022                     kmp_ms_warning,
7023                     KMP_MSG( CantFormThrTeam, num_threads, new_threads ),
7024                     KMP_HNT( Unset_ALL_THREADS ),
7025                     __kmp_msg_null
7026                 );
7027             }
7028             num_threads = new_threads;
7029         }
7030     }
7031     thr->th.th_teams_size.nth = num_threads;
7032 }
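// Worked example (hypothetical numbers): with __kmp_avail_proc = 16 and __kmp_max_nth = 64,
// a call with num_teams = 4 and num_threads = 0 ends up with 16 / 4 = 4 threads per team.
// Had the caller asked for num_teams = 4 and num_threads = 32, the product 128 would exceed
// __kmp_max_nth, so num_threads would be reduced to 64 / 4 = 16 and a one-time warning issued.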
7033 
7034 
7035 //
7036 // Set the proc_bind var to use in the following parallel region.
7037 //
7038 void
7039 __kmp_push_proc_bind( ident_t *id, int gtid, kmp_proc_bind_t proc_bind )
7040 {
7041     kmp_info_t *thr = __kmp_threads[gtid];
7042     thr->th.th_set_proc_bind = proc_bind;
7043 }
7044 
7045 #endif /* OMP_40_ENABLED */
7046 
7047 /* Launch the worker threads into the microtask. */
7048 
7049 void
7050 __kmp_internal_fork( ident_t *id, int gtid, kmp_team_t *team )
7051 {
7052     kmp_info_t *this_thr = __kmp_threads[gtid];
7053 
7054 #ifdef KMP_DEBUG
7055     int f;
7056 #endif /* KMP_DEBUG */
7057 
7058     KMP_DEBUG_ASSERT( team );
7059     KMP_DEBUG_ASSERT( this_thr->th.th_team  ==  team );
7060     KMP_ASSERT(       KMP_MASTER_GTID(gtid) );
7061     KMP_MB();       /* Flush all pending memory write invalidates.  */
7062 
7063     team->t.t_construct = 0;          /* no single directives seen yet */
7064     team->t.t_ordered.dt.t_value = 0; /* thread 0 enters the ordered section first */
7065 
7066     /* Reset the identifiers on the dispatch buffer */
7067     KMP_DEBUG_ASSERT( team->t.t_disp_buffer );
7068     if ( team->t.t_max_nproc > 1 ) {
7069         int i;
7070         for (i = 0; i <  __kmp_dispatch_num_buffers; ++i) {
7071             team->t.t_disp_buffer[ i ].buffer_index = i;
7072 #if OMP_45_ENABLED
7073             team->t.t_disp_buffer[i].doacross_buf_idx = i;
7074 #endif
7075         }
7076     } else {
7077         team->t.t_disp_buffer[ 0 ].buffer_index = 0;
7078 #if OMP_45_ENABLED
7079         team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7080 #endif
7081     }
7082 
7083     KMP_MB();       /* Flush all pending memory write invalidates.  */
7084     KMP_ASSERT( this_thr->th.th_team  ==  team );
7085 
7086 #ifdef KMP_DEBUG
7087     for( f=0 ; f<team->t.t_nproc ; f++ ) {
7088         KMP_DEBUG_ASSERT( team->t.t_threads[f] &&
7089                           team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc );
7090     }
7091 #endif /* KMP_DEBUG */
7092 
7093     /* release the worker threads so they may begin working */
7094     __kmp_fork_barrier( gtid, 0 );
7095 }
7096 
7097 
7098 void
7099 __kmp_internal_join( ident_t *id, int gtid, kmp_team_t *team )
7100 {
7101     kmp_info_t *this_thr = __kmp_threads[gtid];
7102 
7103     KMP_DEBUG_ASSERT( team );
7104     KMP_DEBUG_ASSERT( this_thr->th.th_team  ==  team );
7105     KMP_ASSERT(       KMP_MASTER_GTID(gtid) );
7106     KMP_MB();       /* Flush all pending memory write invalidates.  */
7107 
7108     /* Join barrier after fork */
7109 
7110 #ifdef KMP_DEBUG
7111     if (__kmp_threads[gtid] && __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc ) {
7112         __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n",gtid, gtid, __kmp_threads[gtid]);
7113         __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, team->t.t_nproc=%d\n",
7114                      gtid, __kmp_threads[gtid]->th.th_team_nproc, team, team->t.t_nproc);
7115         __kmp_print_structure();
7116     }
7117     KMP_DEBUG_ASSERT( __kmp_threads[gtid] &&
7118                      __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc );
7119 #endif /* KMP_DEBUG */
7120 
7121     __kmp_join_barrier( gtid );  /* wait for everyone */
7122 
7123     KMP_MB();       /* Flush all pending memory write invalidates.  */
7124     KMP_ASSERT( this_thr->th.th_team  ==  team );
7125 }
7126 
7127 
7128 /* ------------------------------------------------------------------------ */
7129 /* ------------------------------------------------------------------------ */
7130 
7131 #ifdef USE_LOAD_BALANCE
7132 
7133 //
// Return the number of worker threads actively spinning in the hot team, if we
// are at the outermost level of parallelism.  Otherwise, return 0.
7136 //
7137 static int
7138 __kmp_active_hot_team_nproc( kmp_root_t *root )
7139 {
7140     int i;
7141     int retval;
7142     kmp_team_t *hot_team;
7143 
7144     if ( root->r.r_active ) {
7145         return 0;
7146     }
7147     hot_team = root->r.r_hot_team;
7148     if ( __kmp_dflt_blocktime == KMP_MAX_BLOCKTIME ) {
7149         return hot_team->t.t_nproc - 1;  // Don't count master thread
7150     }
7151 
7152     //
7153     // Skip the master thread - it is accounted for elsewhere.
7154     //
7155     retval = 0;
7156     for ( i = 1; i < hot_team->t.t_nproc; i++ ) {
7157         if ( hot_team->t.t_threads[i]->th.th_active ) {
7158             retval++;
7159         }
7160     }
7161     return retval;
7162 }
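//
// Note: with an "infinite" blocktime (KMP_MAX_BLOCKTIME) every worker of the hot team is
// presumed to still be spinning, so the count is simply t_nproc - 1; otherwise only the
// workers whose th_active flag is set are counted.
//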
7163 
7164 //
7165 // Perform an automatic adjustment to the number of
7166 // threads used by the next parallel region.
7167 //
7168 static int
7169 __kmp_load_balance_nproc( kmp_root_t *root, int set_nproc )
7170 {
7171     int retval;
7172     int pool_active;
7173     int hot_team_active;
7174     int team_curr_active;
7175     int system_active;
7176 
7177     KB_TRACE( 20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n",
7178                 root, set_nproc ) );
7179     KMP_DEBUG_ASSERT( root );
7180     KMP_DEBUG_ASSERT( root->r.r_root_team->t.t_threads[0]->th.th_current_task->td_icvs.dynamic == TRUE );
7181     KMP_DEBUG_ASSERT( set_nproc > 1 );
7182 
7183     if ( set_nproc == 1) {
7184         KB_TRACE( 20, ("__kmp_load_balance_nproc: serial execution.\n" ) );
7185         return 1;
7186     }
7187 
7188     //
7189     // Threads that are active in the thread pool, active in the hot team
7190     // for this particular root (if we are at the outer par level), and
7191     // the currently executing thread (to become the master) are available
7192     // to add to the new team, but are currently contributing to the system
7193     // load, and must be accounted for.
7194     //
7195     pool_active = TCR_4(__kmp_thread_pool_active_nth);
7196     hot_team_active = __kmp_active_hot_team_nproc( root );
7197     team_curr_active = pool_active + hot_team_active + 1;
7198 
7199     //
7200     // Check the system load.
7201     //
7202     system_active = __kmp_get_load_balance( __kmp_avail_proc + team_curr_active );
7203     KB_TRACE( 30, ("__kmp_load_balance_nproc: system active = %d pool active = %d hot team active = %d\n",
7204       system_active, pool_active, hot_team_active ) );
7205 
7206     if ( system_active < 0 ) {
7207         //
7208         // There was an error reading the necessary info from /proc,
7209         // so use the thread limit algorithm instead.  Once we set
7210         // __kmp_global.g.g_dynamic_mode = dynamic_thread_limit,
7211         // we shouldn't wind up getting back here.
7212         //
7213         __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7214         KMP_WARNING( CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit" );
7215 
7216         //
7217         // Make this call behave like the thread limit algorithm.
7218         //
7219         retval = __kmp_avail_proc - __kmp_nth + (root->r.r_active ? 1
7220           : root->r.r_hot_team->t.t_nproc);
7221         if ( retval > set_nproc ) {
7222             retval = set_nproc;
7223         }
7224         if ( retval < KMP_MIN_NTH ) {
7225             retval = KMP_MIN_NTH;
7226         }
7227 
7228         KB_TRACE( 20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n", retval ) );
7229         return retval;
7230     }
7231 
7232     //
7233     // There is a slight delay in the load balance algorithm in detecting
7234     // new running procs.  The real system load at this instant should be
    // at least as large as the number of active OMP threads that are available to
7236     // add to the team.
7237     //
7238     if ( system_active < team_curr_active ) {
7239         system_active = team_curr_active;
7240     }
7241     retval = __kmp_avail_proc - system_active + team_curr_active;
7242     if ( retval > set_nproc ) {
7243         retval = set_nproc;
7244     }
7245     if ( retval < KMP_MIN_NTH ) {
7246         retval = KMP_MIN_NTH;
7247     }
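    // Worked example (hypothetical numbers): with __kmp_avail_proc = 8, system_active = 7
    // and team_curr_active = 3, the formula above yields retval = 8 - 7 + 3 = 4, which is
    // then clamped to the range [KMP_MIN_NTH, set_nproc].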
7248 
7249     KB_TRACE( 20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval ) );
7250     return retval;
7251 } // __kmp_load_balance_nproc()
7252 
7253 #endif /* USE_LOAD_BALANCE */
7254 
7255 /* ------------------------------------------------------------------------ */
7256 /* ------------------------------------------------------------------------ */
7257 
7258 /* NOTE: this is called with the __kmp_init_lock held */
7259 void
7260 __kmp_cleanup( void )
7261 {
7262     int f;
7263 
7264     KA_TRACE( 10, ("__kmp_cleanup: enter\n" ) );
7265 
7266     if (TCR_4(__kmp_init_parallel)) {
7267 #if KMP_HANDLE_SIGNALS
7268         __kmp_remove_signals();
7269 #endif
7270         TCW_4(__kmp_init_parallel, FALSE);
7271     }
7272 
7273     if (TCR_4(__kmp_init_middle)) {
7274 #if KMP_AFFINITY_SUPPORTED
7275         __kmp_affinity_uninitialize();
7276 #endif /* KMP_AFFINITY_SUPPORTED */
7277         __kmp_cleanup_hierarchy();
7278         TCW_4(__kmp_init_middle, FALSE);
7279     }
7280 
7281     KA_TRACE( 10, ("__kmp_cleanup: go serial cleanup\n" ) );
7282 
7283     if (__kmp_init_serial) {
7284         __kmp_runtime_destroy();
7285         __kmp_init_serial = FALSE;
7286     }
7287 
7288     for ( f = 0; f < __kmp_threads_capacity; f++ ) {
7289         if ( __kmp_root[ f ] != NULL ) {
7290             __kmp_free( __kmp_root[ f ] );
7291             __kmp_root[ f ] = NULL;
7292         }
7293     }
7294     __kmp_free( __kmp_threads );
    // __kmp_threads and __kmp_root were allocated at once, as a single block, so there is no
    // need to free __kmp_root separately.
7297     __kmp_threads = NULL;
7298     __kmp_root    = NULL;
7299     __kmp_threads_capacity = 0;
7300 
7301 #if KMP_USE_DYNAMIC_LOCK
7302     __kmp_cleanup_indirect_user_locks();
7303 #else
7304     __kmp_cleanup_user_locks();
7305 #endif
7306 
7307     #if KMP_AFFINITY_SUPPORTED
7308         KMP_INTERNAL_FREE( (void *) __kmp_cpuinfo_file );
7309         __kmp_cpuinfo_file = NULL;
7310     #endif /* KMP_AFFINITY_SUPPORTED */
7311 
    #if KMP_USE_ADAPTIVE_LOCKS
    #if KMP_DEBUG_ADAPTIVE_LOCKS
        __kmp_print_speculative_stats();
    #endif
    #endif
7317     KMP_INTERNAL_FREE( __kmp_nested_nth.nth );
7318     __kmp_nested_nth.nth = NULL;
7319     __kmp_nested_nth.size = 0;
7320     __kmp_nested_nth.used = 0;
7321     KMP_INTERNAL_FREE( __kmp_nested_proc_bind.bind_types );
7322     __kmp_nested_proc_bind.bind_types = NULL;
7323     __kmp_nested_proc_bind.size = 0;
7324     __kmp_nested_proc_bind.used = 0;
7325 
7326     __kmp_i18n_catclose();
7327 
7328 #if KMP_STATS_ENABLED
7329     __kmp_stats_fini();
7330 #endif
7331 
7332     KA_TRACE( 10, ("__kmp_cleanup: exit\n" ) );
7333 }
7334 
7335 /* ------------------------------------------------------------------------ */
7336 /* ------------------------------------------------------------------------ */
7337 
7338 int
7339 __kmp_ignore_mppbeg( void )
7340 {
7341     char *env;
7342 
7343     if ((env = getenv( "KMP_IGNORE_MPPBEG" )) != NULL) {
7344         if (__kmp_str_match_false( env ))
7345             return FALSE;
7346     }
7347     // By default __kmpc_begin() is no-op.
7348     return TRUE;
7349 }
7350 
7351 int
7352 __kmp_ignore_mppend( void )
7353 {
7354     char *env;
7355 
7356     if ((env = getenv( "KMP_IGNORE_MPPEND" )) != NULL) {
7357         if (__kmp_str_match_false( env ))
7358             return FALSE;
7359     }
7360     // By default __kmpc_end() is no-op.
7361     return TRUE;
7362 }
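// In other words, __kmpc_begin() and __kmpc_end() remain no-ops unless the corresponding
// environment variable (KMP_IGNORE_MPPBEG / KMP_IGNORE_MPPEND) is explicitly set to a
// "false" value, in which case these helpers return FALSE and the calls are honored.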
7363 
7364 void
7365 __kmp_internal_begin( void )
7366 {
7367     int gtid;
7368     kmp_root_t *root;
7369 
7370     /* this is a very important step as it will register new sibling threads
7371      * and assign these new uber threads a new gtid */
7372     gtid = __kmp_entry_gtid();
7373     root = __kmp_threads[ gtid ]->th.th_root;
7374     KMP_ASSERT( KMP_UBER_GTID( gtid ));
7375 
7376     if( root->r.r_begin ) return;
7377     __kmp_acquire_lock( &root->r.r_begin_lock, gtid );
7378     if( root->r.r_begin ) {
7379         __kmp_release_lock( & root->r.r_begin_lock, gtid );
7380         return;
7381     }
7382 
7383     root->r.r_begin = TRUE;
7384 
7385     __kmp_release_lock( & root->r.r_begin_lock, gtid );
7386 }
7387 
7388 
7389 /* ------------------------------------------------------------------------ */
7390 /* ------------------------------------------------------------------------ */
7391 
7392 void
7393 __kmp_user_set_library (enum library_type arg)
7394 {
7395     int gtid;
7396     kmp_root_t *root;
7397     kmp_info_t *thread;
7398 
7399     /* first, make sure we are initialized so we can get our gtid */
7400 
7401     gtid = __kmp_entry_gtid();
7402     thread = __kmp_threads[ gtid ];
7403 
7404     root = thread->th.th_root;
7405 
7406     KA_TRACE( 20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg, library_serial ));
7407     if (root->r.r_in_parallel) { /* Must be called in serial section of top-level thread */
7408         KMP_WARNING( SetLibraryIncorrectCall );
7409         return;
7410     }
7411 
7412     switch ( arg ) {
7413     case library_serial :
7414         thread->th.th_set_nproc = 0;
7415         set__nproc( thread, 1 );
7416         break;
7417     case library_turnaround :
7418         thread->th.th_set_nproc = 0;
7419         set__nproc( thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth : __kmp_dflt_team_nth_ub );
7420         break;
7421     case library_throughput :
7422         thread->th.th_set_nproc = 0;
7423         set__nproc( thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth : __kmp_dflt_team_nth_ub );
7424         break;
7425     default:
7426         KMP_FATAL( UnknownLibraryType, arg );
7427     }
7428 
7429     __kmp_aux_set_library ( arg );
7430 }
7431 
7432 void
7433 __kmp_aux_set_stacksize( size_t arg )
7434 {
7435     if (! __kmp_init_serial)
7436         __kmp_serial_initialize();
7437 
7438 #if KMP_OS_DARWIN
7439     if (arg & (0x1000 - 1)) {
7440         arg &= ~(0x1000 - 1);
7441         if(arg + 0x1000) /* check for overflow if we round up */
7442             arg += 0x1000;
7443     }
7444 #endif
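    // On Darwin the requested size is rounded up to a 0x1000-byte (4 KiB) boundary unless
    // the addition would overflow; e.g. a request of 5000 bytes becomes 8192 bytes.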
7445     __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
7446 
7447     /* only change the default stacksize before the first parallel region */
7448     if (! TCR_4(__kmp_init_parallel)) {
7449         size_t value = arg;       /* argument is in bytes */
7450 
7451         if (value < __kmp_sys_min_stksize )
7452             value = __kmp_sys_min_stksize ;
7453         else if (value > KMP_MAX_STKSIZE)
7454             value = KMP_MAX_STKSIZE;
7455 
7456         __kmp_stksize = value;
7457 
7458         __kmp_env_stksize = TRUE;    /* was KMP_STACKSIZE specified? */
7459     }
7460 
7461     __kmp_release_bootstrap_lock( &__kmp_initz_lock );
7462 }
7463 
7464 /* set the behaviour of the runtime library */
7465 /* TODO this can cause some odd behaviour with sibling parallelism... */
7466 void
7467 __kmp_aux_set_library (enum library_type arg)
7468 {
7469     __kmp_library = arg;
7470 
7471     switch ( __kmp_library ) {
7472     case library_serial :
7473         {
7474             KMP_INFORM( LibraryIsSerial );
7475             (void) __kmp_change_library( TRUE );
7476         }
7477         break;
7478     case library_turnaround :
7479         (void) __kmp_change_library( TRUE );
7480         break;
7481     case library_throughput :
7482         (void) __kmp_change_library( FALSE );
7483         break;
7484     default:
7485         KMP_FATAL( UnknownLibraryType, arg );
7486     }
7487 }
7488 
7489 /* ------------------------------------------------------------------------ */
7490 /* ------------------------------------------------------------------------ */
7491 
7492 void
7493 __kmp_aux_set_blocktime (int arg, kmp_info_t *thread, int tid)
7494 {
7495     int blocktime = arg;        /* argument is in milliseconds */
7496 #if KMP_USE_MONITOR
7497     int bt_intervals;
7498 #endif
7499     int bt_set;
7500 
7501     __kmp_save_internal_controls( thread );
7502 
7503     /* Normalize and set blocktime for the teams */
7504     if (blocktime < KMP_MIN_BLOCKTIME)
7505         blocktime = KMP_MIN_BLOCKTIME;
7506     else if (blocktime > KMP_MAX_BLOCKTIME)
7507         blocktime = KMP_MAX_BLOCKTIME;
7508 
7509     set__blocktime_team( thread->th.th_team, tid, blocktime );
7510     set__blocktime_team( thread->th.th_serial_team, 0, blocktime );
7511 
7512 #if KMP_USE_MONITOR
7513     /* Calculate and set blocktime intervals for the teams */
7514     bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
7515 
7516     set__bt_intervals_team( thread->th.th_team, tid, bt_intervals );
7517     set__bt_intervals_team( thread->th.th_serial_team, 0, bt_intervals );
7518 #endif
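    // When the monitor thread is in use, the blocktime is also stored as a count of monitor
    // wakeup intervals (bt_intervals), so that waiting threads can measure the blocktime in
    // monitor ticks rather than in milliseconds.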
7519 
    /* Record that blocktime was explicitly set for these teams */
7521     bt_set = TRUE;
7522 
7523     set__bt_set_team( thread->th.th_team, tid, bt_set );
7524     set__bt_set_team( thread->th.th_serial_team, 0, bt_set );
7525 #if KMP_USE_MONITOR
7526     KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
7527                   "bt_intervals=%d, monitor_updates=%d\n",
7528                   __kmp_gtid_from_tid(tid, thread->th.th_team),
7529                   thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
7530                   __kmp_monitor_wakeups));
7531 #else
7532     KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
7533                   __kmp_gtid_from_tid(tid, thread->th.th_team),
7534                   thread->th.th_team->t.t_id, tid, blocktime));
7535 #endif
7536 }
7537 
7538 void
7539 __kmp_aux_set_defaults(
7540     char const * str,
7541     int          len
7542 ) {
7543     if ( ! __kmp_init_serial ) {
7544         __kmp_serial_initialize();
7545     };
7546     __kmp_env_initialize( str );
7547 
7548     if (__kmp_settings
7549 #if OMP_40_ENABLED
7550         || __kmp_display_env || __kmp_display_env_verbose
7551 #endif // OMP_40_ENABLED
7552         ) {
7553         __kmp_env_print();
7554     }
7555 } // __kmp_aux_set_defaults
7556 
7557 /* ------------------------------------------------------------------------ */
7558 
7559 /*
7560  * internal fast reduction routines
7561  */
7562 
7563 PACKED_REDUCTION_METHOD_T
7564 __kmp_determine_reduction_method( ident_t *loc, kmp_int32 global_tid,
7565         kmp_int32 num_vars, size_t reduce_size, void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
7566         kmp_critical_name *lck )
7567 {
7568 
7569     // Default reduction method: critical construct ( lck != NULL, like in current PAROPT )
7570     // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method can be selected by RTL
7571     // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method can be selected by RTL
    // Finally, it's up to the OpenMP RTL to decide which method to select among those generated by PAROPT.
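    // In outline: a serialized team (team_size == 1) needs no synchronization (empty_reduce_block);
    // otherwise the architecture/OS-specific tuning below picks between the atomic and tree methods
    // (on 64-bit targets the tree method is preferred once the team size exceeds a small cutoff),
    // falling back to the critical-section method when neither fast method was generated; finally,
    // KMP_FORCE_REDUCTION may override the choice.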
7573 
7574     PACKED_REDUCTION_METHOD_T retval;
7575 
7576     int team_size;
7577 
7578     KMP_DEBUG_ASSERT( loc );    // it would be nice to test ( loc != 0 )
7579     KMP_DEBUG_ASSERT( lck );    // it would be nice to test ( lck != 0 )
7580 
7581     #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED ( ( loc->flags & ( KMP_IDENT_ATOMIC_REDUCE ) ) == ( KMP_IDENT_ATOMIC_REDUCE ) )
7582     #define FAST_REDUCTION_TREE_METHOD_GENERATED   ( ( reduce_data ) && ( reduce_func ) )
7583 
7584     retval = critical_reduce_block;
7585 
    team_size = __kmp_get_team_num_threads( global_tid ); // another way of getting the team size ( with one dynamic dereference ) is slower
7587 
7588     if( team_size == 1 ) {
7589 
7590         retval = empty_reduce_block;
7591 
7592     } else {
7593 
7594         int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
7595         int tree_available   = FAST_REDUCTION_TREE_METHOD_GENERATED;
7596 
7597         #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64
7598 
7599             #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN
7600 
                int teamsize_cutoff = 4;

                #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
                if( __kmp_mic_type != non_mic ) {
                    teamsize_cutoff = 8;
                }
                #endif
7608                 if( tree_available ) {
7609                     if( team_size <= teamsize_cutoff ) {
7610                         if ( atomic_available ) {
7611                             retval = atomic_reduce_block;
7612                         }
7613                     } else {
7614                         retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
7615                     }
7616                 } else if ( atomic_available ) {
7617                     retval = atomic_reduce_block;
7618                 }
7619             #else
7620                 #error "Unknown or unsupported OS"
7621             #endif // KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN
7622 
7623         #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
7624 
7625             #if KMP_OS_LINUX || KMP_OS_WINDOWS
7626 
7627                 // basic tuning
7628 
7629                 if( atomic_available ) {
7630                     if( num_vars <= 2 ) { // && ( team_size <= 8 ) due to false-sharing ???
7631                         retval = atomic_reduce_block;
7632                     }
7633                 } // otherwise: use critical section
7634 
7635             #elif KMP_OS_DARWIN
7636 
7637                 if( atomic_available && ( num_vars <= 3 ) ) {
7638                         retval = atomic_reduce_block;
7639                 } else if( tree_available ) {
7640                     if( ( reduce_size > ( 9 * sizeof( kmp_real64 ) ) ) && ( reduce_size < ( 2000 * sizeof( kmp_real64 ) ) ) ) {
7641                         retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
7642                     }
7643                 } // otherwise: use critical section
7644 
7645             #else
7646                 #error "Unknown or unsupported OS"
7647             #endif
7648 
7649         #else
7650             #error "Unknown or unsupported architecture"
7651         #endif
7652 
7653     }
7654 
7655     // KMP_FORCE_REDUCTION
7656 
7657     // If the team is serialized (team_size == 1), ignore the forced reduction
7658     // method and stay with the unsynchronized method (empty_reduce_block)
7659     if( __kmp_force_reduction_method != reduction_method_not_defined && team_size != 1) {
7660 
7661         PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
7662 
7663         int atomic_available, tree_available;
7664 
7665         switch( ( forced_retval = __kmp_force_reduction_method ) )
7666         {
7667         case critical_reduce_block:
7668                 KMP_ASSERT( lck );              // lck should be != 0
7669                 break;
7670 
7671             case atomic_reduce_block:
7672                 atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
7673                 if( ! atomic_available ) {
7674                     KMP_WARNING(RedMethodNotSupported, "atomic");
7675                     forced_retval = critical_reduce_block;
7676                 }
7677                 break;
7678 
7679             case tree_reduce_block:
7680                 tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
7681                 if( ! tree_available ) {
7682                     KMP_WARNING(RedMethodNotSupported, "tree");
7683                     forced_retval = critical_reduce_block;
7684                 } else {
7685                     #if KMP_FAST_REDUCTION_BARRIER
7686                     forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
7687                     #endif
7688                 }
7689                 break;
7690 
7691             default:
7692                 KMP_ASSERT( 0 ); // "unsupported method specified"
7693         }
7694 
7695         retval = forced_retval;
7696     }
7697 
7698     KA_TRACE(10, ( "reduction method selected=%08x\n", retval ) );
7699 
7700     #undef FAST_REDUCTION_TREE_METHOD_GENERATED
7701     #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
7702 
7703     return ( retval );
7704 }
7705 
7706 // this function is for testing set/get/determine reduce method
7707 kmp_int32
7708 __kmp_get_reduce_method( void ) {
7709     return ( ( __kmp_entry_thread()->th.th_local.packed_reduction_method ) >> 8 );
7710 }
7711 
7712 /* ------------------------------------------------------------------------ */
7713